Python matcher.Matcher Class Code Examples


This article collects typical usage examples of the spacy.matcher.Matcher class in Python. If you have been wondering what the Matcher class does, how to use it, or what it looks like in real code, the curated examples below should help.



The following 20 code examples of the Matcher class are shown below, sorted by popularity by default.
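Before the examples, here is a minimal, self-contained sketch of the typical Matcher workflow (a quick-start illustration, not taken from any of the projects below). Note that all examples in this article use the spaCy v2 call signature matcher.add(key, on_match, *patterns); in spaCy v3 this changed to matcher.add(key, patterns, on_match=callback).

import spacy
from spacy.matcher import Matcher

# A blank English pipeline is enough for surface-level (ORTH/LOWER) patterns;
# no statistical model needs to be installed.
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# One pattern = one list of per-token attribute dicts.
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "world"}]
matcher.add("HELLO_WORLD", None, pattern)  # spaCy v2 signature

doc = nlp("Hello, world! Hello world!")
for match_id, start, end in matcher(doc):
    # match_id is the hash of the pattern key; resolve it via the StringStore.
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)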

Example 1: RussianTokenizer

class RussianTokenizer(object):
    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        self.matcher = Matcher(nlp.vocab)
        self.token_merge = nlp.vocab.strings['pattern']
        self.sentence_terminal = nlp.vocab.strings['sentence_terminal']
        if merge_patterns:
            self.matcher.add(self.token_merge, None, *merge_patterns)
        if terminal_patterns:
            self.matcher.add(self.sentence_terminal, None, *terminal_patterns)

    def __call__(self, doc):
        spans = []
        # 'match_id' avoids shadowing the built-in id(); it is the hash of the
        # pattern key registered via matcher.add().
        for match_id, start, end in self.matcher(doc):
            if match_id == self.token_merge:
                spans.append(doc[start:end])
            elif match_id == self.sentence_terminal:
                # remove all sentence start marks from span that match pattern
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        if spans:
            for span in spans:
                # Span.merge() is deprecated since spaCy v2.1; newer code would
                # use doc.retokenize(), as in the later examples.
                span.merge()
        return doc
Author: aatimofeev | Project: spacy_russian_tokenizer | Lines: 26 | Source: __init__.py
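For context, a rough sketch of how a component like this would be plugged into a spaCy v2 pipeline. The merge pattern below is a made-up illustration (merging the four tokens of the abbreviation "т. е."), not one shipped with spacy_russian_tokenizer, and spaCy v2's Russian language class may additionally require pymorphy2 to be installed.

from spacy.lang.ru import Russian

# Hypothetical merge pattern; in practice, use the MERGE_PATTERNS that ship
# with the spacy_russian_tokenizer project.
merge_patterns = [
    [{"ORTH": "т"}, {"ORTH": "."}, {"ORTH": "е"}, {"ORTH": "."}],
]

nlp = Russian()
nlp.add_pipe(RussianTokenizer(nlp, merge_patterns=merge_patterns),
             name="russian_tokenizer")

doc = nlp("Это сложно, т. е. не просто.")
print([token.text for token in doc])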


Example 2: write_conllu

def write_conllu(docs, file_):
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    for word in doc[sent[0].i - 10 : sent[0].i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in doc[sent[-1].i : sent[-1].i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
Author: spacy-io | Project: spaCy | Lines: 26 | Source: ud_train.py


Example 3: test_operator_combos

def test_operator_combos(en_vocab):
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, result in cases:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = []
        for part in pattern_str.split():
            if part.endswith("+"):
                pattern.append({"ORTH": part[0], "OP": "+"})
            else:
                pattern.append({"ORTH": part})
        matcher.add("PATTERN", None, pattern)
        matches = matcher(doc)
        if result:
            assert matches, (string, pattern_str)
        else:
            assert not matches, (string, pattern_str)
Author: spacy-io | Project: spaCy | Lines: 34 | Source: test_matcher_logic.py


Example 4: test_issue615

def test_issue615(en_tokenizer):
    def merge_phrases(matcher, doc, i, matches):
        """Merge a phrase. We have to be careful here because we'll change the
        token indices. To avoid problems, merge all the phrases once we're called
        on the last match."""
        if i != len(matches) - 1:
            return None
        spans = [Span(doc, start, end, label=label) for label, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                tag = "NNP" if span.label_ else span.root.tag_
                attrs = {"tag": tag, "lemma": span.text}
                retokenizer.merge(span, attrs=attrs)
                doc.ents = doc.ents + (span,)

    text = "The golf club is broken"
    pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
    label = "Sport_Equipment"
    doc = en_tokenizer(text)
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, pattern)
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
Author: spacy-io | Project: spaCy | Lines: 25 | Source: test_issue1-1000.py


Example 5: test_matcher_match_zero_plus

def test_matcher_match_zero_plus(matcher):
    words = 'He said , " some words " ...'.split()
    pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
    matcher = Matcher(matcher.vocab)
    matcher.add("Quote", None, pattern)
    doc = Doc(matcher.vocab, words=words)
    assert len(matcher(doc)) == 1
Author: spacy-io | Project: spaCy | Lines: 7 | Source: test_matcher_api.py


Example 6: write_conllu

def write_conllu(docs, file_):
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        for word in doc:
            if word.i == word.head.i:
                word.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
            file_.write("\n")
            for word in sent:
                if word.head.i == word.i and word.dep_ == "ROOT":
                    break
            else:
                print("Rootless sentence!")
                print(sent)
                print(i)
                for w in sent:
                    print(w.i, w.text, w.head.text, w.head.i, w.dep_)
                raise ValueError
Author: spacy-io | Project: spaCy | Lines: 30 | Source: ud_run_test.py


Example 7: test_issue3555

def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)
Author: spacy-io | Project: spaCy | Lines: 8 | Source: test_issue3555.py
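Patterns can target any custom extension through the reserved "_" key, including getter-based extensions. A minimal sketch with a hypothetical extension (the name is_version and its getter are made up for illustration):

from spacy.matcher import Matcher
from spacy.tokens import Doc, Token
from spacy.vocab import Vocab

# Hypothetical extension: flag tokens that look like x.y.z version strings.
Token.set_extension("is_version", getter=lambda t: t.text.count(".") == 2, force=True)

vocab = Vocab()
matcher = Matcher(vocab)
# The "_" key routes the attribute lookup to token._.is_version.
matcher.add("VERSION", None, [{"LOWER": "version"}, {"_": {"is_version": True}}])

doc = Doc(vocab, words=["version", "1.2.3", "released"])
assert len(matcher(doc)) == 1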


Example 8: test_issue1883

def test_issue1883():
    matcher = Matcher(Vocab())
    matcher.add("pat1", None, [{"orth": "hello"}])
    doc = Doc(matcher.vocab, words=["hello"])
    assert len(matcher(doc)) == 1
    new_matcher = copy.deepcopy(matcher)
    new_doc = Doc(new_matcher.vocab, words=["hello"])
    assert len(new_matcher(new_doc)) == 1
Author: spacy-io | Project: spaCy | Lines: 8 | Source: test_issue1501-2000.py


Example 9: test_matcher_operator_shadow

def test_matcher_operator_shadow(en_vocab):
    matcher = Matcher(en_vocab)
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
    matcher.add("A.C", None, pattern)
    matches = matcher(doc)
    assert len(matches) == 1
    assert matches[0][1:] == (0, 3)
Author: spacy-io | Project: spaCy | Lines: 8 | Source: test_matcher_api.py


Example 10: test_match_consuming

def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    assert len(matches) == len(re_matches)
Author: spacy-io | Project: spaCy | Lines: 8 | Source: test_matcher_logic.py


Example 11: test_issue_1971_2

def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2
Author: spacy-io | Project: spaCy | Lines: 8 | Source: test_issue1501-2000.py


Example 12: test_greedy_matching

def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistant with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    re_matches = [m.span() for m in re.finditer(re_pattern, text)]
    for match, re_match in zip(matches, re_matches):
        assert match[1:] == re_match
Author: spacy-io | Project: spaCy | Lines: 9 | Source: test_matcher_logic.py


Example 13: test_issue1945

def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    matches = matcher(doc)  # we should see two overlapping matches here
    assert len(matches) == 2
    assert matches[0][1:] == (0, 2)
    assert matches[1][1:] == (1, 3)
Author: spacy-io | Project: spaCy | Lines: 9 | Source: test_issue1501-2000.py
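Example 13 shows that the Matcher reports overlapping matches rather than consuming tokens greedily. When only non-overlapping spans are wanted, spaCy ships a helper, spacy.util.filter_spans (available since v2.1.4), that keeps the longest spans and prefers earlier ones on ties. A short sketch reusing the same setup:

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.util import filter_spans
from spacy.vocab import Vocab

vocab = Vocab()
matcher = Matcher(vocab)
matcher.add("MWE", None, [{"ORTH": "a"}, {"ORTH": "a"}])

doc = Doc(vocab, words=["a", "a", "a"])
spans = [doc[start:end] for _, start, end in matcher(doc)]
print([(s.start, s.end) for s in spans])                # [(0, 2), (1, 3)]
print([(s.start, s.end) for s in filter_spans(spans)])  # [(0, 2)]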


Example 14: test_matcher_compare_length

def test_matcher_compare_length(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"LENGTH": {">=": 2}}]
    matcher.add("LENGTH_COMPARE", None, pattern)
    doc = Doc(en_vocab, words=["a", "aa", "aaa"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["a"])
    matches = matcher(doc)
    assert len(matches) == 0
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py


Example 15: test_matcher_regex_shape

def test_matcher_regex_shape(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
    matcher.add("NON_ALPHA", None, pattern)
    doc = Doc(en_vocab, words=["99", "problems", "!"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py


Example 16: test_matcher_set_value

def test_matcher_set_value(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["an", "a"]}}]
    matcher.add("A_OR_AN", None, pattern)
    doc = Doc(en_vocab, words=["an", "a", "apple"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["aardvark"])
    matches = matcher(doc)
    assert len(matches) == 0
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py


Example 17: test_matcher_any_token_operator

def test_matcher_any_token_operator(en_vocab):
    """Test that patterns with "any token" {} work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "test"
    assert matches[1] == "test hello"
    assert matches[2] == "test hello world"
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py


Example 18: matcher

def matcher(en_vocab):
    rules = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    matcher = Matcher(en_vocab)
    for key, patterns in rules.items():
        matcher.add(key, None, *patterns)
    return matcher
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py


Example 19: test_matcher_set_value_operator

def test_matcher_set_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
    matcher.add("DET_HOUSE", None, pattern)
    doc = Doc(en_vocab, words=["In", "a", "house"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["my", "house"])
    matches = matcher(doc)
    assert len(matches) == 1
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py


Example 20: test_matcher_regex

def test_matcher_regex(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
    matcher.add("A_OR_AN", None, pattern)
    doc = Doc(en_vocab, words=["an", "a", "hi"])
    matches = matcher(doc)
    assert len(matches) == 2
    doc = Doc(en_vocab, words=["bye"])
    matches = matcher(doc)
    assert len(matches) == 0
Author: spacy-io | Project: spaCy | Lines: 10 | Source: test_matcher_api.py
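Note that the REGEX predicate in the last example runs against each token's text in isolation. To apply a regular expression across the whole document text, one common approach (a sketch of the usual idiom, not a Matcher feature) is plain re plus Doc.char_span, which maps character offsets back to token boundaries:

import re
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["The", "United", "States", "of", "America"])
expression = r"United States(?: of America)?"
for match in re.finditer(expression, doc.text):
    # char_span returns None if the character offsets don't align with
    # token boundaries.
    span = doc.char_span(match.start(), match.end())
    if span is not None:
        print(span.text)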



Note: the spacy.matcher.Matcher class examples in this article were compiled from source code and documentation hosted on platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors; copyright remains with the original authors, and distribution and use should follow each project's license. Do not reproduce without permission.

