Python matcher.Matcher类代码示例

OGeek|极客世界-中国程序员成长平台 › 门户 › 编程 › Python › Python编程经验

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了Python中spacy.matcher.Matcher类的典型用法代码示例。如果您正苦于以下问题：Python Matcher类的具体用法？Python Matcher怎么用？Python Matcher使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了Matcher类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: RussianTokenizer

class RussianTokenizer(object):
    """Callable component that post-processes a doc using Matcher patterns.

    Matches registered under the 'pattern' string id are merged into a single
    token; tokens inside matches registered under the 'sentence_terminal'
    string id have their sentence-start flag cleared.
    """

    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        """Create the matcher and register whichever pattern groups are given.

        nlp: object exposing ``vocab``; used to build the Matcher and to look
            up the two match keys.
        merge_patterns: optional iterable of patterns whose matches are merged.
        terminal_patterns: optional iterable of patterns whose matched tokens
            lose their sentence-start mark.
        """
        vocab = nlp.vocab
        self.matcher = Matcher(vocab)
        self.token_merge = vocab.strings['pattern']
        self.sentence_terminal = vocab.strings['sentence_terminal']
        pattern_groups = (
            (self.token_merge, merge_patterns),
            (self.sentence_terminal, terminal_patterns),
        )
        for key, patterns in pattern_groups:
            # Empty or missing groups are simply not registered.
            if patterns:
                self.matcher.add(key, None, *patterns)

    def __call__(self, doc):
        """Apply the registered patterns to ``doc`` and return the same doc."""
        to_merge = []
        for match_id, start, end in self.matcher(doc):
            if match_id == self.token_merge:
                # Merging is deferred until every match has been collected.
                to_merge.append(doc[start:end])
            elif match_id == self.sentence_terminal:
                # Clear every sentence-start mark inside the matched span.
                for token in doc[start:end]:
                    if token.sent_start:
                        token.sent_start = False
        for span in to_merge:
            span.merge()
        return doc

开发者ID:aatimofeev，项目名称:spacy_russian_tokenizer，代码行数:26，代码来源:__init__.py

示例2: write_conllu

def write_conllu(docs, file_):
    """Merge "subtok" runs in each doc and write the docs to ``file_`` as CoNLL-U.

    Args:
        docs: Sequence of parsed docs. The first doc's ``vocab`` is used to
            build the matcher. An empty sequence writes nothing.
        file_: Writable text file-like object.

    Raises:
        ValueError: If a token's head lies outside the token's own sentence.
            Surrounding tokens are printed first as a debugging aid.
    """
    if not docs:
        # Nothing to write, and docs[0] below would raise IndexError.
        return
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        # Each span covers the matched "subtok" run plus the one token after it.
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            first_i, last_i = sent[0].i, sent[-1].i
            for k, token in enumerate(sent):
                if not first_i <= token.head.i <= last_i:
                    # Dump up to 10 tokens before the sentence, the sentence
                    # itself, and up to 10 tokens from its last token onwards.
                    # The start is clamped at 0 because a negative slice start
                    # would count from the end of the doc.
                    context = (
                        doc[max(0, first_i - 10) : first_i],
                        sent,
                        doc[last_i : last_i + 10],
                    )
                    for words in context:
                        for word in words:
                            print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")

开发者ID:spacy-io，项目名称:spaCy，代码行数:26，代码来源:ud_train.py

示例3: test_operator_combos

def test_operator_combos(en_vocab):
    """Check combinations of plain and "+" token patterns against short docs.

    Each case is (characters of the doc, space-separated pattern, whether at
    least one match is expected). A part ending in "+" becomes a one-or-more
    pattern on its first character.
    """
    cases = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for string, pattern_str, should_match in cases:
        # A fresh matcher per case keeps patterns from leaking between cases.
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(string))
        pattern = [
            {"ORTH": part[0], "OP": "+"} if part.endswith("+") else {"ORTH": part}
            for part in pattern_str.split()
        ]
        matcher.add("PATTERN", None, pattern)
        found_any = bool(matcher(doc))
        assert found_any is should_match, (string, pattern_str)

开发者ID:spacy-io，项目名称:spaCy，代码行数:34，代码来源:test_matcher_logic.py

示例4: test_issue615

def test_issue615(en_tokenizer):
    """Merging matched phrases from an on_match callback yields a labelled entity."""

    def merge_phrases(matcher, doc, i, matches):
        """Merge every matched phrase, but only when invoked for the last match.

        Merging changes token indices, so all merges are done in one batch
        once the callback is called on the final match.
        """
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        spans = []
        for match_label, start, end in matches:
            spans.append(Span(doc, start, end, label=match_label))
        with doc.retokenize() as retokenizer:
            for span in spans:
                if span.label_:
                    tag = "NNP"
                else:
                    tag = span.root.tag_
                retokenizer.merge(span, attrs={"tag": tag, "lemma": span.text})
                doc.ents = doc.ents + (span,)

    label = "Sport_Equipment"
    doc = en_tokenizer("The golf club is broken")
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, [{"ORTH": "golf"}, {"ORTH": "club"}])
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0

开发者ID:spacy-io，项目名称:spaCy，代码行数:25，代码来源:test_issue1-1000.py

示例5: test_matcher_match_zero_plus

def test_matcher_match_zero_plus(matcher):
    """A quoted run matched with a "*" token pattern produces exactly one match."""
    # Only the fixture's vocab is used; the patterns live on a fresh Matcher.
    quote_matcher = Matcher(matcher.vocab)
    quote_matcher.add(
        "Quote",
        None,
        [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}],
    )
    tokens = 'He said , " some words " ...'.split()
    doc = Doc(quote_matcher.vocab, words=tokens)
    n_matches = len(quote_matcher(doc))
    assert n_matches == 1

开发者ID:spacy-io，项目名称:spaCy，代码行数:7，代码来源:test_matcher_api.py

示例6: write_conllu

def write_conllu(docs, file_):
    """Merge "subtok" runs in each doc and write the docs to ``file_`` as CoNLL-U.

    Args:
        docs: Sequence of parsed docs. The first doc's ``vocab`` is used to
            build the matcher. An empty sequence writes nothing.
        file_: Writable text file-like object.

    Raises:
        ValueError: If a sentence has no token that is its own head with the
            dependency label "ROOT". The sentence is printed first as a
            debugging aid.
    """
    if not docs:
        # Nothing to write, and docs[0] below would raise IndexError.
        return
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        # Each span covers the matched "subtok" run plus the one token after it.
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        # Any token that is its own head is relabelled as the root.
        for word in doc:
            if word.i == word.head.i:
                word.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
            file_.write("\n")
            # The sentence is validated after it has been written, as before.
            has_root = any(
                word.head.i == word.i and word.dep_ == "ROOT" for word in sent
            )
            if not has_root:
                print("Rootless sentence!")
                print(sent)
                print(i)
                for w in sent:
                    print(w.i, w.text, w.head.text, w.head.i, w.dep_)
                raise ValueError("Rootless sentence in doc {i}".format(i=i))

开发者ID:spacy-io，项目名称:spaCy，代码行数:30，代码来源:ud_run_test.py

示例7: test_issue3555

def test_issue3555(en_vocab):
    """Matching on a custom extension whose default is None must not raise."""
    Token.set_extension("issue3555", default=None)
    extension_pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, extension_pattern)
    # No assertion: the test passes as long as running the matcher succeeds.
    matcher(doc)

开发者ID:spacy-io，项目名称:spaCy，代码行数:8，代码来源:test_issue3555.py

示例8: test_issue1883

def test_issue1883():
    """A deep-copied Matcher still finds the same single match as the original."""

    def count_hello_matches(m):
        # Build the doc from the matcher's own vocab, then count its matches.
        return len(m(Doc(m.vocab, words=["hello"])))

    original = Matcher(Vocab())
    original.add("pat1", None, [{"orth": "hello"}])
    assert count_hello_matches(original) == 1
    clone = copy.deepcopy(original)
    assert count_hello_matches(clone) == 1

开发者ID:spacy-io，项目名称:spaCy，代码行数:8，代码来源:test_issue1501-2000.py

示例9: test_matcher_operator_shadow

def test_matcher_operator_shadow(en_vocab):
    """An "a", one-or-more alphabetic tokens, "c" pattern matches "a b c" once."""
    matcher = Matcher(en_vocab)
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    matcher.add(
        "A.C",
        None,
        [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}],
    )
    matches = matcher(doc)
    assert len(matches) == 1
    # The single match must span the whole three-token doc.
    _, start, end = matches[0]
    assert (start, end) == (0, 3)

开发者ID:spacy-io，项目名称:spaCy，代码行数:8，代码来源:test_matcher_api.py

示例10: test_match_consuming

def test_match_consuming(doc, text, pattern, re_pattern):
    """Check that the matcher yields as many matches as ``re.finditer`` does,
    i.e. that it consumes tokens on a match similar to re.findall."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    n_token_matches = len(matcher(doc))
    n_regex_matches = sum(1 for _ in re.finditer(re_pattern, text))
    assert n_token_matches == n_regex_matches

开发者ID:spacy-io，项目名称:spaCy，代码行数:8，代码来源:test_matcher_logic.py

示例11: test_issue_1971_2

def test_issue_1971_2(en_vocab):
    """Two patterns under one key each match once in "EUR 10 is 10 EUR"."""
    words = ["EUR", "10", "is", "10", "EUR"]
    currency_then_number = [
        {"ORTH": "EUR", "LOWER": {"IN": ["eur"]}},
        {"LIKE_NUM": True},
    ]
    number_then_currency = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=words)
    matcher.add("TEST1", None, currency_then_number, number_then_currency)
    assert len(matcher(doc)) == 2

开发者ID:spacy-io，项目名称:spaCy，代码行数:8，代码来源:test_issue1501-2000.py

示例12: test_greedy_matching

def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other re implementations."""
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    matches = matcher(doc)
    expected_spans = [m.span() for m in re.finditer(re_pattern, text)]
    # Matches are compared pairwise; zip stops at the shorter sequence.
    for (_, start, end), expected in zip(matches, expected_spans):
        assert (start, end) == expected

开发者ID:spacy-io，项目名称:spaCy，代码行数:9，代码来源:test_matcher_logic.py

示例13: test_issue1945

def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    # Two overlapping matches are expected: tokens 0-1 and tokens 1-2.
    token_ranges = [match[1:] for match in matcher(doc)]
    assert token_ranges == [(0, 2), (1, 3)]

开发者ID:spacy-io，项目名称:spaCy，代码行数:9，代码来源:test_issue1501-2000.py