本文整理汇总了Python中spacy.matcher.Matcher类的典型用法代码示例。如果您正苦于以下问题:Python Matcher类的具体用法?Python Matcher怎么用?Python Matcher使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Matcher类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: RussianTokenizer
class RussianTokenizer(object):
    """Callable component that post-processes a doc using two Matcher rules.

    Spans matched by the "pattern" rule are merged; tokens inside spans
    matched by the "sentence_terminal" rule lose their sentence-start mark.
    """

    name = 'russian_tokenizer'

    def __init__(self, nlp, merge_patterns=None, terminal_patterns=None):
        """Create the matcher and register whichever pattern lists are given.

        Args:
            nlp: Object exposing ``vocab`` (used for the matcher and for the
                rule keys looked up in ``vocab.strings``).
            merge_patterns: Optional patterns whose matches are merged.
            terminal_patterns: Optional patterns whose matches have their
                sentence-start marks cleared.
        """
        vocab = nlp.vocab
        self.matcher = Matcher(vocab)
        self.token_merge = vocab.strings['pattern']
        self.sentence_terminal = vocab.strings['sentence_terminal']
        rule_sets = (
            (self.token_merge, merge_patterns),
            (self.sentence_terminal, terminal_patterns),
        )
        for rule_key, rule_patterns in rule_sets:
            # A missing or empty pattern list registers nothing for that rule.
            if rule_patterns:
                self.matcher.add(rule_key, None, *rule_patterns)

    def __call__(self, doc):
        """Apply both rules to ``doc`` in place and return the same doc."""
        to_merge = []
        for match_id, begin, stop in self.matcher(doc):
            if match_id == self.token_merge:
                # Merging is deferred until every match has been inspected.
                to_merge.append(doc[begin:stop])
                continue
            if match_id != self.sentence_terminal:
                continue
            # Remove all sentence start marks from the span that matched.
            for tok in doc[begin:stop]:
                if tok.sent_start:
                    tok.sent_start = False
        # NOTE(review): Span.merge() appears to be deprecated in newer spaCy
        # releases in favour of Doc.retokenize() -- confirm before upgrading.
        for pending in to_merge:
            pending.merge()
        return doc
开发者ID:aatimofeev,项目名称:spacy_russian_tokenizer,代码行数:26,代码来源:__init__.py
示例2: write_conllu
def write_conllu(docs, file_):
    """Merge "subtok" runs in each doc and write all docs to ``file_``.

    For every doc a ``# newdoc id`` header is written; for every sentence a
    ``# sent_id`` and ``# text`` header, one line per token (taken from the
    token's ``get_conllu_lines`` extension) and a terminating blank line.

    Args:
        docs: Sequence of parsed docs. The vocab of the first doc is used to
            build the matcher. An empty sequence writes nothing.
        file_: Writable text file-like object.

    Raises:
        ValueError: If a token's head lies outside the token's own sentence.
            Nearby tokens are printed to stdout before raising.
    """
    if not docs:
        # Nothing to write; also avoids an IndexError on docs[0].
        return
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        # Each merged span extends one token past the matched "subtok" run.
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            first_i = sent[0].i
            last_i = sent[-1].i
            for k, token in enumerate(sent):
                if token.head.i > last_i or token.head.i < first_i:
                    # Dump up to 10 tokens of context on each side before
                    # failing. The start is clamped at 0 because a negative
                    # slice start would otherwise count from the end of the
                    # doc and select the wrong tokens.
                    for word in doc[max(0, first_i - 10) : first_i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in doc[last_i : last_i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text
                    )
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
开发者ID:spacy-io,项目名称:spaCy,代码行数:26,代码来源:ud_train.py
示例3: test_operator_combos
def test_operator_combos(en_vocab):
    """Check patterns mixing plain tokens and "+" tokens against short docs.

    Each case is (document characters, space-separated pattern, expected
    match/no-match). A pattern part ending in "+" becomes an ORTH token with
    the "+" operator; any other part becomes a plain ORTH token.
    """
    expectations = [
        ("aaab", "a a a b", True),
        ("aaab", "a+ b", True),
        ("aaab", "a+ a+ b", True),
        ("aaab", "a+ a+ a b", True),
        ("aaab", "a+ a+ a+ b", True),
        ("aaab", "a+ a a b", True),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaa", "a+ b", False),
        ("aaa", "a+ a+ b", False),
        ("aaa", "a+ a+ a+ b", False),
        ("aaa", "a+ a b", False),
        ("aaa", "a+ a a b", False),
        ("aaab", "a+ a a", True),
        ("aaab", "a+", True),
        ("aaab", "a+ a b", True),
    ]
    for text, spec, should_match in expectations:
        matcher = Matcher(en_vocab)
        doc = Doc(matcher.vocab, words=list(text))
        pattern = [
            {"ORTH": piece[0], "OP": "+"} if piece.endswith("+") else {"ORTH": piece}
            for piece in spec.split()
        ]
        matcher.add("PATTERN", None, pattern)
        found = matcher(doc)
        # The (text, spec) pair is the failure message identifying the case.
        assert bool(found) == should_match, (text, spec)
开发者ID:spacy-io,项目名称:spaCy,代码行数:34,代码来源:test_matcher_logic.py
示例4: test_issue615
def test_issue615(en_tokenizer):
    """A matcher callback that merges phrases must leave a labelled entity."""

    def merge_phrases(matcher, doc, i, matches):
        """Merge every matched phrase, but only when called for the last match.

        Merging changes the token indices, so all phrases are merged in one
        go once the callback fires for the final match.
        """
        is_last_match = i == len(matches) - 1
        if not is_last_match:
            return None
        phrases = []
        for match_label, begin, stop in matches:
            phrases.append(Span(doc, begin, stop, label=match_label))
        with doc.retokenize() as retokenizer:
            for phrase in phrases:
                if phrase.label_:
                    tag = "NNP"
                else:
                    tag = phrase.root.tag_
                retokenizer.merge(phrase, attrs={"tag": tag, "lemma": phrase.text})
                doc.ents = doc.ents + (phrase,)

    label = "Sport_Equipment"
    doc = en_tokenizer("The golf club is broken")
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, [{"ORTH": "golf"}, {"ORTH": "club"}])
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
开发者ID:spacy-io,项目名称:spaCy,代码行数:25,代码来源:test_issue1-1000.py
示例5: test_matcher_match_zero_plus
def test_matcher_match_zero_plus(matcher):
    """A quoted run of non-punctuation tokens ("*" operator) matches once.

    Only the vocab of the incoming ``matcher`` fixture is reused; the rule is
    registered on a fresh Matcher.
    """
    vocab = matcher.vocab
    quote_matcher = Matcher(vocab)
    quote_matcher.add(
        "Quote",
        None,
        [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}],
    )
    doc = Doc(vocab, words='He said , " some words " ...'.split())
    found = quote_matcher(doc)
    assert len(found) == 1
开发者ID:spacy-io,项目名称:spaCy,代码行数:7,代码来源:test_matcher_api.py
示例6: write_conllu
def write_conllu(docs, file_):
    """Merge "subtok" runs in each doc and write all docs to ``file_``.

    For every doc a ``# newdoc id`` header is written; for every sentence a
    ``# sent_id`` and ``# text`` header, one line per token (built by
    ``_get_token_conllu``) and a terminating blank line. After a sentence is
    written it is checked for a root token.

    Args:
        docs: Sequence of parsed docs. The vocab of the first doc is used to
            build the matcher. An empty sequence writes nothing.
        file_: Writable text file-like object.

    Raises:
        ValueError: If a sentence has no token that is its own head with the
            dependency label "ROOT". The sentence is printed to stdout first.
    """
    if not docs:
        # Nothing to write; also avoids an IndexError on docs[0].
        return
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = merger(doc)
        # Each merged span extends one token past the matched "subtok" run.
        spans = [doc[start : end + 1] for _, start, end in matches]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
        # TODO: This shouldn't be necessary? Should be handled in merge
        for word in doc:
            if word.i == word.head.i:
                word.dep_ = "ROOT"
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            sent_len = len(sent)
            for k, token in enumerate(sent):
                file_.write(_get_token_conllu(token, k, sent_len) + "\n")
            file_.write("\n")
            has_root = any(
                word.head.i == word.i and word.dep_ == "ROOT" for word in sent
            )
            if not has_root:
                print("Rootless sentence!")
                print(sent)
                print(i)
                for w in sent:
                    print(w.i, w.text, w.head.text, w.head.i, w.dep_)
                # Carry the location in the exception instead of raising bare.
                raise ValueError(
                    "Rootless sentence in doc {i}, sentence {j}".format(i=i, j=j)
                )
开发者ID:spacy-io,项目名称:spaCy,代码行数:30,代码来源:ud_run_test.py
示例7: test_issue3555
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher.

    The test passes as long as running the matcher does not raise.
    """
    Token.set_extension("issue3555", default=None)
    extension_matcher = Matcher(en_vocab)
    extension_matcher.add(
        "TEST",
        None,
        [{"LEMMA": "have"}, {"_": {"issue3555": True}}],
    )
    words = ["have", "apple"]
    extension_matcher(Doc(en_vocab, words=words))
开发者ID:spacy-io,项目名称:spaCy,代码行数:8,代码来源:test_issue3555.py
示例8: test_issue1883
def test_issue1883():
    """A deep copy of a Matcher finds the same single match as the original."""
    original = Matcher(Vocab())
    original.add("pat1", None, [{"orth": "hello"}])
    original_hits = original(Doc(original.vocab, words=["hello"]))
    assert len(original_hits) == 1
    clone = copy.deepcopy(original)
    # The copy is exercised against a doc built from its own vocab.
    clone_hits = clone(Doc(clone.vocab, words=["hello"]))
    assert len(clone_hits) == 1
开发者ID:spacy-io,项目名称:spaCy,代码行数:8,代码来源:test_issue1501-2000.py
示例9: test_matcher_operator_shadow
def test_matcher_operator_shadow(en_vocab):
    """An "a", one-or-more alphabetic tokens, "c" pattern yields one match
    covering tokens 0 to 3 of the doc "a b c"."""
    matcher = Matcher(en_vocab)
    doc = Doc(matcher.vocab, words=["a", "b", "c"])
    matcher.add(
        "A.C",
        None,
        [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}],
    )
    found = matcher(doc)
    assert len(found) == 1
    # Drop the leading match id and compare only the (start, end) bounds.
    bounds = found[0][1:]
    assert bounds == (0, 3)
开发者ID:spacy-io,项目名称:spaCy,代码行数:8,代码来源:test_matcher_api.py
示例10: test_match_consuming
def test_match_consuming(doc, text, pattern, re_pattern):
    """Test that matcher.__call__ consumes tokens on a match similar to
    re.findall.

    The number of token matches must equal the number of regex matches of
    ``re_pattern`` in ``text``.
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    token_matches = matcher(doc)
    regex_spans = []
    for found in re.finditer(re_pattern, text):
        regex_spans.append(found.span())
    assert len(token_matches) == len(regex_spans)
开发者ID:spacy-io,项目名称:spaCy,代码行数:8,代码来源:test_matcher_logic.py
示例11: test_issue_1971_2
def test_issue_1971_2(en_vocab):
    """Two patterns registered under one key give two matches in total on
    the doc "EUR 10 is 10 EUR"."""
    matcher = Matcher(en_vocab)
    currency_first = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    number_first = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, currency_first, number_first)
    found = matcher(doc)
    assert len(found) == 2
开发者ID:spacy-io,项目名称:spaCy,代码行数:8,代码来源:test_issue1501-2000.py
示例12: test_greedy_matching
def test_greedy_matching(doc, text, pattern, re_pattern):
    """Test that the greedy matching behavior of the * op is consistent with
    other re implementations.

    Token matches and regex matches are compared pairwise by their
    (start, end) bounds; surplus items on either side are not compared.
    """
    matcher = Matcher(doc.vocab)
    matcher.add(re_pattern, None, pattern)
    token_matches = matcher(doc)
    regex_spans = [found.span() for found in re.finditer(re_pattern, text)]
    for token_match, regex_span in zip(token_matches, regex_spans):
        bounds = token_match[1:]
        assert bounds == regex_span
开发者ID:spacy-io,项目名称:spaCy,代码行数:9,代码来源:test_matcher_logic.py
示例13: test_issue1945
def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
    doc = Doc(matcher.vocab, words=["a", "a", "a"])
    # Two overlapping matches are expected: tokens 0-2 and tokens 1-3.
    bounds = [match[1:] for match in matcher(doc)]
    assert len(bounds) == 2
    first, second = bounds
    assert first == (0, 2)
    assert second == (1, 3)
开发者ID:spacy-io,项目名称:spaCy,代码行数:9,代码来源:test_issue1501-2000.py
示例14: test_matcher_compare_length
def test_matcher_compare_length(en_vocab):
    """A LENGTH ">= 2" pattern matches only tokens of two or more characters
    in the sample docs."""
    matcher = Matcher(en_vocab)
    matcher.add("LENGTH_COMPARE", None, [{"LENGTH": {">=": 2}}])
    expected_counts = (
        (["a", "aa", "aaa"], 2),
        (["a"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        assert len(matcher(doc)) == expected
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
示例15: test_matcher_regex_shape
def test_matcher_regex_shape(en_vocab):
    """A REGEX on SHAPE excluding "x" gives two matches for "99 problems !"
    and none for "bye"."""
    matcher = Matcher(en_vocab)
    matcher.add("NON_ALPHA", None, [{"SHAPE": {"REGEX": r"^[^x]+$"}}])

    def count_matches(words):
        # Build a fresh doc from the words and count the matcher's hits.
        return len(matcher(Doc(en_vocab, words=words)))

    assert count_matches(["99", "problems", "!"]) == 2
    assert count_matches(["bye"]) == 0
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
示例16: test_matcher_set_value
def test_matcher_set_value(en_vocab):
    """An ORTH "IN" set of "an"/"a" gives two matches for "an a apple" and
    none for "aardvark"."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", None, [{"ORTH": {"IN": ["an", "a"]}}])
    expected_counts = [
        (["an", "a", "apple"], 2),
        (["aardvark"], 0),
    ]
    for words, expected in expected_counts:
        found = matcher(Doc(en_vocab, words=words))
        assert len(found) == expected
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
示例17: test_matcher_any_token_operator
def test_matcher_any_token_operator(en_vocab):
    """Test that patterns with "any token" {} work with operators."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
    doc = Doc(en_vocab, words=["test", "hello", "world"])
    texts = []
    for _, begin, stop in matcher(doc):
        texts.append(doc[begin:stop].text)
    assert len(texts) == 3
    # The matches are expected in order of increasing length.
    expected_texts = ("test", "test hello", "test hello world")
    for position, expected in enumerate(expected_texts):
        assert texts[position] == expected
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
示例18: matcher
def matcher(en_vocab):
    """Return a Matcher loaded with the "JS", "GoogleNow" and "Java" rules."""
    rule_table = {
        "JS": [[{"ORTH": "JavaScript"}]],
        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
        "Java": [[{"LOWER": "java"}]],
    }
    rule_matcher = Matcher(en_vocab)
    for rule_key in rule_table:
        # Each rule maps to a list of patterns, passed as separate arguments.
        rule_matcher.add(rule_key, None, *rule_table[rule_key])
    return rule_matcher
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
示例19: test_matcher_set_value_operator
def test_matcher_set_value_operator(en_vocab):
    """An optional "a"/"the" before "house" gives two matches for
    "In a house" and one for "my house"."""
    matcher = Matcher(en_vocab)
    optional_determiner = {"ORTH": {"IN": ["a", "the"]}, "OP": "?"}
    matcher.add("DET_HOUSE", None, [optional_determiner, {"ORTH": "house"}])

    with_determiner = Doc(en_vocab, words=["In", "a", "house"])
    assert len(matcher(with_determiner)) == 2

    without_determiner = Doc(en_vocab, words=["my", "house"])
    assert len(matcher(without_determiner)) == 1
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
示例20: test_matcher_regex
def test_matcher_regex(en_vocab):
    """A REGEX on ORTH for "a"/"an" gives two matches for "an a hi" and none
    for "bye"."""
    matcher = Matcher(en_vocab)
    matcher.add("A_OR_AN", None, [{"ORTH": {"REGEX": r"(?:a|an)"}}])
    expected_counts = (
        (["an", "a", "hi"], 2),
        (["bye"], 0),
    )
    for words, expected in expected_counts:
        doc = Doc(en_vocab, words=words)
        found = matcher(doc)
        assert len(found) == expected
开发者ID:spacy-io,项目名称:spaCy,代码行数:10,代码来源:test_matcher_api.py
注:本文中的spacy.matcher.Matcher类示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论