This article collects and summarizes typical usage examples of the spacy.tokens.Doc class in Python. If you have been wondering what the Doc class does, how to use it, or where to find examples of it in action, the curated class examples below should help.
The 20 Doc examples shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
Example 1: test_doc_array_to_from_string_attrs
def test_doc_array_to_from_string_attrs(en_vocab, attrs):
"""Test that both Doc.to_array and Doc.from_array accept string attrs,
as well as single attrs and sequences of attrs.
"""
words = ["An", "example", "sentence"]
doc = Doc(en_vocab, words=words)
Doc(en_vocab, words=words).from_array(attrs, doc.to_array(attrs))
Developer: spacy-io | Project: spaCy | Lines: 7 | Source: test_array.py
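In the test above, attrs is supplied by a pytest fixture, so its concrete values are not visible here. As a rough sketch (these particular values are illustrative assumptions, not the fixture's), the accepted forms look like this:

# Illustrative forms accepted by Doc.to_array/from_array: single string
# attrs or sequences of string attrs (values here are assumptions).
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["An", "example", "sentence"])
single = doc.to_array("LOWER")            # a single string attr gives a 1-D array
multi = doc.to_array(["ORTH", "SHAPE"])   # a sequence of strings gives a 2-D array
doc2 = Doc(doc.vocab, words=["An", "example", "sentence"])
doc2.from_array(["ORTH", "SHAPE"], multi)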
Example 2: __init__
def __init__(self, nlp, label='GPE'):
"""Initialise the pipeline component. The shared nlp instance is used
to initialise the matcher with the shared vocab, get the label ID and
generate Doc objects as phrase match patterns.
"""
# Make request once on initialisation and store the data
r = requests.get('https://restcountries.eu/rest/v2/all')
r.raise_for_status() # make sure requests raises an error if it fails
countries = r.json()
# Convert API response to dict keyed by country name for easy lookup
# This could also be extended using the alternative and foreign language
# names provided by the API
self.countries = {c['name']: c for c in countries}
self.label = nlp.vocab.strings[label] # get entity label ID
# Set up the PhraseMatcher with Doc patterns for each country name
patterns = [nlp(c) for c in self.countries.keys()]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add('COUNTRIES', None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
    # If no default value is set, the attribute defaults to None. Newer
    # spaCy versions require an explicit keyword, so default=None is passed.
    Token.set_extension('is_country', default=False)
    Token.set_extension('country_capital', default=None)
    Token.set_extension('country_latlng', default=None)
    Token.set_extension('country_flag', default=None)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True.
Doc.set_extension('has_country', getter=self.has_country)
Span.set_extension('has_country', getter=self.has_country)
Developer: AvinashGupta | Project: spaCy | Lines: 33 | Source: custom_component_countries_api.py
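A minimal registration sketch in the spaCy v2 pipeline style, to show how a component like this is used. The class name RESTCountriesComponent is an assumption; it is not visible in the snippet above:

import spacy

nlp = spacy.load('en_core_web_sm')
# RESTCountriesComponent is the assumed name of the class whose __init__ is shown above
component = RESTCountriesComponent(nlp, label='GPE')
nlp.add_pipe(component, last=True)  # spaCy v2-style pipeline registration
doc = nlp(u'A text about Colombia and the Czech Republic')
print(doc._.has_country)  # True if any token matched a country name
for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital)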
Example 3: get_doc
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
"""Create Doc object from given vocab, words and annotations."""
pos = pos or [""] * len(words)
tags = tags or [""] * len(words)
heads = heads or [0] * len(words)
deps = deps or [""] * len(words)
for value in deps + tags + pos:
vocab.strings.add(value)
doc = Doc(vocab, words=words)
attrs = doc.to_array([POS, HEAD, DEP])
for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
attrs[i, 0] = doc.vocab.strings[p]
attrs[i, 1] = head
attrs[i, 2] = doc.vocab.strings[dep]
doc.from_array([POS, HEAD, DEP], attrs)
if ents:
doc.ents = [
Span(doc, start, end, label=doc.vocab.strings[label])
for start, end, label in ents
]
if tags:
for token in doc:
token.tag_ = tags[token.i]
return doc
Developer: spacy-io | Project: spaCy | Lines: 25 | Source: util.py
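A brief illustrative call, not from the spaCy test suite. Note that the HEAD column in to_array/from_array holds relative offsets, so each entry in heads is the head's position relative to the token itself:

# Illustrative usage of the helper above (assumed inputs):
from spacy.vocab import Vocab

doc = get_doc(
    Vocab(),
    words=["She", "runs"],
    pos=["PRON", "VERB"],
    heads=[1, 0],            # relative offsets: "She" -> "runs", "runs" is the root
    deps=["nsubj", "ROOT"],
)
assert doc[0].head.i == 1 and doc[1].dep_ == "ROOT"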
Example 4: test_doc_array_attr_of_token
def test_doc_array_attr_of_token(en_vocab):
doc = Doc(en_vocab, words=["An", "example", "sentence"])
example = doc.vocab["example"]
assert example.orth != example.shape
feats_array = doc.to_array((ORTH, SHAPE))
assert feats_array[0][0] != feats_array[0][1]
Developer: spacy-io | Project: spaCy | Lines: 7 | Source: test_array.py
Example 5: test_doc_api_similarity_match
def test_doc_api_similarity_match():
doc = Doc(Vocab(), words=["a"])
assert doc.similarity(doc[0]) == 1.0
assert doc.similarity(doc.vocab["a"]) == 1.0
doc2 = Doc(doc.vocab, words=["a", "b", "c"])
with pytest.warns(ModelsWarning):
assert doc.similarity(doc2[:1]) == 1.0
assert doc.similarity(doc2) == 0.0
Developer: spacy-io | Project: spaCy | Lines: 8 | Source: test_doc_api.py
Example 6: test_doc_retokenize_merge_extension_attrs_invalid
def test_doc_retokenize_merge_extension_attrs_invalid(en_vocab, underscore_attrs):
Token.set_extension("a", getter=lambda x: x, force=True)
Token.set_extension("b", method=lambda x: x, force=True)
doc = Doc(en_vocab, words=["hello", "world", "!"])
attrs = {"_": underscore_attrs}
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[0:2], attrs=attrs)
Developer: spacy-io | Project: spaCy | Lines: 8 | Source: test_retokenize_merge.py
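For contrast, underscore attributes registered with a default (and therefore writable) can be set during a merge; a minimal sketch, not part of the original test:

# Writable extensions (registered with a default) may be set via attrs:
from spacy.tokens import Doc, Token
from spacy.vocab import Vocab

Token.set_extension("c", default=False, force=True)
doc = Doc(Vocab(), words=["hello", "world", "!"])
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2], attrs={"_": {"c": True}})
assert doc[0]._.c is True
assert doc[0].text == "hello world"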
Example 7: test_issue1547
def test_issue1547():
"""Test that entity labels still match after merging tokens."""
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
doc = Doc(Vocab(), words=words)
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[5:7])
assert [ent.text for ent in doc.ents]
Developer: spacy-io | Project: spaCy | Lines: 8 | Source: test_issue1501-2000.py
Example 8: test_serialize_empty_doc
def test_serialize_empty_doc(en_vocab):
doc = Doc(en_vocab)
data = doc.to_bytes()
doc2 = Doc(en_vocab)
doc2.from_bytes(data)
assert len(doc) == len(doc2)
for token1, token2 in zip(doc, doc2):
assert token1.text == token2.text
Developer: spacy-io | Project: spaCy | Lines: 8 | Source: test_serialize_doc.py
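The same bytes round-trip works for a non-empty Doc; a short illustrative sketch:

# Illustrative round-trip for a non-empty Doc:
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["hello", "world"])
restored = Doc(doc.vocab).from_bytes(doc.to_bytes())
assert [t.text for t in restored] == ["hello", "world"]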
Example 9: test_underscore_dir
def test_underscore_dir(en_vocab):
"""Test that dir() correctly returns extension attributes. This enables
things like tab-completion for the attributes in doc._."""
Doc.set_extension("test_dir", default=None)
doc = Doc(en_vocab, words=["hello", "world"])
assert "_" in dir(doc)
assert "test_dir" in dir(doc._)
assert "test_dir" not in dir(doc[0]._)
assert "test_dir" not in dir(doc[0:2]._)
Developer: spacy-io | Project: spaCy | Lines: 9 | Source: test_underscore.py
Example 10: test_doc_to_json_underscore
def test_doc_to_json_underscore(doc):
Doc.set_extension("json_test1", default=False)
Doc.set_extension("json_test2", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
assert "_" in json_doc
assert json_doc["_"]["json_test1"] == "hello world"
assert json_doc["_"]["json_test2"] == [1, 2, 3]
Developer: spacy-io | Project: spaCy | Lines: 9 | Source: test_to_json.py
Example 11: test_doc_retokenize_split_extension_attrs_invalid
def test_doc_retokenize_split_extension_attrs_invalid(en_vocab, underscore_attrs):
Token.set_extension("x", default=False, force=True)
Token.set_extension("a", getter=lambda x: x, force=True)
Token.set_extension("b", method=lambda x: x, force=True)
doc = Doc(en_vocab, words=["LosAngeles", "start"])
attrs = {"_": underscore_attrs}
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
heads = [(doc[0], 1), doc[1]]
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
Developer: spacy-io | Project: spaCy | Lines: 10 | Source: test_retokenize_split.py
Example 12: test_doc_retokenize_split_heads_error
def test_doc_retokenize_split_heads_error(en_vocab):
doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
# Not enough heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1]])
# Too many heads
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["Los", "Angeles"], [doc[1], doc[1], doc[1]])
Developer: spacy-io | Project: spaCy | Lines: 11 | Source: test_retokenize_split.py
Example 13: test_doc_add_entities_set_ents_iob
def test_doc_add_entities_set_ents_iob(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
ner = EntityRecognizer(en_vocab)
ner.begin_training([])
ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
Developer: spacy-io | Project: spaCy | Lines: 11 | Source: test_ner.py
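As the test shows, doc.ents accepts (label_id, start, end) tuples; Span objects work as well, as other examples on this page use them. A small sketch of the Span form:

# Equivalent entity assignment using a Span instead of a tuple:
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["This", "is", "a", "lion"])
doc.ents = [Span(doc, 3, 4, label=doc.vocab.strings.add("ANIMAL"))]
assert doc[3].ent_type_ == "ANIMAL" and doc[3].ent_iob_ == "B"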
Example 14: test_doc_retokenize_split_orths_mismatch
def test_doc_retokenize_split_orths_mismatch(en_vocab):
"""Test that the regular retokenizer.split raises an error if the orths
don't match the original token text. There might still be a method that
allows this, but for the default use cases, merging and splitting should
always conform with spaCy's non-destructive tokenization policy. Otherwise,
it can lead to very confusing and unexpected results.
"""
doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["L", "A"], [(doc[0], 0), (doc[0], 0)])
Developer: spacy-io | Project: spaCy | Lines: 11 | Source: test_retokenize_split.py
Example 15: test_spans_override_sentiment
def test_spans_override_sentiment(en_tokenizer):
"""Test span.sentiment property's default averaging behaviour"""
text = "good stuff bad stuff"
tokens = en_tokenizer(text)
tokens.vocab[tokens[0].text].sentiment = 3.0
tokens.vocab[tokens[2].text].sentiment = -2.0
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
doc.user_span_hooks["sentiment"] = lambda span: 10.0
assert doc[:2].sentiment == 10.0
assert doc[-2:].sentiment == 10.0
assert doc[:-1].sentiment == 10.0
Developer: spacy-io | Project: spaCy | Lines: 11 | Source: test_span.py
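Besides user_span_hooks, spaCy v2 exposes user_hooks at the Doc level and user_token_hooks at the Token level, following the same pattern; a brief sketch:

# Doc- and Token-level hooks follow the same pattern as user_span_hooks:
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["good", "stuff"])
doc.user_hooks["sentiment"] = lambda d: 5.0        # overrides doc.sentiment
doc.user_token_hooks["sentiment"] = lambda t: 1.0  # overrides token.sentiment
assert doc.sentiment == 5.0
assert doc[0].sentiment == 1.0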
Example 16: test_underscore_docstring
def test_underscore_docstring(en_vocab):
"""Test that docstrings are available for extension methods, even though
they're partials."""
def test_method(doc, arg1=1, arg2=2):
"""I am a docstring"""
return (arg1, arg2)
Doc.set_extension("test_docstrings", method=test_method)
doc = Doc(en_vocab, words=["hello", "world"])
assert test_method.__doc__ == "I am a docstring"
assert doc._.test_docstrings.__doc__.rsplit(". ")[-1] == "I am a docstring"
Developer: spacy-io | Project: spaCy | Lines: 12 | Source: test_underscore.py
Example 17: test_doc_retokenize_split_dependencies
def test_doc_retokenize_split_dependencies(en_vocab):
doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
dep1 = doc.vocab.strings.add("amod")
dep2 = doc.vocab.strings.add("subject")
with doc.retokenize() as retokenizer:
retokenizer.split(
doc[0],
["Los", "Angeles"],
[(doc[0], 1), doc[1]],
attrs={"dep": [dep1, dep2]},
)
assert doc[0].dep == dep1
assert doc[1].dep == dep2
Developer: spacy-io | Project: spaCy | Lines: 13 | Source: test_retokenize_split.py
Example 18: test_sbd_serialization_projective
def test_sbd_serialization_projective(EN):
"""
    Test that sentence boundaries are the same before and after serialization.
"""
example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
EN.tagger(example)
apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
assert example.to_bytes() == example_serialized.to_bytes()
assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
Developer: Arttii | Project: spaCy | Lines: 13 | Source: test_sbd.py
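Note that tokenizer.tokens_from_list comes from a very early spaCy version; in spaCy v2 the equivalent is constructing the Doc directly from a word list:

# spaCy v2 equivalent of the tokens_from_list call above:
words = u"I bought a couch from IKEA. It was n't very comfortable .".split(' ')
example = Doc(EN.vocab, words=words)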
Example 19: test_doc_is_nered
def test_doc_is_nered(en_vocab):
words = ["I", "live", "in", "New", "York"]
doc = Doc(en_vocab, words=words)
assert not doc.is_nered
doc.ents = [Span(doc, 3, 5, label="GPE")]
assert doc.is_nered
# Test creating doc from array with unknown values
arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
assert doc.is_nered
# Test serialization
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
assert new_doc.is_nered
Developer: spacy-io | Project: spaCy | Lines: 13 | Source: test_doc_api.py
Example 20: test_doc_retokenize_spans_entity_split_iob
def test_doc_retokenize_spans_entity_split_iob():
    # Test that entity IOB stays consistent after splitting
words = ["abc", "d", "e"]
doc = Doc(Vocab(), words=words)
doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
with doc.retokenize() as retokenizer:
retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
assert doc[2].ent_iob_ == "I"
assert doc[3].ent_iob_ == "I"
Developer: spacy-io | Project: spaCy | Lines: 13 | Source: test_retokenize_split.py
Note: The spacy.tokens.Doc class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets are drawn from open-source projects contributed by various developers; copyright remains with the original authors. Please consult each project's license before using or redistributing the code, and do not republish without permission.