This article collects typical usage examples of the Python function nltk.corpus.conll2000.chunked_sents. If you have been wondering what chunked_sents does, how to call it, and what real-world usage looks like, the curated code examples below should help.
The article presents 20 code examples of chunked_sents, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python examples.
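The snippets below come from real projects and usually omit their module-level imports (typically import nltk and from nltk.corpus import conll2000); they also assume the corpus data has already been downloaded. As a minimal, illustrative sketch of the basic call (not taken from any of the projects below):

import nltk
from nltk.corpus import conll2000

# One-time download of the CoNLL-2000 chunking corpus; skip if already installed.
nltk.download('conll2000')

# The corpus ships two files, 'train.txt' and 'test.txt'.
# chunk_types=['NP'] keeps only noun-phrase chunks in the returned trees.
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
print(train_sents[0])  # an nltk.Tree whose NP subtrees are the chunks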
Example 1: main

def main():
    # CoNLL IOB triples (word, POS, chunk) for the NP-only training data.
    train_sents = (nltk.chunk.tree2conlltags(s) for s in conll2000.chunked_sents('train.txt', chunk_types=['NP']))
    # test_sents = (nltk.chunk.tree2conlltags(s) for s in conll2000.chunked_sents('test.txt', chunk_types=['NP']))
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    fd = np_tags_fd(train_sents)
    print_frequencies(fd, num_results=50)
    # pattern = regex_generator(fd)
    # print(pattern)
    # pattern = r"NP: {<NN>}"
    print(nltk.RegexpParser("").evaluate(test_sents))  # empty-grammar baseline
    print('')
    pattern_book = r"NP: {<[CDJNP].*>+}"
    print(nltk.RegexpParser(pattern_book).evaluate(test_sents))
    print('')
    pattern_modified = r"NP: {<(\$)>?<[CDJNP].*>+}"
    print(nltk.RegexpParser(pattern_modified).evaluate(test_sents))
    print('')
    pattern_modified = r"""NP: {<(\$)>?<[CDJNP].*>+}
        {<W(P|DT)>}"""
    print(nltk.RegexpParser(pattern_modified).evaluate(test_sents))

Developer: mikeholler, Project: CSC499-NLP, Lines: 25, Source file: ngram_chunker_exploration.py
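The helpers np_tags_fd, print_frequencies, and regex_generator in Example 1 are defined elsewhere in ngram_chunker_exploration.py and are not shown. A hypothetical reconstruction of the first two, just to make the example's intent concrete (assumption: they tally the POS tags found inside NP chunks):

import nltk

def np_tags_fd(conll_sents):
    # Hypothetical: count POS tags occurring inside NP chunks,
    # given sentences as (word, pos, chunk) IOB triples.
    fd = nltk.FreqDist()
    for sent in conll_sents:
        for word, pos, chunk in sent:
            if chunk in ('B-NP', 'I-NP'):
                fd[pos] += 1
    return fd

def print_frequencies(fd, num_results=50):
    # Hypothetical: print the most frequent POS tags and their counts.
    for pos, count in fd.most_common(num_results):
        print(pos, count)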
Example 2: evaluate

def evaluate():
    text = '''
he PRP B-NP
accepted VBD B-VP
the DT B-NP
position NN I-NP
of IN B-PP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
'''
    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    print(conll2000.chunked_sents('train.txt')[99])
    print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])

Developer: AkiraKane, Project: Python, Lines: 25, Source file: c07_information_extraction.py
Example 3: __init__

def __init__(self):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    ctagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                     for sent in train_sents]
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    self._test_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in test_sents]
    self._tagger = ClassifierBasedTagger(train=ctagged_sents,
                                         feature_detector=npchunk_features)

Developer: pschulam-attic, Project: SCLE, Lines: 7, Source file: chunker.py
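The feature detector npchunk_features passed to ClassifierBasedTagger is defined elsewhere in chunker.py. A minimal sketch in the style of the NLTK book, chapter 7 (an assumption about its shape, not this project's actual code):

def npchunk_features(sentence, i, history):
    # sentence is a list of (word, pos) pairs; i indexes the current token;
    # history holds the chunk tags already assigned to earlier tokens.
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = '<START>', '<START>'
    else:
        prevword, prevpos = sentence[i - 1]
    return {'pos': pos, 'word': word, 'prevpos': prevpos}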
Example 4: get_noun_phrases_and_named_entities_data

def get_noun_phrases_and_named_entities_data(data):
    # print data
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    chunker = BigramChunker.BigramChunker(train_sents + test_sents)

    tagged_data = []
    for sent in data:
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)
        tagged_data.append(tagged)

    noun_phrases = []
    for tagged_sent in tagged_data:
        tree = chunker.parse(tagged_sent)
        noun_phrases += nltk.chunk.tree2conlltags(tree)

    named_entities = []
    for tagged_sent in tagged_data:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    words = []
    cnt = 0
    for sent in data:
        cnt += 1
        tokens = nltk.word_tokenize(sent)
        for token in tokens:
            words.append((token, cnt))
    # print words
    # print noun_phrases
    # print named_entities
    return (words, noun_phrases, named_entities)

Developer: fydlzr, Project: coreference_resolution-1, Lines: 35, Source file: mention_extraction.py
Example 5: chunk_with_unigram_tagger

def chunk_with_unigram_tagger():
    # use a unigram tagger to predict each word's IOB chunk tag from its POS tag
    from nltk.corpus import conll2000
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    unigram_chunker = UnigramChunker(train_sents)
    print(unigram_chunker.evaluate(test_sents))
    postags = sorted(set(pos for sent in train_sents
                         for (word, pos) in sent.leaves()))
    print(unigram_chunker.tagger.tag(postags))

Developer: 447327642, Project: nltk-examples, Lines: 10, Source file: ch07.py
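The UnigramChunker class is not defined in this snippet (it also appears in Example 18). The NLTK book, chapter 7, defines it as below, which is very likely what these projects use: it learns the most frequent chunk tag for each POS tag and reassembles the tagger's output into a chunk tree:

import nltk

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Map each POS tag to its most frequent IOB chunk tag.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        # sentence is a list of (word, pos) pairs.
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)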
Example 6: _load_data

def _load_data():
    try:
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    except Exception:
        if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
            sys.exit(0)
        nltk.download('conll2000')
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    train_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set]
    test_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set]
    return train_data, test_data

Developer: cdj0311, Project: nlp-architect, Lines: 13, Source file: conll2000.py
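The list(zip(*nltk.chunk.tree2conlltags(sent))) idiom above transposes a sentence's (word, tag, chunk) triples into three parallel tuples, presumably the layout the downstream model code expects. A toy illustration:

import nltk

tree = nltk.chunk.conllstr2tree("he PRP B-NP\nran VBD B-VP")
words, tags, chunks = list(zip(*nltk.chunk.tree2conlltags(tree)))
print(words)   # ('he', 'ran')
print(tags)    # ('PRP', 'VBD')
print(chunks)  # ('B-NP', 'B-VP')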
Example 7: main

def main(convert_func=None):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    if convert_func:
        # transform the leaf nodes of each test sentence
        print("convert leaf nodes")
        test_sents = [convert_leaf_node(sent, convert_func)
                      for sent in test_sents]
    print("train...")
    chunker = ConsecutiveNPChunker(train_sents)
    print("evaluate...")
    print(chunker.evaluate(test_sents))

Developer: xiaohan2012, Project: capitalization-restoration-train, Lines: 13, Source file: chunk.py
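ConsecutiveNPChunker appears here and in Examples 10, 11, 15, and 16 (those pass extra constructor arguments, so they are project-specific variants). The name comes from the classifier-based chunker in the NLTK book, chapter 7; a condensed sketch under that assumption, with nltk.NaiveBayesClassifier swapped in for the book's MaxentClassifier to avoid the megam dependency, and reusing the npchunk_features sketched after Example 3:

import nltk

class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        # train_sents are lists of ((word, pos), chunk) pairs.
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # train_sents are chunk trees; convert to ((word, pos), chunk) pairs.
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)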
Example 8: exercise3

def exercise3():
    print("Exercise - 3")
    grammar1 = r"""
        NP: {<DT>?<JJ><NNS>}
            {<CD><NNS>}
    """
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])[:100]
    cp1 = nltk.RegexpParser(grammar1)
    res1 = cp1.evaluate(test_sents)
    print("Statistics for the custom chunker")
    print(res1)
    print()
    cp2 = nltk.RegexpParser("")
    res2 = cp2.evaluate(test_sents)
    print("Statistics for the baseline chunker")
    print(res2)
    print()
    grammar3 = r"""
        NP: {<DT>?<JJ><NNS>}
            {<CD><NNS>}
            {<DT><NN>}
    """
    cp3 = nltk.RegexpParser(grammar3)
    res3 = cp3.evaluate(test_sents)
    print("Statistics for the custom chunker with the added rule {<DT><NN>}")
    print(res3)
    print()

Developer: GirishSrinivas, Project: PythonPrograms, Lines: 30, Source file: GirishSrinivas_ch7.py
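To make the rule syntax concrete: a pattern like {<DT>?<JJ><NNS>} chunks an optional determiner, an adjective, and a plural noun into a single NP. A quick check on a hypothetical tagged sentence:

import nltk

grammar1 = r"""
    NP: {<DT>?<JJ><NNS>}
        {<CD><NNS>}
"""
sent = [("the", "DT"), ("little", "JJ"), ("dogs", "NNS"), ("barked", "VBD")]
print(nltk.RegexpParser(grammar1).parse(sent))
# -> (S (NP the/DT little/JJ dogs/NNS) barked/VBD)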
Example 9: get_noun_phrases_and_named_entities

def get_noun_phrases_and_named_entities(file_name, start_index, end_index):
    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)

Developer: fydlzr, Project: coreference_resolution-1, Lines: 28, Source file: mention_extraction.py
Example 10: __init__

def __init__(self):
    try:
        tagger = pickle.load(open('nerdb_tagger.pkl', 'rb'))
    except IOError:
        print('failed to load nerdb_tagger, recreating...')
        train_sents = conll2000.tagged_sents() + brown.tagged_sents()
        tagger = nltk.DefaultTagger('NN')
        tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
        pickle.dump(tagger, open('nerdb_tagger.pkl', 'wb'))
        print('done')
    try:
        chunker = pickle.load(open('nerdb_chunker.pkl', 'rb'))
    except IOError:
        print('failed to load nerdb_chunker, recreating...')
        train_sents = conll2000.chunked_sents()
        chunker = ConsecutiveNPChunker(tagger, train_sents)
        pickle.dump(chunker, open('nerdb_chunker.pkl', 'wb'))
        print('done')
    self.chunker = chunker
    self.people = [line.strip().split(" ", 1) for line in open('actors_index.txt').readlines()]
    self.people += [line.strip().split(" ", 1) for line in open('actresses_index.txt').readlines()]
    self.movies = [line.strip().split(" ", 1) for line in open('title_index.txt').readlines()]
    self.entity_types = {'PERSON': self.people, 'MOVIE': self.movies}

Developer: jamt, Project: IMDBot, Lines: 25, Source file: NERDb.py
Example 11: __init__

def __init__(self):
    try:
        tagger = pickle.load(open("nerdb_tagger.pkl", "rb"))
    except IOError:
        print("failed to load nerdb_tagger, recreating...")
        train_sents = conll2000.tagged_sents() + brown.tagged_sents()
        tagger = nltk.DefaultTagger("NN")
        tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
        tagger = nltk.BigramTagger(train_sents, backoff=tagger)
        tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
        pickle.dump(tagger, open("nerdb_tagger.pkl", "wb"))
        print("done")
    try:
        chunker = pickle.load(open("nerdb_chunker.pkl", "rb"))
    except IOError:
        print("failed to load nerdb_chunker, recreating...")
        train_sents = conll2000.chunked_sents()
        chunker = ConsecutiveNPChunker(tagger, train_sents)
        pickle.dump(chunker, open("nerdb_chunker.pkl", "wb"))
        print("done")
    self.chunker = chunker
    self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
    self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
    self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
    self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
    self.numbers = eval(open("numbers.txt").read())

Developer: gabsl, Project: IMDBot, Lines: 26, Source file: NERDb.py
Example 12: simple_np_bgram

def simple_np_bgram(documents):
    bgram = BigramChunker(conll2000.chunked_sents('train.txt'))
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(bgram.parse(sent))
        yield buf

Developer: juchiyama, Project: bigdata_fall2015, Lines: 7, Source file: chunk.py
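BigramChunker (also used in Example 4) follows the same pattern as the UnigramChunker sketched after Example 5, with an nltk.BigramTagger so the predicted chunk tag can condition on the previous tag as well. A sketch per the NLTK book; the unigram backoff is an addition of this sketch, not part of the book's version:

import nltk

class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        # Back off to a unigram model for unseen tag bigrams (assumption).
        backoff = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=backoff)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)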
Example 13: evaluate_chunker

def evaluate_chunker():
    from nltk.corpus import conll2000
    cp = nltk.RegexpParser("")  # empty-grammar baseline
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    print(cp.evaluate(test_sents))
    grammar = r"NP: {<[CDJNP].*>+}"
    cp1 = nltk.RegexpParser(grammar)  # naive rule: chunk any run of tags starting with C, D, J, N, or P
    print(cp1.evaluate(test_sents))

Developer: 447327642, Project: nltk-examples, Lines: 8, Source file: ch07.py
Example 14: _build_training_sents

def _build_training_sents(self):
    # Randomly selects a corpus from the provided list, then builds and
    # returns the training sentences the chunkers will use.
    # (The random selection is currently commented out.)
    corpuses = [(conll2000, 'train.txt'), (conll2002, 'esp.train')]
    # trainer = random.choice(corpuses)
    # train_sents = trainer[0].chunked_sents(trainer[1], chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    return train_sents

Developer: donama, Project: phraseengine, Lines: 8, Source file: __init__.py
Example 15: train_chunker

def train_chunker(filesDir):
    # Create chunked sentences in the CoNLL format.
    # ('train_locations.txt' is presumably a custom file this project adds to
    # the conll2000 corpus directory; it is not part of the stock corpus.)
    train_sents = conll2000.chunked_sents('train_locations.txt', chunk_types=['Loc'])
    # Train the chunker with the NaiveBayesClassifier.
    chunker = ConsecutiveNPChunker(train_sents, combine_features, nltk.NaiveBayesClassifier)
    return chunker

Developer: chatbotimporved, Project: chatbot, Lines: 8, Source file: read_emails.py
Example 16: __init__

def __init__(self):
    try:
        self.unigram_chunker = pickle.load(open('chunker.pkl', 'rb'))
    except (EOFError, IOError):
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        self.unigram_chunker = ConsecutiveNPChunker(train_sents)
        with open('chunker.pkl', 'wb') as f:
            pickle.dump(self.unigram_chunker, f, -1)

Developer: sergeyk, Project: csrec, Lines: 8, Source file: nltk_chunker.py
Example 17: drawParse

def drawParse(text):
    sentences = posTagging(text)
    # test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    chunker = ChunkParser(train_sents)
    for s in sentences:
        chunker.parse(s).draw()

Developer: 5agado, Project: conversation-analyzer, Lines: 9, Source file: nlp.py
Example 18: simple_np_ugram

def simple_np_ugram(documents):
    ugram = UnigramChunker(conll2000.chunked_sents('train.txt'))
    # String sentences get split up into a data structure.
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(ugram.parse(sent))
        yield buf

Developer: juchiyama, Project: bigdata_fall2015, Lines: 9, Source file: chunk.py
Example 19: simpleEvaluation

def simpleEvaluation():
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(cp.evaluate(test_sents))
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    print(cp.evaluate(test_sents))

Developer: AkiraKane, Project: Python, Lines: 9, Source file: c07_information_extraction.py
Example 20: __init__

def __init__(self, POS):
    '''
    @param POS: the POS tagger passed through by the caller
    '''
    train_sents = conll2000.chunked_sents()
    train_data = [[(t, c) for w, t, c in tree2conlltags(sent)]
                  for sent in train_sents]
    self.T = nltk.TrigramTagger(train_data)
    self.Tagger = POS
    self.tmp = []

Developer: danjamker, Project: N-Fly, Lines: 10, Source file: Chunker.py
Note: The nltk.corpus.conll2000.chunked_sents examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The snippets are selected from open-source projects contributed by their respective developers; copyright of the source code belongs to the original authors. For redistribution and use, please refer to the corresponding project's license. Do not repost without permission.