This article collects typical usage examples of Python's nltk.ne_chunk_sents function. If you are unsure what ne_chunk_sents does, how to call it, or what it looks like in practice, the curated examples below should help.
The following 20 code examples of ne_chunk_sents are presented, ordered by popularity by default.
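Before the examples, here is a minimal end-to-end sketch of the pipeline nearly all of them follow (sentence tokenization → word tokenization → POS tagging → NE chunking). It assumes the standard NLTK data packages (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words) have already been fetched with nltk.download(); the sample text is ours, not from any of the projects below:

import nltk

text = "Barack Obama was born in Hawaii."
sentences = nltk.sent_tokenize(text)                    # split the text into sentences
tokenized = [nltk.word_tokenize(s) for s in sentences]  # split each sentence into words
tagged = [nltk.pos_tag(s) for s in tokenized]           # part-of-speech tag each sentence
# ne_chunk_sents lazily yields one nltk.Tree per sentence; with binary=True
# every named entity is labelled 'NE' instead of PERSON/ORGANIZATION/GPE/...
for tree in nltk.ne_chunk_sents(tagged, binary=True):
    for subtree in tree.subtrees(lambda t: t.label() == 'NE'):
        print(' '.join(token for token, tag in subtree.leaves()))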
Example 1: parts_of_speech
def parts_of_speech(self, corpus):
    "returns named entity chunks in a given text"
    sentences = nltk.sent_tokenize(corpus)  # use the tokenizer for Spanish
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
    pos_tags = [nltk.pos_tag(sentence) for sentence in tokenized]
    chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
    return chunked_sents
Author: IIC2113-Grupo3-2015, Project: Procesador-de-Textos, Lines: 7, Source file: GeneradorRelaciones.py
Example 2: chunkIntoEntities
def chunkIntoEntities(text):
    entities = []
    sentences = sentenceTokenization(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []
        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
        return entity_names

    for idx, tree in enumerate(chunked_sentences):
        entity_names = extract_entity_names(tree)
        entities.extend(entity_names)
    chunked_content = splitContentbyDelimiter(text, entities)
    return [chunked_content, entities]
Author: dxr1988, Project: NLTK-Research, Lines: 25, Source file: nltk_helper.py
Example 3: getEntities
def getEntities(filename):
    with open(filename, 'r') as f:
        sample = f.read()
    sample = sample.decode('unicode_escape').encode('ascii', 'ignore')
    print "sentence tokenize..."
    sentences = nltk.sent_tokenize(sample)
    print len(sentences)
    # keep only the first thirtieth of the sentences
    sentences = sentences[:len(sentences) / 30]
    print len(sentences)
    print "word tokenize..."
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    print "POS tagging..."
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    print "Chunking..."
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    print "getting entities..."
    # ne_chunk_sents returns a lazy iterator, so count the tagged sentences instead
    print "total sentences = ", len(tagged_sentences)
    for i, tree in enumerate(chunked_sentences):
        if i % 100 == 0:
            print "on sentence", i
        entity_names.extend(extract_entity_names(tree))
    uniques = list(set(entity_names))
    # only return named entities that are 2 words or more
    output = [u for u in uniques if len(u.split(" ")) >= 2]
    return output
Author: wellesleynlp, Project: meganokeefe-finalproject, Lines: 25, Source file: entities.py
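This example (like Examples 6, 9, 13, 15 and 19 below) relies on an extract_entity_names helper defined elsewhere in its project. A plausible reconstruction, mirroring the inline versions in Examples 2 and 4, would be:

def extract_entity_names(t):
    # recursively collect the joined tokens under every 'NE' subtree
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names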
Example 4: extract_entity_names
def extract_entity_names(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def entity_names(t):
        names = []
        if hasattr(t, 'label') and t.label:
            if t.label() == 'NE':
                names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    names.extend(entity_names(child))
        return names

    names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print extract_entity_names(tree)
        names.extend(entity_names(tree))
    return set(names)
Author: michal3141, Project: geomedia, Lines: 26, Source file: ner_extract.py
Example 5: nltk_extract_ner
def nltk_extract_ner(text):
    """
    Use NLTK's named-entity chunker.
    :param text: input text
    :return: dict mapping each entity type to the list of extracted NEs
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
    d = defaultdict(list)

    def extract_entity_names(t):
        entity_names = []
        if hasattr(t, 'label') and t.label:
            # if the subtree is a recognized NE, file it under the key of its type
            if t.label() in ne_types:
                d[t.label()].append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
        return entity_names

    for tree in chunked_sentences:
        # collect results per sentence
        extract_entity_names(tree)
    # return all entity names, grouped by type
    return d
Author: bfurlan, Project: IE4MAS, Lines: 33, Source file: nltk_ner_extractor.py
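The ne_types collection referenced in Example 5 is defined elsewhere in its source project. Since binary=False makes NLTK emit typed labels, a plausible definition (an assumption, not taken from the original file) is the set of standard NLTK/ACE entity types:

# hypothetical reconstruction of the project's ne_types constant
ne_types = {'PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY',
            'DATE', 'TIME', 'MONEY', 'PERCENT'}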
Example 6: get_entities
def get_entities(story):
    entities = {}
    # Wrong earlier approach: before nltk.pos_tag(), the story must first be
    # split into sentences with nltk.sent_tokenize() and each sentence split
    # into tokens with nltk.word_tokenize(); tagging the raw token stream
    # loses sentence boundaries:
    #   storytokens = tokenizer(story)  # removes "'", ',' and '.'
    #   pos_words = nltk.pos_tag(storytokens)
    sentences = nltk.sent_tokenize(story)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    # tags 'Boy' and 'Scout' as 'NNP' individually
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # chunks 'Boy Scout' as a single 'NE' (entity)
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_in_sentences = []
    for tree in chunked_sentences:
        # extract_entity_names(tree) finds the entities in each chunked sentence
        entity_in_sentences.extend(extract_entity_names(tree))
    # drop entities repeated across the chunked sentences
    entities_unique = set(entity_in_sentences)
    # build the entities dict, mapping each entity to an index
    for i, entity in enumerate(entities_unique):
        entities[entity] = i
    return entities
Author: YuzhouWang, Project: 657-Project, Lines: 32, Source file: preprocess_data.py
Example 7: extractKeywords
def extractKeywords(data):
    array = []
    logging.warning('NLTK processing starts:')
    logging.warning(data)
    for sample in data:
        sentences = nltk.sent_tokenize(sample)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

        def extract_entity_names(t):
            entity_names = []
            if hasattr(t, 'label') and t.label:
                if t.label() == 'NE':
                    entity_names.append(' '.join([child[0].lower() for child in t]))
                else:
                    for child in t:
                        entity_names.extend(extract_entity_names(child))
            return entity_names

        entity_names = []
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
        for item in entity_names:
            if item not in stops:
                array.append(item)
    logging.warning('NLTK processing finished:')
    logging.warning(array)
    return array
Author: KseniiaBelorustceva, Project: text-analyser, Lines: 30, Source file: app.py
Example 8: extract_named_entities
def extract_named_entities(text_blocks):
    """
    Return the set of named entities extracted from the provided text blocks
    (a list of text strings).
    """
    sentences = []
    for text in text_blocks:
        sentences.extend(nltk.sent_tokenize(text))
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []
        if hasattr(t, 'label'):
            if t.label() == 'NE':
                entity_names.append(' '.join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return set(entity_names)
Author: hasgeek, Project: coaster, Lines: 29, Source file: nlp.py
Example 9: get_top_NEs
def get_top_NEs(tagged_sentences, n=TOP_NERs):
    """Return the n longest named entities of a text."""
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return sorted(entity_names, key=len, reverse=True)[:n]
Author: pan-webis-de, Project: maluleka16, Lines: 8, Source file: source-retrieval.py
Example 10: chunk_sentences
def chunk_sentences(sentences):
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    chunked_sentences = nltk.ne_chunk_sents(sentences, binary=True)
    return chunked_sentences
Author: Jwpe, Project: entity-extractor, Lines: 8, Source file: extract_named_entities.py
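A quick usage sketch for Example 10 (the input string is hypothetical): ne_chunk_sents yields one nltk.Tree per input sentence, so the returned value must be iterated.

for tree in chunk_sentences(["Angela Merkel met Emmanuel Macron in Berlin."]):
    # each tree is a parse tree whose named entities sit under 'NE' subtrees
    print(tree)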
Example 11: ie_process
def ie_process(document):
    "returns named entity chunks in a given text"
    sentences = nltk.sent_tokenize(document)
    # strip punctuation before tagging (str.translate expects a translation table)
    table = str.maketrans('', '', string.punctuation)
    tokenized = [nltk.word_tokenize(sentence.translate(table)) for sentence in sentences]
    pos_tags = [nltk.pos_tag(sentence) for sentence in tokenized]
    #print(pos_tags)
    chunked_sents = nltk.ne_chunk_sents(pos_tags, binary=True)
    return chunked_sents
Author: vipmunot, Project: Sentiment-Analysis, Lines: 8, Source file: NLP+processing+and+Named+Entity+_+Relationship+Extraction.py
Example 12: extract_person_names
def extract_person_names(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [pos_tagger.tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences)
    return set(_flat_map(extract_person_names_from_tree(tree)
                         for tree in chunked_sentences))
Author: csojinb, Project: name-extractor-api, Lines: 8, Source file: name_extractor.py
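pos_tagger, _flat_map and extract_person_names_from_tree in Example 12 are project helpers not shown here. Because ne_chunk_sents is called without binary=True, subtrees carry typed labels such as 'PERSON', so a plausible reconstruction of the two missing helpers (an assumption, not the project's actual code) is:

import itertools

def extract_person_names_from_tree(tree):
    # join the tokens under every subtree labelled 'PERSON'
    return [' '.join(token for token, tag in subtree.leaves())
            for subtree in tree.subtrees(lambda t: t.label() == 'PERSON')]

def _flat_map(iterable_of_lists):
    # flatten an iterable of lists into a single iterable
    return itertools.chain.from_iterable(iterable_of_lists)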
Example 13: extract_named_entities
def extract_named_entities(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return list(set(entity_names))
Author: dibaunaumh, Project: fcs-skateboard, Lines: 9, Source file: extract_article_concepts.py
Example 14: chunked_sentences
def chunked_sentences(text):
    """Splits a large string into chunked sentences
    [http://www.nltk.org/book/ch07.html#chunking]
    """
    import nltk
    sentences = split_sentences(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    return chunked_sentences
Author: makalaaneesh, Project: newspaper, Lines: 9, Source file: nlp.py
Example 15: name_rec1
def name_rec1(sample):
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
Author: Sapphirine, Project: Data-Analytics-of-Video-Popularity, Lines: 9, Source file: NE.py
Example 16: analyse_hansard_file
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)
    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Stop breaking sentences at "No."
    sent_tokenizer._params.abbrev_types.add('no')
    #sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)
    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)
    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k: len(v) for k, v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)
    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc.)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
Author: hsenot, Project: parliament_of_australia, Lines: 55, Source file: utils.py
Example 17: get_ner_nltk
def get_ner_nltk(self, text):
    sents = nltk.sent_tokenize(text)  # sentences
    tokenized_sents = [nltk.word_tokenize(s) for s in sents]
    tagged_sents = [nltk.pos_tag(s) for s in tokenized_sents]
    chunked_sents = list(nltk.ne_chunk_sents(tagged_sents))
    raw = self.traverseTree(chunked_sents)
    ners = {}
    for n in self.entity_cols:
        ners[n] = []
    for k, v in raw:
        ners[k].append(v.lower())
    for n in self.entity_cols:
        ners[n] = list(set(ners[n]))
    return ners
Author: Marsan-Ma, Project: tnative, Lines: 11, Source file: ner.py
Example 18: nominated_entities
def nominated_entities(self):
    sentences = nltk.sent_tokenize(self.article)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for chunked_sentence in chunked_sentences:
        entity_names.extend(self._extract_entity_names(chunked_sentence))
    return list(set(entity_names))
Author: annanda, Project: nltk, Lines: 12, Source file: NEExtractor.py
Example 19: get_entities3
def get_entities3(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    # batch_ne_chunk was renamed ne_chunk_sents in NLTK 3:
    #chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return filter_entities(entity_names)
Author: bstewartny, Project: pnews, Lines: 12, Source file: feeds.py
Example 20: initialize
def initialize(self, sample):
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        # Print results per sentence
        # print _extract_entity_names(tree)
        entity_names.extend(self._extract_entity_names(tree))
    return entity_names
Author: yewwah, Project: recruit, Lines: 12, Source file: ner.py
Note: The nltk.ne_chunk_sents examples in this article were collected by 纯净天空 from open-source projects hosted on GitHub, MSDocs and other code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with its original authors. Refer to each project's license before distributing or reusing the code; do not reproduce without permission.