This article collects typical usage examples of the Python function nltk.batch_ne_chunk. If you are wondering what nltk.batch_ne_chunk does, how to call it, or what real-world usage looks like, the curated examples below should help.
The following 20 code examples of batch_ne_chunk are listed, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
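Before the individual examples, here is a minimal sketch of the pipeline that nearly all of them share: split the text into sentences, tokenize each sentence, POS-tag it, and pass the whole list of tagged sentences to batch_ne_chunk, which returns one chunk tree per sentence. The snippet is illustrative only and assumes NLTK 2.x; in NLTK 3 the function was renamed to nltk.ne_chunk_sents (see Examples 17 and 20 below).

import nltk

text = "Barack Obama visited the White House in Washington."
sentences = nltk.sent_tokenize(text)
tokenized = [nltk.word_tokenize(s) for s in sentences]
tagged = [nltk.pos_tag(s) for s in tokenized]
# batch_ne_chunk takes a list of tagged sentences and yields one chunk tree per sentence
for tree in nltk.batch_ne_chunk(tagged, binary=False):
    print tree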
Example 1: __init__
def __init__(self, query_string):
    self.query_string = query_string
    sentences = nltk.sent_tokenize(query_string)
    self.tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    self.tagged_sentences = [nltk.pos_tag(sentence) for sentence in self.tokenized_sentences]
    self.binary_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=True)
    self.multiclass_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=False)
    self.temporal_sentences = timex.ground(timex.tag(query_string), mx.DateTime.gmt())
Developer ID: summera, Project: python-natural-language-search, Lines of code: 8, Source file: text_search.py
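Example 1 runs batch_ne_chunk twice on the same tagged sentences, once with binary=True and once with binary=False. The sketch below is illustrative only (it is not part of the original project) and assumes tagged_sentences is a list of POS-tagged sentences like the one Example 1 builds; under binary chunking every recognized entity subtree is labelled 'NE', while multiclass chunking assigns class labels such as PERSON, ORGANIZATION, or GPE.

# Illustrative comparison of the two chunking modes (NLTK 2.x Tree attribute names)
for tree in nltk.batch_ne_chunk(tagged_sentences, binary=True):
    for subtree in tree.subtrees():
        if subtree.node == 'NE':
            print 'NE:', ' '.join(word for word, pos in subtree.leaves())

for tree in nltk.batch_ne_chunk(tagged_sentences, binary=False):
    for subtree in tree.subtrees():
        if subtree.node in ('PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY'):
            print subtree.node, ':', ' '.join(word for word, pos in subtree.leaves())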
Example 2: extract_entities
def extract_entities(shorttext_rows, site):
    # { short text id -> (noun entities, named entities) }
    shorttext_entities = {}
    # nltk entity classes
    nltk_entity_types = __get_nltk_entity_types__()
    for shorttext_row in shorttext_rows:
        shorttext_id = shorttext_row[0]
        shorttext_str = shorttext_row[1]
        noun_entities = []
        named_entities = []
        sentences = nltk.sent_tokenize(shorttext_str)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
        for tree in chunked_sentences:
            __extract_valid_entities__(tree, (noun_entities, named_entities), nltk_entity_types)
        shorttext_entities[shorttext_id] = (noun_entities, named_entities)
    # Cache extracted entities
    pkl_util.write_pickle(__output_str__, shorttext_entities, __get_nltk_entities_cache_path__(site))
Developer ID: Big-Data, Project: reslve, Lines of code: 27, Source file: nltk_extraction_dataset_mgr.py
Example 3: _nltk_ner
def _nltk_ner(self, text, searched_entity, question):
    # Entity classification
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
    tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]
    ne_chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
    # Entity extraction
    entities = []
    all_entities = []
    for tree in ne_chunked_sentences:
        for child in tree:
            if isinstance(child, Tree):
                entity = " ".join([word for (word, pos) in child.leaves()])
                if child.node == searched_entity:
                    entities.append(entity)
                all_entities.append(entity)
    if 'OTHER' == searched_entity:
        entities += self._other_recognition(tagged_sentences, all_entities, question)
    if 'NUMBER' == searched_entity:
        entities += self._number_recognition(text, tagged_sentences, all_entities)
    return entities
Developer ID: danigarabato, Project: qa, Lines of code: 25, Source file: answer.py
Example 4: obtenerNEs
def obtenerNEs(lista):
    listaGeneral = []
    for (tweet, listaPalabras, clasificacion, diferenciaProbabilidad) in lista:
        # Only evaluate the tweets classified as positive
        print clasificacion
        if clasificacion == 'positive':
            sentences = nltk.tokenize.sent_tokenize(tweet)
            # Split instead of tokenizing so that user mentions can be filtered out;
            # word_tokenize separates the @, which prevents the filtering
            nuevaSentences = []
            for s in sentences:
                subLista = quitarExcedenteSimple(s.split())
                nuevaSentences.append(' '.join(subLista))
            tokens = [nltk.tokenize.word_tokenize(s) for s in nuevaSentences]
            pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
            ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens, binary=True)
            listaNEs = []
            for subArbol in ne_chunks:
                traverse(subArbol, listaNEs, False)
            if listaNEs:
                listaGeneral.append((tweet, listaPalabras, listaNEs))
    web.debug('Tweets con NEs:' + str(len(listaGeneral)))
    return listaGeneral
Developer ID: JavierOgg, Project: proyectoFinal, Lines of code: 30, Source file: funciones.py
Example 5: process_entities
def process_entities(sentence):
    words = []
    #print sentence
    # Break the sentence into tokens
    tokens = nltk.word_tokenize(sentence)
    #print tokens
    # A bit of POS tagging
    pos_tagged_tokens = [nltk.pos_tag(tokens)]
    # Chunk extraction time
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')
    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]
    # Entity extraction
    # Code from Mining the Social Web: https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/blogs_and_nlp__extract_entities.py
    post = {}
    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []
    #print pos_tagged_tokens
    for (token, pos) in pos_tagged_tokens:
        if pos == previous_pos and pos.startswith('NN'):
            current_entity_chunk.append(token)
        elif pos.startswith('NN'):
            if current_entity_chunk != []:
                # Note that current_entity_chunk could be a duplicate when appended,
                # so frequency analysis again becomes a consideration
                all_entity_chunks.append((' '.join(current_entity_chunk), pos))
            current_entity_chunk = [token]
        previous_pos = pos
    # Store the chunks as an index for the document
    # and account for frequency while we're at it...
    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1
    # For example, we could display just the title-cased entities
    proper_nouns = []
    for (entity, pos) in post['entities']:
        if entity.istitle():
            proper_nouns.append(entity)
            #print '\t%s (%s)' % (entity, post['entities'][(entity, pos)])
            #print entity
    return proper_nouns
Developer ID: carriercomm, Project: scraperwiki-scraper-vault, Lines of code: 60, Source file: entities_speedcamera.py
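Example 5 ignores the chunk trees it computes and instead groups consecutive NN-tagged tokens into candidate entities, keeping only the title-cased ones. A hypothetical call follows (the sentence and expected output are illustrative, not from the original scraper):

# Hypothetical usage; the exact chunks depend on the POS tagger's decisions
nouns = process_entities("The European Commission fined Google in Brussels.")
print nouns
# likely ['European Commission', 'Google']; note that a trailing noun chunk
# such as 'Brussels' is not flushed, because a chunk is only appended when
# the next noun chunk begins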
Example 6: nlp_extract_tags
def nlp_extract_tags(text, lang=None):
    """
    Return a list of tags extracted from provided text.
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []
        if hasattr(t, "node") and t.node:
            if t.node == "NE":
                entity_names.append(" ".join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    result = {"tags": list(set(entity_names))}
    return jsonp({"status": "ok", "result": result})
Developer ID: rautarchana9, Project: hascore, Lines of code: 29, Source file: nlp.py
Example 7: get_named_entities
def get_named_entities(self, text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]  # takes 3ish seconds
    nes = nltk.batch_ne_chunk(sentences, binary=False)  # takes 2ish seconds
    named_entities = {}
    stop_names = ['Mr.']
    # Loop through the tagged sentences, looking for named entities, and put their "leaves" together
    # e.g. "White" + " " + "House"
    for i in nes:
        for j in i:
            if re.search('PERSON|ORGANIZATION|LOCATION|GPE|FACILITY', str(j)):
                name = ' '.join(c[0] for c in j.leaves())
                # Attempt to merge people's names if you've seen them before,
                # e.g. Ms. Clinton gets merged into Hillary Clinton
                if not (name in stop_names):
                    regex = re.compile(r'^' + name.split(' ')[-1] + '|\s' + name.split(' ')[-1] + '$')
                    regex_match = filter(regex.search, named_entities.keys())
                    if (name in named_entities):
                        named_entities[name] += 1
                    elif (len(regex_match) > 0 and re.search('PERSON', str(j)) != None):
                        named_entities[regex_match[0]] += 1
                    else:
                        named_entities[name] = 1
    # Sort named entities by count and take the first 8
    sorted_names = sorted(named_entities.iteritems(), key=operator.itemgetter(1), reverse=True)
    names = []
    for name in sorted_names[:8]:
        names.append(name[0].lower())
    return names
Developer ID: visbe, Project: long-view, Lines of code: 34, Source file: keyword_getter.py
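The merging step in Example 7 relies on a regular expression that matches the last token of a new name against names that were already seen. Below is a hypothetical walk-through (not from the original project) of how a bare "Clinton" would be folded into an existing "Hillary Clinton" count:

import re

# Hypothetical trace of the merge logic in Example 7
named_entities = {'Hillary Clinton': 2}
name = 'Clinton'
regex = re.compile(r'^' + name.split(' ')[-1] + '|\s' + name.split(' ')[-1] + '$')
regex_match = filter(regex.search, named_entities.keys())
# regex_match == ['Hillary Clinton'], so the existing entry is incremented
# instead of creating a separate 'Clinton' key
named_entities[regex_match[0]] += 1
print named_entities   # {'Hillary Clinton': 3}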
Example 8: extractchunk
def extractchunk(tweettuple):
    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(w))) for (a, w) in tweettuple]
    cid = [str(a) for (a, w) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return dict(zip(cid, ne_chunks))
Developer ID: hkilter, Project: bullwhip_effect, Lines of code: 7, Source file: iterate_couchdb__extract_timelinecomparisons.py
Example 9: extract_entities
def extract_entities(sample):
    print 'extracting entities'
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    # Create a map entity -> count, where count is the number of occurrences of an entity
    entity_count = {}
    for entity in entity_names:
        if entity in entity_count:
            entity_count[entity] += 1
        else:
            entity_count[entity] = 1
    sorted_occurences = sorted(entity_count.iteritems(), reverse=True, key=operator.itemgetter(1))
    #return OrderedDict(entity_count)
    # Print unique entity names
    #print set(entity_names)
    return sorted_occurences
Developer ID: ebegoli, Project: Agatha, Lines of code: 27, Source file: agatha.py
Example 10: extract_chunked_sentences
def extract_chunked_sentences(raw):
    """
    Tokenize raw text, POS-tag it, and return the binary NE-chunked sentence trees.
    """
    sentences = nltk.sent_tokenize(raw)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    return chunked_sentences
Developer ID: ebegoli, Project: AffectiveNLP, Lines of code: 8, Source file: recognizer.py
Example 11: extractNamedEntities
def extractNamedEntities(sentences):
    tok_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tag_sentences = [nltk.pos_tag(sentence) for sentence in tok_sentences]
    cnk_sentences = nltk.batch_ne_chunk(tag_sentences, binary=True)
    all_named_entities = []
    for tree in cnk_sentences:
        named_entities = extractNamedEntitiesFromChunkSentence(tree)
        all_named_entities.extend(named_entities)
    return list(set(all_named_entities))
Developer ID: Kevinwenya, Project: textmining-3, Lines of code: 9, Source file: simple-nltk-webservice.py
Example 12: extractchunk
def extractchunk(tweettuple):
    # Break each tweet into sentences and words,
    # then run the standard nltk POS tagger and chunker
    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(c))) for (a, w, c) in tweettuple]
    cid = [str(a) for (a, w, c) in tweettuple]
    tnum = [w for (a, w, c) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return zip(cid, tnum, ne_chunks)
Developer ID: hkilter, Project: bullwhip_effect, Lines of code: 11, Source file: couchdb__extract_searchcomparisons.py
Example 13: get_entities
def get_entities(sentences):
    #sentences = nltk.sent_tokenize(doc)  # some nltk preprocessing: tokenize, tag, chunk, NER
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entities = []
    for t in chunked_sentences:
        entities.append(entitify(t))
    return entities
Developer ID: W4ngatang, Project: DocumentSummarizer, Lines of code: 11, Source file: build.py
Example 14: get_entities3
def get_entities3(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return filter_entities(entity_names)
Developer ID: bstewartny, Project: Political-News, Lines of code: 11, Source file: feeds.py
Example 15: gen_ners
def gen_ners(self, sample):
    """ Returns the NERs found in the given sample as a list """
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(self._extract_entity_names(tree))
    unique_ners = list(set(entity_names))
    return unique_ners
Developer ID: digitaltracer, Project: info-beanstalk, Lines of code: 11, Source file: consumer_threads.py
Example 16: get_entities
def get_entities(text):
    '''
    Extracts named entities from the supplied text.
    Returns a list of entity names.
    '''
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
Developer ID: erochest, Project: Trove-newspapers, Lines of code: 13, Source file: clean.py
Example 17: char_recognition
def char_recognition(self, char_number=20):
    tagged_sentences = nltk.pos_tag_sents(self.tokenized_sentences)
    self.entities = []
    entity_names = []
    if nltk.__version__[0] == '3':
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names3(tree))
    else:
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=False)
        for tree in chunked_sentences:
            entity_names.extend(extract_entity_names(tree))
    count = Counter([name for name in entity_names])
    for c in count.most_common(char_number):
        self.entities.append(c[0])
Developer ID: nbilenko, Project: narrative_explorer, Lines of code: 15, Source file: content.py
Example 18: get_named_entities
def get_named_entities(paragraph, ent_type):
    sentences = nltk.sent_tokenize(paragraph)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree, ent_type))
    keywords = ''
    for ent in entity_names:
        keywords = keywords + ' ' + ent
    return keywords
Developer ID: valibanu, Project: image-from-text, Lines of code: 15, Source file: ner.py
Example 19: analyzeText
def analyzeText(self, question, answers):
    # Does tokenization, POS tagging, and chunking, and returns all three
    # for answer in answers:
    answers = ''.join(answers)
    #print answer
    # sentence tokenizer
    sentences = sent_tokenize(answers)
    # word tokenizer
    tokens = [word_tokenize(sentence) for sentence in sentences]
    # pos tagger
    postags = [pos_tag(token) for token in tokens]
    # chunking
    chunks = batch_ne_chunk(postags, binary=True)
    TAObj = TextAnalyticsObj(question, answers, sentences, tokens, postags, chunks)
    return TAObj
Developer ID: nischalhp, Project: Feedlyze, Lines of code: 15, Source file: textAnalytics.py
Example 20: chunkSentences
def chunkSentences(text):
    """
    Parses text into parts of speech tagged with parts of speech labels.
    Used for reference: https://gist.github.com/onyxfish/322906
    """
    sentences = nltk.sent_tokenize(text)
    tokenizedSentences = [nltk.word_tokenize(sentence)
                          for sentence in sentences]
    taggedSentences = [nltk.pos_tag(sentence)
                       for sentence in tokenizedSentences]
    if nltk.__version__[0:2] == "2.":
        chunkedSentences = nltk.batch_ne_chunk(taggedSentences, binary=True)
    else:
        chunkedSentences = nltk.ne_chunk_sents(taggedSentences, binary=True)
    return chunkedSentences
Developer ID: emdaniels, Project: character-extraction, Lines of code: 16, Source file: characterExtraction.py
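Examples 17 and 20 both branch on nltk.__version__ because batch_ne_chunk was renamed to ne_chunk_sents in NLTK 3. As a sketch of how the trees returned by chunkSentences could be consumed, the hypothetical helper below (not part of the character-extraction project, and assuming NLTK 2.x, where chunk subtrees expose their label via the .node attribute) collects the entity strings produced by binary chunking:

# Hypothetical consumer of chunkSentences(); assumes NLTK 2.x Tree objects
def collect_binary_entities(chunked_sentences):
    names = []
    for tree in chunked_sentences:
        for subtree in tree.subtrees():
            if subtree.node == 'NE':
                names.append(' '.join(word for word, pos in subtree.leaves()))
    return names

print collect_binary_entities(chunkSentences("Emily Bronte wrote Wuthering Heights."))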
Note: the nltk.batch_ne_chunk examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright in the code remains with the original authors. Before redistributing or reusing any snippet, please refer to the license of the corresponding project; do not republish without permission.