Python nltk.batch_ne_chunk Function Code Examples


This article collects typical usage examples of Python's nltk.batch_ne_chunk function. If you are unsure what batch_ne_chunk is for, how to call it, or what real code that uses it looks like, the hand-picked examples below should help.



Below are 20 code examples of the batch_ne_chunk function, sorted by popularity by default.
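Before the collected examples, here is a minimal sketch of the pipeline that nearly all of them share: split the text into sentences, tokenize, POS-tag, and hand the whole list of tagged sentences to batch_ne_chunk. This is only an illustrative sketch assuming NLTK 2.x with its default tokenizer, tagger, and chunker models installed; the sample text is made up. (In NLTK 3 the function was renamed to ne_chunk_sents, as Examples 17 and 20 below show.)

import nltk

text = "The White House said Barack Obama will visit Berlin."   # made-up sample text
sentences = nltk.sent_tokenize(text)                               # sentence splitting
tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]   # word tokenization
tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]  # POS tagging
# batch_ne_chunk runs the named-entity chunker over the whole list of tagged sentences
chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
for tree in chunked_sentences:
    print(tree)  # each element is an nltk.Tree whose "NE" subtrees are the entities

With binary=True the chunker marks every entity with a single generic "NE" label; with binary=False (the default) it assigns typed labels such as PERSON, ORGANIZATION, and GPE, which is what Examples 3 and 7 rely on.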

Example 1: __init__

 def __init__(self, query_string):
     self.query_string = query_string
     sentences = nltk.sent_tokenize(query_string)
     self.tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     self.tagged_sentences = [nltk.pos_tag(sentence) for sentence in self.tokenized_sentences]
     self.binary_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=True)
     self.multiclass_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=False)
     self.temporal_sentences = timex.ground(timex.tag(query_string), mx.DateTime.gmt())
Developer: summera | Project: python-natural-language-search | Lines: 8 | Source: text_search.py


Example 2: extract_entities

def extract_entities(shorttext_rows, site):

    # { short text id -> (noun entities, named entities) }
    shorttext_entities = {}
    
    # nltk entity classes
    nltk_entity_types = __get_nltk_entity_types__()
    
    for shorttext_row in shorttext_rows:
        
        shorttext_id = shorttext_row[0]
        shorttext_str = shorttext_row[1]
        
        noun_entities = []
        named_entities = []
        
        sentences = nltk.sent_tokenize(shorttext_str)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
        for tree in chunked_sentences:
            __extract_valid_entities__(tree, (noun_entities, named_entities), nltk_entity_types)    
            
        shorttext_entities[shorttext_id] = (noun_entities, named_entities)
        
    # Cache extracted entities
    pkl_util.write_pickle(__output_str__, shorttext_entities, __get_nltk_entities_cache_path__(site))
Developer: Big-Data | Project: reslve | Lines: 27 | Source: nltk_extraction_dataset_mgr.py


Example 3: _nltk_ner

    def _nltk_ner(self, text, searched_entity, question):
        # Entity Classification
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
        tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]
        ne_chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)

        # Entity Extraction
        entities = []
        all_entities = []
        for tree in ne_chunked_sentences:
            for child in tree:
                if isinstance(child, Tree):
                    entity = " ".join([word for (word, pos) in child.leaves()])
                    if child.node == searched_entity:
                        entities.append(entity)
                    all_entities.append(entity)

        if 'OTHER' == searched_entity:
            entities += self._other_recognition(tagged_sentences, all_entities, question)

        if 'NUMBER' == searched_entity:
            entities += self._number_recognition(text, tagged_sentences, all_entities)

        return entities
Developer: danigarabato | Project: qa | Lines: 25 | Source: answer.py


Example 4: obtenerNEs

def obtenerNEs(lista):

    listaGeneral = []

    for (tweet, listaPalabras, clasificacion, diferenciaProbabilidad) in lista:
        # Only evaluate tweets classified as positive
        print clasificacion
        if clasificacion == 'positive':
            sentences = nltk.tokenize.sent_tokenize(tweet)
            # Split on whitespace instead of tokenizing so we can extract user mentions;
            # word_tokenize splits off the @ sign, so we could not filter them afterwards
            nuevaSentences = []
            for s in sentences:
                subLista = quitarExcedenteSimple(s.split())
                nuevaSentences.append(' '.join(subLista))

            tokens = [nltk.tokenize.word_tokenize(s) for s in nuevaSentences]

            pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
            ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens, binary=True)

            listaNEs = []
            for subArbol in ne_chunks:
                traverse(subArbol, listaNEs, False)

            if listaNEs:
                listaGeneral.append((tweet, listaPalabras, listaNEs))

    web.debug('Tweets con NEs:' + str(len(listaGeneral)))
    return listaGeneral
Developer: JavierOgg | Project: proyectoFinal | Lines: 30 | Source: funciones.py


Example 5: process_entities

def process_entities(sentence):  
    words = []
    #print sentence

    #now break sentences into tokens
    tokens = nltk.word_tokenize(sentence)
    #print tokens

    #A bit of POS tagging
    pos_tagged_tokens = [nltk.pos_tag(tokens)]

    #Chunk extraction time
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)

    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')
    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]

    #Entity extraction

    #Code from Mining data from the social web: https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/blogs_and_nlp__extract_entities.py
    post = {}
    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []
    #print pos_tagged_tokens
    for (token, pos) in pos_tagged_tokens:

        if pos == previous_pos and pos.startswith('NN'):
            current_entity_chunk.append(token)
        elif pos.startswith('NN'):
            if current_entity_chunk != []:

                # Note that current_entity_chunk could be a duplicate when appended,
                # so frequency analysis again becomes a consideration

                all_entity_chunks.append((' '.join(current_entity_chunk), pos))
            current_entity_chunk = [token]

        previous_pos = pos

    # Store the chunks as an index for the document
    # and account for frequency while we're at it...

    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1

    # For example, we could display just the title-cased entities


    proper_nouns = []
    for (entity, pos) in post['entities']:
        if entity.istitle():
            proper_nouns.append(entity)
            #print '\t%s (%s)' % (entity, post['entities'][(entity, pos)])
            #print entity
            #[(entity, pos)]
    return proper_nouns
Developer: carriercomm | Project: scraperwiki-scraper-vault | Lines: 60 | Source: entities_speedcamera.py


Example 6: nlp_extract_tags

def nlp_extract_tags(text, lang=None):
    """
    Return a list of tags extracted from provided text.
    """

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, "node") and t.node:
            if t.node == "NE":
                entity_names.append(" ".join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    result = {"tags": list(set(entity_names))}

    return jsonp({"status": "ok", "result": result})
Developer: rautarchana9 | Project: hascore | Lines: 29 | Source: nlp.py


Example 7: get_named_entities

	def get_named_entities(self,text):
		sentences = nltk.sent_tokenize(text)
		sentences = [nltk.word_tokenize(sent) for sent in sentences]
		sentences = [nltk.pos_tag(sent) for sent in sentences] #takes 3ish seconds
		nes = nltk.batch_ne_chunk(sentences,binary=False) #takes 2ish seconds
		named_entities = {}
		stop_names = ['Mr.']
		
		# Loop through the tagged sentences, looking for named entites, and put their "leaves" together
		# e.g. "White" + " " + "House"
		#
		for i in nes:
			for j in i:
				if re.search('PERSON|ORGANIZATION|LOCATION|GPE|FACILITY',str(j)):
					name = ' '.join(c[0] for c in j.leaves())
					
					# Attempt to merge people names if you've seen them before
					# e.g. Ms. Clinton gets merged into Hillary Clinton
					if not (name in stop_names):
						regex = re.compile(r'^'+name.split(' ')[-1]+'|\s'+name.split(' ')[-1]+'$')
						regex_match = filter(regex.search,named_entities.keys())
						if (name in named_entities):
							named_entities[name]+=1
						elif  (len(regex_match)>0 and re.search('PERSON',str(j))!=None):
							named_entities[regex_match[0]]+=1
						else:
							named_entities[name] = 1
		
		# Sort named entities by count and take first 8
		sorted_names = sorted(named_entities.iteritems(), key=operator.itemgetter(1), reverse=True)
		names=[]
		for name in sorted_names[:8]:
			names.append(name[0].lower())		
		return names
Developer: visbe | Project: long-view | Lines: 34 | Source: keyword_getter.py


Example 8: extractchunk

 def extractchunk(tweettuple):
     sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(w))) for (a,w) in tweettuple]
     cid = [str(a) for (a,w) in tweettuple]
     tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
     pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
     ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
     return dict(zip(cid, ne_chunks))
Developer: hkilter | Project: bullwhip_effect | Lines: 7 | Source: iterate_couchdb__extract_timelinecomparisons.py


Example 9: extract_entities

def extract_entities(sample):

    print 'extracting entities'
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
     
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    # create a map of entity -> count, representing
    # the number of occurrences of an entity
    entity_count = {}
    for entity in entity_names:
        if entity in entity_count:
            entity_count[entity] += 1
        else:
            entity_count[entity] = 1

    sorted_occurences = sorted(entity_count.iteritems(), reverse=True, key=operator.itemgetter(1))
    #return OrderedDict(entity_count)

    # Print unique entity names
    #print set(entity_names)
    return sorted_occurences
Developer: ebegoli | Project: Agatha | Lines: 27 | Source: agatha.py


Example 10: extract_chunked_sentences

def extract_chunked_sentences( raw ):
    """
    """    
    sentences = nltk.sent_tokenize(raw)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)    
    return chunked_sentences
Developer: ebegoli | Project: AffectiveNLP | Lines: 8 | Source: recognizer.py


Example 11: extractNamedEntities

def extractNamedEntities(sentences):
    tok_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tag_sentences = [nltk.pos_tag(sentence) for sentence in tok_sentences]
    cnk_sentences = nltk.batch_ne_chunk(tag_sentences, binary=True)
    all_named_entities = []
    for tree in cnk_sentences:      
        named_entities = extractNamedEntitiesFromChunkSentence(tree)
        all_named_entities.extend(named_entities)
    return list(set(all_named_entities))
Developer: Kevinwenya | Project: textmining-3 | Lines: 9 | Source: simple-nltk-webservice.py


Example 12: extractchunk

def extractchunk(tweettuple):
    #Break each tweet into groups of sentences and words
    #Run through the nltk standard pos tag and chunker functions

    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(c))) for (a,w,c) in tweettuple]
    cid = [str(a) for (a,w, c) in tweettuple]
    tnum =[w for (a,w,c) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return zip(cid, tnum, ne_chunks)
Developer: hkilter | Project: bullwhip_effect | Lines: 11 | Source: couchdb__extract_searchcomparisons.py


Example 13: get_entities

    def get_entities(sentences):
        #sentences = nltk.sent_tokenize(doc) # some nltk preprocessing: tokenize, tag, chunk, NER
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

        entities = []
        for t in chunked_sentences:
            entities.append(entitify(t))

        return entities
Developer: W4ngatang | Project: DocumentSummarizer | Lines: 11 | Source: build.py


Example 14: get_entities3

def get_entities3(text):
  sentences = nltk.sent_tokenize(text)
  tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
  tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
  chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
  
  entity_names=[]
  for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

  return filter_entities(entity_names)
Developer: bstewartny | Project: Political-News | Lines: 11 | Source: feeds.py


Example 15: gen_ners

 def gen_ners(self,sample):
     """ returns NERS in the sample given as a list """
     sentences = nltk.sent_tokenize(sample)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
     entity_names = []
     for tree in chunked_sentences:
             entity_names.extend(self._extract_entity_names(tree))
     unique_ners = list(set(entity_names))
     return unique_ners
Developer: digitaltracer | Project: info-beanstalk | Lines: 11 | Source: consumer_threads.py


Example 16: get_entities

def get_entities(text):
    '''
    Extracts named entities from the supplied text.
    Returns a list of entity names.
    '''
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
Developer: erochest | Project: Trove-newspapers | Lines: 13 | Source: clean.py


Example 17: char_recognition

	def char_recognition(self, char_number = 20):
		tagged_sentences = nltk.pos_tag_sents(self.tokenized_sentences)
		self.entities = []
		entity_names = []
		if nltk.__version__[0] == '3':
			chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
			for tree in chunked_sentences:
				entity_names.extend(extract_entity_names3(tree))
		else:
			chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=False)
			for tree in chunked_sentences:
				entity_names.extend(extract_entity_names(tree))
		count = Counter([name for name in entity_names])
		for c in count.most_common(char_number):
			self.entities.append(c[0])
Developer: nbilenko | Project: narrative_explorer | Lines: 15 | Source: content.py


Example 18: get_named_entities

def get_named_entities(paragraph, ent_type):
    sentences = nltk.sent_tokenize(paragraph)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree, ent_type))
    
    keywords = ''
    for ent in entity_names:
        keywords = keywords + ' ' + ent

    return keywords
Developer: valibanu | Project: image-from-text | Lines: 15 | Source: ner.py


Example 19: analyzeText

	def analyzeText(self,question,answers):
		# does tokenization, POS tagging, and chunking, and returns all three
		# for answer in answers:
		answers = ''.join(answers)
		#print answer
		# sentence tokenizer
		sentences = sent_tokenize(answers)
		# word tokenizer
		tokens = [word_tokenize(sentence) for sentence in sentences]
		#pos tagger
		postags = [pos_tag(token) for token in tokens]
		#chunking
		chunks = batch_ne_chunk(postags,binary=True)
		TAObj = TextAnalyticsObj(question,answers,sentences,tokens,postags,chunks)
		return TAObj
Developer: nischalhp | Project: Feedlyze | Lines: 15 | Source: textAnalytics.py


Example 20: chunkSentences

def chunkSentences(text):
    """
    Parses text into parts of speech tagged with parts of speech labels.

    Used for reference: https://gist.github.com/onyxfish/322906
    """
    sentences = nltk.sent_tokenize(text)
    tokenizedSentences = [nltk.word_tokenize(sentence)
                          for sentence in sentences]
    taggedSentences = [nltk.pos_tag(sentence)
                       for sentence in tokenizedSentences]
    if nltk.__version__[0:2] == "2.":
        chunkedSentences = nltk.batch_ne_chunk(taggedSentences, binary=True)
    else:
        chunkedSentences = nltk.ne_chunk_sents(taggedSentences, binary=True)
    return chunkedSentences
Developer: emdaniels | Project: character-extraction | Lines: 16 | Source: characterExtraction.py



Note: The nltk.batch_ne_chunk examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, who retain the copyright; consult each project's license before redistributing or reusing the code. Do not republish without permission.

