
Python tokenize.PunktSentenceTokenizer Class Code Examples


This article collects typical usage examples of the nltk.tokenize.PunktSentenceTokenizer class in Python. If you are wondering what the PunktSentenceTokenizer class is for, how to use it, or what working examples look like, the curated code samples below should help.



The following presents 20 code examples of the PunktSentenceTokenizer class, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
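Before the examples, here is a minimal usage sketch of the pattern most of them share: train a PunktSentenceTokenizer on some raw text (or construct it with no arguments to use the default parameters) and call tokenize() to split text into sentences. The training and sample strings below are made-up placeholders for illustration only.

from nltk.tokenize import PunktSentenceTokenizer

# Train an unsupervised Punkt model on raw text; any reasonably large
# plain-text corpus can serve as training data.
train_text = "Dr. Smith went to Washington. He arrived at 9 a.m. and left at noon."
tokenizer = PunktSentenceTokenizer(train_text)

# Apply the trained model to new text to get a list of sentences.
sample_text = "Mr. Jones met Dr. Smith. They discussed the budget for U.S. research."
for sentence in tokenizer.tokenize(sample_text):
    print(sentence)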

Example 1: __init__

 def __init__(self,sentence):
    f = open('data/training_data', 'r')
    train_text=f.read()
    #data=open('data2','r')
    #test_data=data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
Developer: codehacken, Project: Athena, Lines: 7, Source: nlp.py


Example 2: POS_tagging

def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)

    # textfile = open("POS_tagged",'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)

    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []
    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            c=0
            # print(str(e))
    process_content()
    return tuples_list
Developer: achuth-noob, Project: CHAT-BOT, Lines: 25, Source: C_F_testing.py


Example 3: extractNounPhrases

def extractNounPhrases(sentence):

    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        firstNN = False

        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue

            firstNN = False

    except Exception as e:
        print(str(e))

    return nounPhrases
Developer: robienoor, Project: NLTKForumScraper, Lines: 31, Source: naturalLanguageWhiz.py


Example 4: get_sentences

    def get_sentences(self, remove_url=True):
        '''
        generator
        :param remove_url --> replace URLs in sentences with one space char ;
        :return: tuple of sentences for each mime-part ;
        '''

        tokenizer = PunktSentenceTokenizer()

        for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):

            if 'html' in mime_type:
                soup = BeautifulSoup(raw_line)
                if not soup.body:
                    continue
                # exact sentences are needed; soup.body.strings returns lines terminated with CR/LF (0d0a)
                lines = tuple(soup.body.strings)
                raw_line = ''.join(lines)

            try:
                sents = tuple(tokenizer.tokenize(raw_line))
            except Exception as err:
                sents = tuple(raw_line)

            if remove_url:
                sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))

            sents = (s.strip().lower() for s in sents)
            sents = tuple(s for s in tuple(sents) if s)
            if len(sents) == 0:
                continue

            yield sents
Developer: ml-course-stanford, Project: algos, Lines: 33, Source: msg_wrapper.py


Example 5: normalize

def normalize(text):
    p = PunktSentenceTokenizer()
    bullet1 = '\xe2\x80\xa2'.decode('utf-8')
    bullet2 = '\xc2\xb7'.decode('utf-8')
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Developer: tristaneuan, Project: wikia-nlp, Lines: 10, Source: batch-named-entity-harvester.py


Example 6: tokenize_english_document

def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
Developer: ebu, Project: ebu-tt-live-toolkit, Lines: 54, Source: common.py


Example 7: aristo_get_named_entities

 def aristo_get_named_entities(self, text):
     """
     Parses the texts to obtain named entities
     :param text: The text to parse
     :return: returns a named entity tree
     """
     custom_sent_tokenizer = PunktSentenceTokenizer(text)
     tokenized = custom_sent_tokenizer.tokenize(text)
     for i in tokenized[5:]:
         words = nltk.word_tokenize(i)
         tagged = nltk.pos_tag(words)
         namedEnt = nltk.ne_chunk(tagged, binary=False)
         return ((namedEnt))
Developer: elangovana, Project: Aristo, Lines: 13, Source: text_analyser.py


Example 8: tag

def tag(sentence):

    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)

        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)

        return tagged

    except Exception as e:
        print(str(e))
Developer: robienoor, Project: NLTKForumScraper, Lines: 13, Source: naturalLanguageWhiz.py


Example 9: name_ent_recog

def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Developer: achuth-noob, Project: CHAT-BOT, Lines: 14, Source: join_sub_obj.py


Example 10: sentenceTagging

def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            #chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))

    return taggedSentence
Developer: subhodip, Project: hacktags, Lines: 15, Source: createTags.py


Example 11: pos

	def pos(self, paragraph):

		wordsdict = collections.OrderedDict()
		sent_tokenizer = PunktSentenceTokenizer()

		for sentence in self.sent_detector.tokenize(paragraph):
			tokens = sent_tokenizer.tokenize(sentence)

			for token in tokens:
				words = nltk.word_tokenize(token)
				tagged = nltk.pos_tag(words)
				for word in tagged:
					if word[1] in self.tagdict:
						wordsdict[word[0]] = self.tagdict[word[1]][0]

		return wordsdict
Developer: ponrajuganesh, Project: POSTagger, Lines: 16, Source: analysis.py


Example 12: Tokenizer

class Tokenizer(object):

    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using NLTK Penn Treebank tokenizer, and the Punkt sentence tokenizer.
        Params:
        language: Language to tokenize (currently doesn't do anything)
        train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize
        
        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)

        return tokenized
Developer: hihihippp, Project: plainstream, Lines: 32, Source: tokenizer.py


Example 13: main

def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = input("1 for named_chunks. This provides some information about proper nouns.\n, 2 for process_chunks. This tells you if a noun phrase followed by n adverb occurs., \n3 for proccess content, this just prints stuff, 4 for...")
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print "try again, bitch!"
Developer: EricChristensen, Project: Python_Randomness, Lines: 17, Source: PosTagging.py


Example 14: extract_features

    def extract_features(self):
        """
        All approach of extracting features from raw data implemented here
        """
        custom_tokenizer = PunktSentenceTokenizer()
        regex_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        ps = PorterStemmer()
        tokenized = []

        with open(self.file_path, 'r') as current_document:
            for each_line in current_document:
                tokenized.extend(custom_tokenizer.tokenize(each_line))  # tokenizing words line by line
        feature_list = []
        try:
            for each_sentence in tokenized:
                # words = nltk.word_tokenize(each_sentence)
                words = regex_tokenizer.tokenize(each_sentence)
                tagged = nltk.pos_tag(words)
                feature_list.extend([ps.stem(pos[0].lower()) for pos in tagged if pos[1] == 'NN'])  # listing the nouns in a list
        except Exception as E:
            print(str(E))
        feature_dictionary = Counter(feature_list)  # converts an iterable object(in this case, LIST) to dictionary
        return feature_dictionary
Developer: Jumayel06, Project: Thesis, Lines: 23, Source: DocumentProcess.py


Example 15: __init__

 def __init__(self, language, normalize=False, train_text_gen=None):
     """
     A tokenizer using NLTK Penn Treebank tokenizer, and the Punkt sentence tokenizer.
     Params:
     language: Language to tokenize (currently doesn't do anything)
     train_text_gen: A generator of training text for the sentence tokenizer.
     """
     self.language = language
     self.train_text_gen = train_text_gen
     self.normalize = normalize
     
     if train_text_gen:
         self.sent_tokenizer = self._train_sentence_tokenizer()
     else:
         self.sent_tokenizer = PunktSentenceTokenizer()
Developer: hihihippp, Project: plainstream, Lines: 15, Source: tokenizer.py


Example 16: __init__

class NER:
    """docstring for ClassName"""
    def __init__(self, query):
        self.original_query = query
        conf = shelve.open('conf')
        self.train_text = conf['train_text']
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)

    def processContent(self):
        try:
            for i in self.tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                #print(namedEnt)
                #namedEnt.draw()
            return namedEnt
        except Exception as e:
            print(str(e))
        

    # Parse named entities from tree
    def structureNamedEntities(self):
    	ne = []
    	for subtree in self.named_entity_tree:
    		if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
    			ne_label = subtree.label()
    			ne_string = " ".join([token for token, pos in subtree.leaves()])
    			ne.append((ne_string, ne_label))
    	return ne

    def performNER(self):
        self.named_entity_tree = self.processContent()
        #print(type(self.named_entity_tree))
        self.named_entity_tuple = self.structureNamedEntities()
        #print(ne)
        names = [element[0] for element in self.named_entity_tuple]
        return names
Developer: Mitgorakh, Project: myproject, Lines: 39, Source: ner.py


Example 17: PunktSentenceTokenizer

#!/usr/bin/env python

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #unsupervised tokenizer


train_text = state_union.raw('2005-GWBush.txt')

#print train_text

test_text = state_union.raw('2006-GWBush.txt')

custom_sent_token = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_token.tokenize(test_text)

#print tokenized
#print type(tokenized)

def chunk():
	try:
		for i in tokenized:
			words = nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)

			regexp = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} 
								}<VB.?|IN|DT|TO>+{"""

			parser = nltk.RegexpParser(regexp)
Developer: allanis79, Project: machine-learning, Lines: 30, Source: chunkk.py


Example 18: open

pa = parser.parse_args()
lang = pa.lang
filePath = pa.file
outputPath = filePath + '.sent'


if __name__ == "__main__":
    file = open(filePath, 'r')
    output = open(outputPath, 'w')
    sst = None
    if lang == 'EN':
        sst = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    elif lang == 'ES':
        sst = nltk.data.load('nltk:tokenizers/punkt/spanish.pickle')
    else:
        sst = PunktSentenceTokenizer()
    for line in file:
        if line == "\n":
            sys.stdout.write(line)
            continue
        line = line.replace("«", "'")
        line = line.replace("»", "'")
        line = line.replace("“", "'")
        line = line.replace("”", "'")
        line = line.replace("\"", "'")
        sentences = sst.tokenize(line.decode("utf-8"))
        for s in sentences:
            output.write((s+'\n').encode('utf-8'))
    file.close()
    output.close()
Developer: isi-metaphor, Project: Metaphor-ADP, Lines: 30, Source: nltk_tokenizer.py


Example 19: PorterStemmer

import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
	print(ps.stem(w))

##Part of Speech Tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#Unsupervised machine learning tokenizer -> PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text) #training on train_text
	
tokenized = custom_sent_tokenizer.tokenize(sample_text) #applying model to sample_text
#this will generate sentences

def process_content():
	try:
		for i in tokenized:
			words= nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)
			print(tagged)
	except Exception as e:
		print(str(e))
		
process_content()
Developer: Utkagr, Project: NLPrel, Lines: 30, Source: all_nltk.py


Example 20: tokenizer

VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when
"""


# retrieving the corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# training the sentence tokenizer (unsupervised)
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

# tagging the tokens by word tokenizing the sentence and the using regular exp to chunk the tokens
try:
    for s in sentence:
        token = word_tokenize(s)
        pos = pos_tag(token)
        print(pos)
        chunkreg = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkreg)
        chunked = chunkParser.parse(pos)
        chunked.draw()

except Exception as e:
    print(str(e))
Developer: Khushmeet, Project: nltk-learning-tutorials, Lines: 31, Source: chunking.py



Note: The nltk.tokenize.PunktSentenceTokenizer class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by many developers; copyright remains with the original authors. For distribution and use, please refer to the corresponding project's license. Do not reproduce without permission.

