This article collects typical usage examples of the Python class nltk.tokenize.PunktSentenceTokenizer. If you have been wondering what exactly PunktSentenceTokenizer does, how to use it, or where to find worked examples, the curated class examples below may help.
The following shows 20 code examples of the PunktSentenceTokenizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
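Before the project examples, here is a minimal orientation sketch (not taken from any of the projects below) showing the two common ways to obtain a Punkt sentence tokenizer: training the unsupervised model on your own text, or using the default parameters directly.

from nltk.tokenize import PunktSentenceTokenizer

# Option 1: train the unsupervised Punkt model on a sample of domain text.
train_text = "Dr. Smith went to Washington. He arrived at 10 a.m. and left early."
trained_tokenizer = PunktSentenceTokenizer(train_text)

# Option 2: use the default, pre-set parameters without training.
default_tokenizer = PunktSentenceTokenizer()

text = "Punkt learns abbreviations from data. It then splits text into sentences."
print(trained_tokenizer.tokenize(text))
print(default_tokenizer.tokenize(text))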
Example 1: __init__
def __init__(self, sentence):
    # Train a custom Punkt model on project-specific text, then split the input sentence.
    f = open('data/training_data', 'r')
    train_text = f.read()
    # data = open('data2', 'r')
    # test_data = data.read()
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    self.tokenized = custom_sent_tokenizer.tokenize(sentence)
Developer: codehacken | Project: Athena | Lines: 7 | Source: nlp.py
Example 2: POS_tagging
def POS_tagging(corpus):
    # Train the sentence tokenizer on a State of the Union address, then POS-tag the corpus.
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    # print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
    # textfile = open("POS_tagged", 'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)
    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []

    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            c = 0
            # print(str(e))

    process_content()
    return tuples_list
Developer: achuth-noob | Project: CHAT-BOT | Lines: 25 | Source: C_F_testing.py
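A hypothetical call to the function above, assuming the nltk data it relies on (the state_union corpus, the Punkt model, and the POS tagger) has been downloaded; the exact tags depend on the tagger version.

tags = POS_tagging("The economy is growing. Congress passed the bill yesterday.")
print(tags[:4])  # e.g. [('The', 'DT'), ('economy', 'NN'), ('is', 'VBZ'), ('growing', 'VBG')]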
Example 3: extractNounPhrases
def extractNounPhrases(sentence):
    # Collect two-word noun phrases (pairs of consecutive NN* tokens) from the first sentence.
    nounPhrases = []
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)
        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)
        firstNN = False
        for tag in tagged:
            pos = tag[1]
            if "NN" in pos:
                if firstNN:
                    nounPhrase = firstNoun + " " + tag[0]
                    nounPhrases.append(nounPhrase)
                    firstNN = False
                    continue
                else:
                    firstNoun = tag[0]
                    firstNN = True
                    continue
            firstNN = False
    except Exception as e:
        print(str(e))
    return nounPhrases
Developer: robienoor | Project: NLTKForumScraper | Lines: 31 | Source: naturalLanguageWhiz.py
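A hypothetical call to extractNounPhrases, assuming nltk's Punkt and POS tagger data are available; it pairs consecutive NN* tokens, so the output depends on how the tagger labels the sentence.

print(extractNounPhrases("The forum moderator closed the discussion thread."))
# e.g. ['forum moderator', 'discussion thread']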
Example 4: get_sentences
def get_sentences(self, remove_url=True):
    '''
    generator
    :param remove_url: replace URLs in sentences with one space char
    :return: tuple of sentences for each MIME part
    '''
    tokenizer = PunktSentenceTokenizer()
    for raw_line, mime_type, lang in tuple(self.get_text_mime_part()):
        if 'html' in mime_type:
            soup = BeautifulSoup(raw_line)
            if not soup.body:
                continue
            # exactly sentences are needed; soup.body.strings returns lines + 0d0a
            lines = tuple(soup.body.strings)
            raw_line = ''.join(lines)
        try:
            sents = tuple(tokenizer.tokenize(raw_line))
        except Exception as err:
            sents = (raw_line,)  # fall back to treating the whole line as a single sentence
        if remove_url:
            sents = tuple(map(lambda sent: self.__URLINTEXT_PAT.sub(' ', sent.lower()), sents))
        sents = (s.strip().lower() for s in sents)
        sents = tuple(s for s in tuple(sents) if s)
        if len(sents) == 0:
            continue
        yield sents
Developer: ml-course-stanford | Project: algos | Lines: 33 | Source: msg_wrapper.py
Example 5: normalize
def normalize(text):
    # Keep only short sentences that do not contain bullet characters.
    p = PunktSentenceTokenizer()
    bullet1 = u'\u2022'  # '•' (originally '\xe2\x80\xa2'.decode('utf-8'))
    bullet2 = u'\xb7'    # '·' (originally '\xc2\xb7'.decode('utf-8'))
    usable = ''
    for sentence in p.tokenize(text):
        if len(sentence) < 500:
            if bullet1 not in sentence and bullet2 not in sentence:
                usable += '%s ' % sentence
    return usable
Developer: tristaneuan | Project: wikia-nlp | Lines: 10 | Source: batch-named-entity-harvester.py
Example 6: tokenize_english_document
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2
    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]
            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line
                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False
                    if len(lines) >= lines_per_subtitle:
                        end_list.append(lines)
                        lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)
    return end_list
Developer: ebu | Project: ebu-tt-live-toolkit | Lines: 54 | Source: common.py
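A hypothetical call to the subtitle tokenizer above, assuming the nltk tokenizers it uses (BlanklineTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer) are imported as in the project; blank lines separate speaker blocks.

subtitles = tokenize_english_document("Hello there, how are you doing today?\n\nFine thanks, and you?")
print(subtitles)  # each entry is one subtitle: a list of at most 2 lines of at most 38 characters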
Example 7: aristo_get_named_entities
def aristo_get_named_entities(self, text):
    """
    Parses the text to obtain named entities
    :param text: The text to parse
    :return: returns a named entity tree
    """
    custom_sent_tokenizer = PunktSentenceTokenizer(text)
    tokenized = custom_sent_tokenizer.tokenize(text)
    for i in tokenized[5:]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged, binary=False)
    return namedEnt
Developer: elangovana | Project: Aristo | Lines: 13 | Source: text_analyser.py
Example 8: tag
def tag(sentence):
    try:
        tokenizer = PunktSentenceTokenizer(sentence)
        tokenized = tokenizer.tokenize(sentence)
        words = nltk.word_tokenize(tokenized[0])
        tagged = nltk.pos_tag(words)
        return tagged
    except Exception as e:
        print(str(e))
Developer: robienoor | Project: NLTKForumScraper | Lines: 13 | Source: naturalLanguageWhiz.py
Example 9: name_ent_recog
def name_ent_recog(post):
    # Sentence-split the post with a Punkt model trained on a State of the Union address,
    # then NE-chunk each sentence.
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
Developer: achuth-noob | Project: CHAT-BOT | Lines: 14 | Source: join_sub_obj.py
Example 10: sentenceTagging
def sentenceTagging(text, trainingText):
    csTokenizer = PunktSentenceTokenizer(trainingText)
    tokenized = csTokenizer.tokenize(text)
    taggedSentence = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            taggedSentence.append(tagged)
            # chinkingWords(tagged).draw()
            namedEntityRecog(tagged)
    except Exception as e:
        print(str(e))
    return taggedSentence
Developer: subhodip | Project: hacktags | Lines: 15 | Source: createTags.py
Example 11: pos
def pos(self, paragraph):
    # Map each word to a simplified POS label using the instance's tag dictionary.
    wordsdict = collections.OrderedDict()
    sent_tokenizer = PunktSentenceTokenizer()
    for sentence in self.sent_detector.tokenize(paragraph):
        tokens = sent_tokenizer.tokenize(sentence)
        for token in tokens:
            words = nltk.word_tokenize(token)
            tagged = nltk.pos_tag(words)
            for word in tagged:
                if word[1] in self.tagdict:
                    wordsdict[word[0]] = self.tagdict[word[1]][0]
    return wordsdict
Developer: ponrajuganesh | Project: POSTagger | Lines: 16 | Source: analysis.py
Example 12: Tokenizer
class Tokenizer(object):

    def __init__(self, language, normalize=False, train_text_gen=None):
        """
        A tokenizer using the NLTK Penn Treebank word tokenizer and the Punkt sentence tokenizer.
        Params:
            language: Language to tokenize (currently doesn't do anything)
            train_text_gen: A generator of training text for the sentence tokenizer.
        """
        self.language = language
        self.train_text_gen = train_text_gen
        self.normalize = normalize
        if train_text_gen:
            self.sent_tokenizer = self._train_sentence_tokenizer()
        else:
            self.sent_tokenizer = PunktSentenceTokenizer()

    def _train_sentence_tokenizer(self):
        return PunktSentenceTokenizer(train_text="\n".join(self.train_text_gen))

    def tokenize(self, text):
        tokenized = []
        for sentence in self.sent_tokenizer.tokenize(text):
            tokenized_sentence = []
            for word in word_tokenize(sentence):
                if self.normalize:
                    word = word.lower()
                tokenized_sentence.append(word)
            tokenized.append(tokenized_sentence)
        return tokenized
Developer: hihihippp | Project: plainstream | Lines: 32 | Source: tokenizer.py
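A minimal usage sketch for the Tokenizer class above, assuming nltk's word_tokenize is imported in the same module and the Punkt data is installed:

tok = Tokenizer("english", normalize=True)
print(tok.tokenize("Punkt splits sentences. Treebank splits words."))
# [['punkt', 'splits', 'sentences', '.'], ['treebank', 'splits', 'words', '.']]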
Example 13: main
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    choice = 0
    while choice < 5:
        choice = input("1 for named_chunks. This provides some information about proper nouns.\n"
                       "2 for process_chunks. This tells you if a noun phrase followed by an adverb occurs.\n"
                       "3 for process_content, this just prints stuff. 4 for...")
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print "try again, bitch!"
Developer: EricChristensen | Project: Python_Randomness | Lines: 17 | Source: PosTagging.py
Example 14: extract_features
def extract_features(self):
    """
    All approaches for extracting features from the raw data are implemented here.
    """
    custom_tokenizer = PunktSentenceTokenizer()
    regex_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    ps = PorterStemmer()
    tokenized = []
    with open(self.file_path, 'r') as current_document:
        for each_line in current_document:
            tokenized.extend(custom_tokenizer.tokenize(each_line))  # split each line into sentences
    feature_list = []
    try:
        for each_sentence in tokenized:
            # words = nltk.word_tokenize(each_sentence)
            words = regex_tokenizer.tokenize(each_sentence)
            tagged = nltk.pos_tag(words)
            feature_list.extend([ps.stem(pos[0].lower()) for pos in tagged if pos[1] == 'NN'])  # collect the nouns in a list
    except Exception as E:
        print(str(E))
    feature_dictionary = Counter(feature_list)  # converts an iterable (here, a list) into a Counter dictionary
    return feature_dictionary
Developer: Jumayel06 | Project: Thesis | Lines: 23 | Source: DocumentProcess.py
Example 15: __init__
def __init__(self, language, normalize=False, train_text_gen=None):
    """
    A tokenizer using the NLTK Penn Treebank word tokenizer and the Punkt sentence tokenizer.
    Params:
        language: Language to tokenize (currently doesn't do anything)
        train_text_gen: A generator of training text for the sentence tokenizer.
    """
    self.language = language
    self.train_text_gen = train_text_gen
    self.normalize = normalize
    if train_text_gen:
        self.sent_tokenizer = self._train_sentence_tokenizer()
    else:
        self.sent_tokenizer = PunktSentenceTokenizer()
Developer: hihihippp | Project: plainstream | Lines: 15 | Source: tokenizer.py
Example 16: __init__
class NER:
    """docstring for ClassName"""

    def __init__(self, query):
        self.original_query = query
        conf = shelve.open('conf')
        self.train_text = conf['train_text']
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.original_query)

    def processContent(self):
        try:
            for i in self.tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                # print(namedEnt)
                # namedEnt.draw()
            return namedEnt
        except Exception as e:
            print(str(e))

    # Parse named entities from the tree
    def structureNamedEntities(self):
        ne = []
        for subtree in self.named_entity_tree:
            if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne

    def performNER(self):
        self.named_entity_tree = self.processContent()
        # print(type(self.named_entity_tree))
        self.named_entity_tuple = self.structureNamedEntities()
        # print(ne)
        names = [element[0] for element in self.named_entity_tuple]
        return names
Developer: Mitgorakh | Project: myproject | Lines: 39 | Source: ner.py
Example 17: PunktSentenceTokenizer
#!/usr/bin/env python
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # unsupervised tokenizer

train_text = state_union.raw('2005-GWBush.txt')
# print train_text
test_text = state_union.raw('2006-GWBush.txt')

custom_sent_token = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_token.tokenize(test_text)
# print tokenized
# print type(tokenized)

def chunk():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            regexp = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}
                                }<VB.?|IN|DT|TO>+{"""
            parser = nltk.RegexpParser(regexp)
            # (the remaining lines of the original 30-line script are not shown in this listing)
    except Exception as e:  # except clause assumed; the source listing is truncated here
        print(str(e))
Developer: allanis79 | Project: machine-learning | Lines: 30 | Source: chunkk.py
Example 18: open
# The imports and argument-parser setup are not shown in this listing; a minimal
# reconstruction (assumed, not from the original file) would look like:
# import sys
# import argparse
# import nltk
# from nltk.tokenize import PunktSentenceTokenizer
# parser = argparse.ArgumentParser()
# parser.add_argument('lang')
# parser.add_argument('file')

pa = parser.parse_args()
lang = pa.lang
filePath = pa.file
outputPath = filePath + '.sent'

if __name__ == "__main__":
    file = open(filePath, 'r')
    output = open(outputPath, 'w')
    sst = None
    if lang == 'EN':
        sst = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    elif lang == 'ES':
        sst = nltk.data.load('nltk:tokenizers/punkt/spanish.pickle')
    else:
        sst = PunktSentenceTokenizer()
    for line in file:  # Python 2 style: lines are byte strings and are decoded below
        if line == "\n":
            sys.stdout.write(line)
            continue
        line = line.replace("«", "'")
        line = line.replace("»", "'")
        line = line.replace("“", "'")
        line = line.replace("”", "'")
        line = line.replace("\"", "'")
        sentences = sst.tokenize(line.decode("utf-8"))
        for s in sentences:
            output.write((s + '\n').encode('utf-8'))
    file.close()
    output.close()
Developer: isi-metaphor | Project: Metaphor-ADP | Lines: 30 | Source: nltk_tokenizer.py
示例19: PosterStemmer
from nltk.stem import PorterStemmer
ps = PosterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
print(ps.stem(w))
##Part of Speech Tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#Unsupervised machine learning tokenizer -> PunktSentenceTokenizer
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text) #training on train_text
tokenized = custom_sent_tokenizer.tokenize(sample_text) #applying model to sample_text
#this will generate sentences
def process_content():
try:
for i in tokenized:
words= nltk.word_tokenize(i)
tagged = nltk.pos_tag(words)
print(tagged)
except: Exception as e:
print(str(e))
process_content()
开发者ID:Utkagr,项目名称:NLPrel,代码行数:30,代码来源:all_nltk.py
Example 20: tokenizer
VBN     verb, past participle            taken
VBP     verb, sing. present, non-3d      take
VBZ     verb, 3rd person sing. present   takes
WDT     wh-determiner                    which
WP      wh-pronoun                       who, what
WP$     possessive wh-pronoun            whose
WRB     wh-adverb                        where, when
"""

# retrieving the corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# training the sentence tokenizer (unsupervised)
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

# tag the tokens by word-tokenizing each sentence, then use a regular expression to chunk the tokens
try:
    for s in sentence:
        token = word_tokenize(s)
        pos = pos_tag(token)
        print(pos)
        chunkreg = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkreg)
        chunked = chunkParser.parse(pos)
        chunked.draw()
except Exception as e:
    print(str(e))
Developer: Khushmeet | Project: nltk-learning-tutorials | Lines: 31 | Source: chunking.py
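Because several of the examples above use the same chunk grammar, here is a small self-contained sketch (hand-tagged input, so no tagger model is needed) showing what the pattern {<RB.?>*<VB.?>*<NNP>+<NN>?} actually groups:

import nltk

# A POS-tagged sentence written out by hand.
pos = [('He', 'PRP'), ('quickly', 'RB'), ('thanked', 'VBD'),
       ('President', 'NNP'), ('Bush', 'NNP'), ('yesterday', 'NN'), ('.', '.')]
grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunk_parser = nltk.RegexpParser(grammar)
print(chunk_parser.parse(pos))
# (S He/PRP (Chunk quickly/RB thanked/VBD President/NNP Bush/NNP yesterday/NN) ./.)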
Note: the nltk.tokenize.PunktSentenceTokenizer class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright of the source code remains with the original authors. For redistribution and use, please refer to the license of the corresponding project; do not reproduce without permission.