This article collects typical usage examples of the Python class nltk.tokenize.punkt.PunktSentenceTokenizer. If you have been wondering what PunktSentenceTokenizer is for, how to use it, or what real-world code that uses it looks like, the curated class examples below should help.
The article presents 20 code examples of the PunktSentenceTokenizer class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
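Before the project examples, here is a minimal usage sketch (not taken from any of the projects below; the sample text is invented) showing the basic API that all of them build on: instantiate the class and call tokenize() on a string.

from nltk.tokenize.punkt import PunktSentenceTokenizer

tokenizer = PunktSentenceTokenizer()
text = "Punkt splits text into sentences. It uses an unsupervised model. Try it on your own data."
for sent in tokenizer.tokenize(text):
    print(sent)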
Example 1: preprocess
def preprocess(phys):
    '''
    :param phys: raw text extracted from a file
    :return: a list of sentences, processed for searchability
    '''
    phys = phys.decode('utf-8')
    phys = re.sub('(\n)+', '. ', phys)
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(phys)
    for i in xrange(len(sentences)):
        sentence = unicode(sentences[i])
        sentence = sentence.replace('\n', ' ')
        sentence = re.sub(' +', ' ', sentence)
        sentence = re.sub(r'\d+', '', sentence)
        sentence = sentence.replace("-", " ")
        exclude = string.punctuation
        sentence = ''.join(ch for ch in sentence if ch not in exclude)
        sentence = re.sub(' +', ' ', sentence)
        sentences[i] = sentence
        # sentences[i] = sentence.encode('utf-8')
    count = 0
    for sentence in sentences:
        # note: popping while iterating can skip consecutive empty entries
        if sentence == ' ' or sentence == '':
            sentences.pop(count)
        count += 1
    # with open(fname.rstrip('txt')+'json', 'w') as outfile:
    #     json.dump(sentences, outfile)
    return sentences
Author: DevJared, Project: Manhattan-Project, Lines: 33, Source: pdf2text.py
Example 2: _split_sentences
def _split_sentences(self, text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(text)
    return sentences
Author: siolag161, Project: markov_generator, Lines: 7, Source: tokenizers.py
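As a hedged aside (not part of the project above; the sample sentence is invented), the effect of abbrev_types can be seen by tokenizing the same text with and without the abbreviation list: an unparameterized tokenizer may split after "Dr." or "Mr.", while the parameterized one keeps each sentence whole.

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

text = "Dr. Smith met Mr. Jones. They reviewed the draft."
print(PunktSentenceTokenizer().tokenize(text))             # may break after "Dr." / "Mr."
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'mr'])
print(PunktSentenceTokenizer(punkt_param).tokenize(text))  # abbreviations no longer end sentences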
Example 3: summarize
def summarize(self):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    sentences = sentence_splitter.tokenize(self.text)
    structure = {}
    sentence_objects = []
    for idx in range(len(sentences)):
        obj = {'text': sentences[idx], 'index': idx, 'data': {}}
        sentence_objects.append(obj)
    structure['sentences'] = sentence_objects
    self.sentencecount = len(structure['sentences'])
    structure['ordered'] = []
    structure['weights'] = {'words': FreqDist(nltk.word_tokenize(preprocess(self.text))), 'total': 0, 'transformed': 0}
    structure['weights']['total'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['tokens'] = nltk.word_tokenize(preprocess(each_sent['text']))
        each_sent['data']['sinTransform'] = (1 - math.sin(self.sentenceIndex * (math.pi / self.sentencecount))) + 1
        for each_word in structure['weights']['words']:
            if each_word in each_sent['data']['tokens']:
                structure['weights']['words'][each_word] *= each_sent['data']['sinTransform']
        self.sentenceIndex += 1
    structure['weights']['transformed'] = sum(structure['weights']['words'].values())
    self.sentenceIndex = 0
    for each_sent in structure['sentences']:
        each_sent['data']['weights'] = {'words': self.calculate_relative_frequence(each_sent['data']['tokens'], structure['weights']['words']), 'total': 0}
        each_sent['data']['weights']['total'] = sum(each_sent['data']['weights']['words'].values())
        self.sentenceIndex += 1
    structure['ordered'] = sorted(structure['sentences'], key=lambda x: x['data']['weights']['total'], reverse=True)
    structure_keep = structure['ordered'][:self.quota]
    structure_keep.sort(key=lambda x: x['index'])
    for eac_sen in structure_keep:
        self.summary.append(eac_sen['text'])
Author: yagamiram, Project: NLP_Auto_Summarization, Lines: 34, Source: fractal_template.py
Example 4: textrank
def textrank(document):
    pst = PunktSentenceTokenizer()
    sentences = pst.tokenize(document)
    # Bag of words
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer()
    bow_matrix = cv.fit_transform(sentences)
    from sklearn.feature_extraction.text import TfidfTransformer
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    ## mirrored matrix where the rows and columns correspond to
    ## sentences, and the elements describe how similar the
    ## sentences are. A score of 1 means two sentences are identical.
    similarity_graph = normalized_matrix * normalized_matrix.T
    similarity_graph.toarray()
    # PageRank
    import networkx as nx
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    ## mapping of sentence indices to scores; use them to associate
    ## back to the original sentences and sort them
    scores = nx.pagerank(nx_graph)
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print ranked[0][1]
Author: ko, Project: random, Lines: 27, Source: textrank.py
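A small driver for the function above, assuming it is imported as written (the sample document is invented). Note that the snippet is Python 2 (print statement), and that recent networkx releases expose from_scipy_sparse_array() in place of from_scipy_sparse_matrix(), so the graph-construction line may need adjusting on newer installs.

document = ("Punkt is an unsupervised sentence tokenizer. "
            "It learns abbreviations and collocations from raw text. "
            "TextRank then scores each sentence by its similarity to the others.")
textrank(document)  # prints the single highest-ranked sentence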
Example 5: featureize
def featureize(F, observation_files):
    word_tokenizer = PunktSentenceTokenizer()  # note: defined but not used below
    sent_tokenizer = PunktSentenceTokenizer()
    m = len(observation_files)
    # X is Nx2
    X = np.zeros((m, 2), dtype=np.float)
    for (i, filename) in enumerate(observation_files, start=0):
        file_text = read_file(filename).decode('string_escape')
        try:
            num_sents = len(sent_tokenizer.sentences_from_text(file_text))
        except UnicodeDecodeError:
            num_sents = 2
        #num_tokens = len(word_tokenize(file_text))
        num_tokens = len(file_text.split())
        # Return two features:
        # 1 (0) - Number of sentences per file
        # 2 (1) - Number of tokens per file
        X[i][0] = num_sents
        X[i][1] = num_tokens
    return X
Author: mikeswoods, Project: cis530-project, Lines: 29, Source: sentence_info.py
Example 6: get_key_sentences
def get_key_sentences(self, n=5):
    '''
    Uses a simple implementation of TextRank to extract the top N sentences
    from a document.
    Sources:
    - Original paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
    - Super useful blog post: http://joshbohde.com/blog/document-summarization
    - Wikipedia: http://en.wikipedia.org/wiki/Automatic_summarization#Unsupervised_keyphrase_extraction:_TextRank
    '''
    # Tokenize the document into sentences. More NLP preprocessing should also happen here.
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.tokenize(self.doc)
    # Calculate word counts and TF-IDF vectors
    word_counts = CountVectorizer(min_df=0).fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(word_counts)
    # Normalized graph * its transpose yields a sentence-level similarity matrix
    similarity_graph = normalized * normalized.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    # slice keeps the top n (score, sentence) pairs
    return sorted(((scores[i], s) for i, s in enumerate(sentences)),
                  reverse=True)[:n]
Author: joannaskao, Project: judgmental, Lines: 25, Source: metadata.py
Example 7: tokenize_sentences
def tokenize_sentences(self, untokenized_string: str):
    """Tokenize sentences by reading a trained tokenizer and invoking
    ``PunktSentenceTokenizer()``.
    :type untokenized_string: str
    :param untokenized_string: A string containing one or more sentences.
    :rtype : list of strings
    """
    # load tokenizer
    assert isinstance(untokenized_string, str), \
        'Incoming argument must be a string.'
    if self.language == 'latin':
        tokenizer = super()
    elif self.language == 'greek':  # Workaround for regex tokenizer
        self.sent_end_chars = GreekLanguageVars.sent_end_chars
        self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
        self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    elif self.language in INDIAN_LANGUAGES:
        self.sent_end_chars = SanskritLanguageVars.sent_end_chars
        self.sent_end_chars_regex = '|'.join(self.sent_end_chars)
        self.pattern = rf'(?<=[{self.sent_end_chars_regex}])\s'
    else:
        # Warn that NLTK Punkt is being used by default???
        tokenizer = PunktSentenceTokenizer()
    # make a list of tokenized sentences
    if self.language == 'greek' or self.language in INDIAN_LANGUAGES:
        return re.split(self.pattern, untokenized_string)
    else:
        return tokenizer.tokenize(untokenized_string)
Author: cltk, Project: cltk, Lines: 30, Source: sentence.py
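Related to the default branch above, which simply calls tokenizer.tokenize(): PunktSentenceTokenizer also provides span_tokenize(), which yields character offsets instead of strings; this is useful when the original positions of the sentences must be preserved. A minimal sketch (not from the CLTK code above; sample text invented):

from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "First sentence here. Second sentence here."
tokenizer = PunktSentenceTokenizer()
for start, end in tokenizer.span_tokenize(text):
    print(start, end, text[start:end])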
Example 8: preprocessin
def preprocessin(self, cell_value):
    # tokenize the tweet into sentences
    tweet = PunktSentenceTokenizer().tokenize(cell_value)
    # join the sentences back into one string (drops the u'...' list representation)
    tweet = '\n'.join(tweet)
    # remove html tags
    tweet = self.remTags(tweet)
    # lower-case the text
    tweet = tweet.lower()
    ## Removing all junk
    tweet = re.sub(u'(RT |\\\\|\u201c)"[email protected]*?[: ]', ' ', tweet)
    tweet = re.sub('@', ' ', tweet)
    tweet = re.sub(r'[^\x00-\x7F]', ' ', tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('_', ' ', tweet)
    tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))', '', tweet)
    tweet = re.sub(r'\\([^\s]+)', ' ', tweet)
    tweet = re.sub(u'[\u2018\u2019]', '\'', tweet)
    tweet = re.sub('(^|)?http?s?:?/?/?.*?( |$)', ' ', tweet)
    tweet = re.sub(u'\u2026', ' ', tweet)
    tweet = re.sub('---', ' ', tweet)
    tweet = re.sub(u'[\u201c\u201d]', '"', tweet)
    tweet = re.sub('\[email protected]*?( |:|$)', ' ', tweet)
    tweet = re.sub(r"\.\.+", ' ', tweet)
    tweet = re.sub('&', ' ', tweet)
    tweet = re.sub('\.\.\.', ' ', tweet)
    tweet = tweet.strip('\'"')
    tweet = re.sub('(, |\.( |$))', ' ', tweet)
    tweet = re.sub('[][!"$*,/;<=>[email protected]\\\\^_`{|}~]', ' ', tweet)
    tweet = re.sub('( - )', ' ', tweet)
    return tweet
Author: KartikeyPradhan, Project: Projects, Lines: 34, Source: preprocessing.py
Example 9: get_todo_items
def get_todo_items(text):
    all_items = list()
    tokenizer = PunktSentenceTokenizer()
    sen_tokens = tokenizer.tokenize(text)
    for sen_token in sen_tokens:
        todo_items = list()
        tokens = nltk.word_tokenize(sen_token)
        tags = tagger.tag(tokens)
        stop_words = [word for (word, tag) in tags if tag in (tagVB, tagVBP)]
        ind = -1
        for word in stop_words:
            curr_ind = tokens.index(word)
            if curr_ind != 0 and tags[curr_ind - 1][1] in (tagCC, tagRB):
                to_ind = curr_ind - 1
            else:
                to_ind = curr_ind
            if ind != -1 and abs(to_ind - ind) > 1:
                todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, to_ind)]))
            elif ind != -1 and len(todo_items) > 0:
                last_ind = len(todo_items)
                todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[to_ind - 1]])
            ind = curr_ind
        if ind != -1 and abs(len(tokens) - ind) > 1:
            todo_items.append(' '.join(tokens[ind:get_punctuation_index(tokens, ind, len(tokens))]))
        elif ind != -1 and len(todo_items) > 0:
            last_ind = len(todo_items)
            todo_items[last_ind - 1] = ' '.join([todo_items[last_ind - 1], tokens[len(tokens) - 1]])
        all_items.extend(todo_items)
    return all_items
Author: jumutc, Project: smartchecklist, Lines: 34, Source: calculations.py
Example 10: fractal_representation
def fractal_representation(self):
    punkt_param = PunktParameters()
    for each_paragraph in self.paragraphs:
        buffer_p = paragraph()
        buffer_p.paragraph = each_paragraph
        buffer_p.tokens = nltk.word_tokenize(preprocess(each_paragraph))
        buffer_p.weights['words'] = FreqDist(buffer_p.tokens)
        buffer_p.weights['total'] = {'words': 0, 'sentences': 0}
        punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        sentences = sentence_splitter.tokenize(each_paragraph)
        for each_sentence in sentences:
            self.stotal += 1
            buffer_s = sentence()
            buffer_s.sentence = each_sentence
            buffer_s.tokens = nltk.word_tokenize(preprocess(each_sentence))
            if len(buffer_s.tokens) > 0:
                buffer_s.weights['sentence'] = FreqDist(buffer_s.tokens)
                buffer_s.weights['paragraph'] = self.calculate_relative_frequence(buffer_s.tokens, buffer_p.weights['words'])
                buffer_s.weights['document'] = self.calculate_relative_frequence(buffer_s.tokens, self.fractal.weights)
                buffer_s.weights['total'] = {}
                buffer_s.weights['total']['sentence'] = 1
                buffer_s.weights['total']['paragraph'] = sum(buffer_s.weights['paragraph'].values())
                buffer_s.weights['total']['document'] = sum(buffer_s.weights['document'].values())
                self.s_weight += buffer_s.weights['total']['document']
                buffer_p.weights['total']['sentences'] += buffer_s.weights['total']['document']
            buffer_p.sentences.append(buffer_s)
        buffer_p.weights['total']['words'] = sum(buffer_p.weights['words'].values())
        self.fractal.paragraphs.append(buffer_p)
        self.pindex += 1
Author: yagamiram, Project: NLP_Auto_Summarization, Lines: 30, Source: fractal_template.py
Example 11: sentence_tokenizer
def sentence_tokenizer(self, untokenized_string, language):
    """Reads the .pickle file for the given language."""
    if language == 'greek':
        pickle_path = os.path.expanduser('~/cltk_data/greek/cltk_linguistic_data/tokenizers/sentence/greek.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', ';')
        language_punkt_vars.internal_punctuation = (',', '·')
    elif language == 'latin':
        pickle_path = os.path.expanduser('~/cltk_data/latin/cltk_linguistic_data/tokenizers/sentence/latin.pickle')
        language_punkt_vars = PunktLanguageVars
        language_punkt_vars.sent_end_chars = ('.', '?', ':')
        language_punkt_vars.internal_punctuation = (',', ';')
    else:
        print("No sentence tokenizer for this language available.")
    with open(pickle_path, 'rb') as open_pickle:
        tokenizer = pickle.load(open_pickle)
    tokenizer.INCLUDE_ALL_COLLOCS = True
    tokenizer.INCLUDE_ABBREV_COLLOCS = True
    params = tokenizer.get_params()
    sbd = PunktSentenceTokenizer(params)
    tokenized_sentences = []
    for sentence in sbd.sentences_from_text(untokenized_string,
                                            realign_boundaries=True):
        tokenized_sentences.append(sentence)
    return tokenized_sentences
Author: smargh, Project: cltk, Lines: 26, Source: tokenize_sentences.py
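The pickled models loaded above are themselves produced by training Punkt on a corpus. As a hedged sketch of where such parameters come from (the training text is a placeholder, and the collocation flag mirrors the one set on the loaded tokenizer above), NLTK's PunktTrainer can build them directly:

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

training_text = "..."              # a large plain-text corpus in the target language
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True  # learn collocations as well as abbreviations
trainer.train(training_text)
tokenizer = PunktSentenceTokenizer(trainer.get_params())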
Example 12: _punkt_sent_tokenize
def _punkt_sent_tokenize(text):
    '''
    Sentence segmentation using the NLTK PunktSentenceTokenizer.
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(config.tokenize_abbrev)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
Author: khasathan, Project: nlp, Lines: 8, Source: stringutil.py
Example 13: tokenize
def tokenize(self):
    """
    Returns a list of tokenized sentences.
    """
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(self.text)
    sentences = [sentence.split() for sentence in sentences]
    sentences = [[word.strip(",.?!") for word in sentence] for sentence in sentences]
    return sentences
Author: dkush, Project: bluestocking, Lines: 9, Source: parse.py
Example 14: transform
def transform(self, documents):
    sentence_splitter = PunktSentenceTokenizer()
    for doc in documents:
        if not 'sentences' in doc.ext:
            doc.ext['sentences'] = [s.strip() for s in sentence_splitter.tokenize(doc.text)]
    # for doc in documents:
    #     if not 'sentences' in doc.ext:
    #         doc.ext['sentences'] = [s.strip() for s in doc.text.split('.') if s]
    return documents
Author: tribhuvanesh, Project: nlpfs14, Lines: 9, Source: nlplearn.py
Example 15: parse
def parse(text):
    """Use NLTK's PunktSentenceTokenizer to convert the text string into
    a list of English-language sentences."""
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(ABBREVIATIONS)
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(preprocess(text))
Author: csytan, Project: nastyboys, Lines: 9, Source: get_sentences.py
Example 16: bayesSentiment
def bayesSentiment(self, text):
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    from senti_classifier import senti_classifier
    # break up text into sentences
    stzr = PunktSentenceTokenizer()
    sents = stzr.tokenize(text)
    pos_score, neg_score = senti_classifier.polarity_scores(sents)
    #print pos_score, neg_score
    return [pos_score, neg_score]
Author: vanjos, Project: kairos_sentiment, Lines: 10, Source: simplesentiment.py
Example 17: split_into_sentences
def split_into_sentences(input_file_name, output_file_name):
    tokenizer = PunktSentenceTokenizer()
    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as sentence_file:
            for line in input_file:
                labelled_review = json.loads(line)
                tokenized_text = tokenizer.tokenize(labelled_review['text'])
                json.dump([tokenized_text, labelled_review['score']], sentence_file)
                sentence_file.write("\n")
Author: BKJackson, Project: txtnets, Lines: 10, Source: prepare_amazon_reviews.py
Example 18: analyse_hansard_file
def analyse_hansard_file(filename='House of Representatives_2018_05_10_6091.xml'):
    # Word frequency analysis
    my_abbrev = ['\'m', '.', ',', '\'s', '(', ')', 'n\'t', '\'ve', ';', '$', ':', '\'', '?', '\'ll', '\'re']
    stoplist = set(stopwords.words('english') + my_abbrev)
    soup, sample = parse_hansard(filename)
    # Tokenisation, tagging, chunking
    sent_tokenizer = PunktSentenceTokenizer()
    # Stop breaking sentences at "No."
    sent_tokenizer._params.abbrev_types.add('no')
    #sentences = nltk.sent_tokenize(sample)
    # TODO: improve sentence tokenizer - still far from good
    sentences = sent_tokenizer.tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
    # Word frequency over all sentences
    tokens = []
    for sentence in tokenized_sentences:
        tokens += [word for word in sentence if word.lower() not in stoplist]
    display_freq(tokens)
    # Part-of-speech analysis
    tags = []
    for sentence in tagged_sentences:
        tags += sentence
    pos_analysis(tags, my_abbrev)
    # spaCy NER
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sample)
    # Find named entities, phrases and concepts
    ne_spacy = {}
    for entity in doc.ents:
        if entity.label_ in ne_spacy:
            ne_spacy[entity.label_] += [entity.text]
        else:
            ne_spacy[entity.label_] = [entity.text]
    logger.debug("Entity number per type: %s" % {k: len(v) for k, v in ne_spacy.items()})
    for k in ne_spacy.keys():
        display_freq(ne_spacy[k], 'Named entities (%s)' % (k,), top=20)
    # Interjection analysis
    parties = {}
    all_interjections = soup.find_all('interjection')
    for interjection in all_interjections:
        # Can be either a party or a role (Speaker, President, etc.)
        party = interjection.party.text or interjection.find('name', role='metadata').text
        if party in parties:
            parties[party] = parties[party] + 1
        else:
            parties[party] = 1
    logger.debug("%s interjections: %s" % (len(all_interjections), parties))
Author: hsenot, Project: parliament_of_australia, Lines: 55, Source: utils.py
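To make the "No." workaround above concrete, here is a small hedged illustration (the sample sentence is invented, not taken from the Hansard data): adding 'no' to the tokenizer's abbreviation set keeps "No. 12" inside one sentence, whereas the untouched tokenizer may break after the period.

from nltk.tokenize.punkt import PunktSentenceTokenizer

sample = "Motion No. 12 was agreed to. The House adjourned."
tokenizer = PunktSentenceTokenizer()
print(tokenizer.tokenize(sample))          # may split after "No."
tokenizer._params.abbrev_types.add('no')   # same workaround as in the function above
print(tokenizer.tokenize(sample))          # "No. 12" stays in a single sentence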
Example 19: preprocess_doc
def preprocess_doc(doc):
    sentence_tokenizer = PunktSentenceTokenizer()
    sentences = sentence_tokenizer.sentences_from_text(doc)
    tokens = []
    for sentence in sentences:
        #sentence1 = sentence.split()
        sentence1 = neg_scope(sentence)
        tokens.extend(w for w in sentence1 if w.lower() not in stopwords.words("english"))
    for ii in xrange(len(tokens)):
        if tokens[ii][-1] == '.':
            tokens[ii] = tokens[ii][:-1]
    return tokens
Author: dwins, Project: bluestocking, Lines: 12, Source: ShallowConsistency.py
Example 20: parseTextToSentences
def parseTextToSentences(text):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'ms', 'mrs', 'prof', 'inc', 'no', 'e.g', 'i.e'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    data = text
    data = data.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
    sentences = []
    for para in data.split('\n'):
        if para:
            sentences.extend(sentence_splitter.tokenize(para))
    return sentences
Author: ercgn, Project: 11411-proj, Lines: 12, Source: nltk_helper.py
Note: The nltk.tokenize.punkt.PunktSentenceTokenizer class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors. Refer to each project's License before distributing or reusing the code; do not repost without permission.