This article collects typical usage examples of the Python function nltk.tokenize.word_tokenize. If you have been wondering what word_tokenize does, how to call it, or what it looks like in real code, the curated function examples below should help.
The 20 code examples of word_tokenize shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
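Before the project examples, here is a minimal standalone sketch of the basic call (not taken from any of the projects below). It assumes the NLTK tokenizer models have been downloaded; depending on your NLTK version the resource is named 'punkt' or 'punkt_tab'.

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # tokenizer models; newer NLTK releases may ask for 'punkt_tab' instead

print(word_tokenize("NLTK makes tokenization easy, doesn't it?"))
# ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']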
Example 1: createTrainingVectors

def createTrainingVectors(tokenized_texts_dict):
    """
    Given the filenames and their contents, this method creates the training
    vectors by creating a unique list of all words together in the training
    set
    """
    print("Creating vectors for training data")
    unique_words = []
    for filename, text in tokenized_texts_dict.iteritems():
        # print("Reading {0} and adding to unique word list".format(filename))
        unique_words.extend(word_tokenize(text))
    unique_words = set(unique_words)

    # Creating the initial vector with counts 0 for all training sets
    zero_vector = OrderedDict(zip(unique_words, [0] * len(unique_words)))
    print("Creating the zero vector")

    # For each training file, create an OrderedDict containing its word counts (together with zero counts),
    # and store it in a dict, indexed by its corresponding filename
    vectors = {}
    for filename, token_list in tokenized_texts_dict.iteritems():
        current_vector = zero_vector.copy()
        current_vector.update(Counter(word_tokenize(token_list)))
        vectors[filename] = current_vector

    return vectors, zero_vector

Author: gkeswani92, Project: N-Gram-Language-Modeling, Lines: 28, Source: KNearestNeighbourClassifier.py
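The snippet above is Python 2 code (dict.iteritems()). As a rough Python 3 sketch of the same bag-of-words pattern, with hypothetical input texts and the same OrderedDict/Counter combination:

from collections import Counter, OrderedDict
from nltk.tokenize import word_tokenize

texts = {"a.txt": "the cat sat", "b.txt": "the dog sat down"}   # hypothetical training data
vocab = sorted({w for text in texts.values() for w in word_tokenize(text)})
zero_vector = OrderedDict((w, 0) for w in vocab)

vectors = {}
for filename, text in texts.items():            # .items() replaces Python 2's .iteritems()
    vec = zero_vector.copy()
    vec.update(Counter(word_tokenize(text)))    # fill in this file's token counts
    vectors[filename] = vec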
Example 2: max_similarity

def max_similarity(context_sentence, ambiguous_word, option="path",
                   lemma=True, context_is_lemmatized=False, pos=None, best=True):
    """
    Perform WSD by maximizing the sum of maximum similarity between possible
    synsets of all words in the context sentence and the possible synsets of the
    ambiguous words (see http://goo.gl/XMq2BI):
    {argmax}_{synset(a)}(\sum_{i}^{n}{{max}_{synset(i)}(sim(i,a))})
    """
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    if context_is_lemmatized:
        context_sentence = word_tokenize(context_sentence)
    else:
        context_sentence = [lemmatize(w) for w in word_tokenize(context_sentence)]
    result = {}
    for i in wn.synsets(ambiguous_word):
        try:
            if pos and pos != str(i.pos()):
                continue
        except:
            if pos and pos != str(i.pos):
                continue
        result[i] = sum(max([sim(i, k, option) for k in wn.synsets(j)] + [0])
                        for j in context_sentence)
    if option in ["res", "resnik"]:  # lower score = more similar
        result = sorted([(v, k) for k, v in result.items()])
    else:  # higher score = more similar
        result = sorted([(v, k) for k, v in result.items()], reverse=True)
    ##print result
    if best: return result[0][1]
    return result

Author: ChenglongChen, Project: pywsd, Lines: 34, Source: similarity.py
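lemmatize() and sim() above are pywsd helpers that are not shown here. As a minimal, self-contained sketch of the underlying idea using only NLTK's WordNet interface (the words and synset choices are illustrative; the wordnet corpus must be downloaded first):

from nltk.corpus import wordnet as wn   # requires nltk.download('wordnet')

# pick one representative synset per context word, then choose the sense of
# 'bank' whose summed path similarity to the context is highest
context_synsets = [wn.synsets(w)[0] for w in ['money', 'deposit'] if wn.synsets(w)]
best_sense = max(
    wn.synsets('bank'),
    key=lambda s: sum((s.path_similarity(c) or 0) for c in context_synsets)
)
print(best_sense, best_sense.definition())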
Example 3: main

def main():
    # Load up txt files
    speech_file = open('trump-speeches/speeches.txt').read()
    tweets = json.load(open('trump_tweets.json'))
    tweet_list = []
    for tweet in tweets:
        tweet_list.append(tweet['text'])
    tweet_list = ' '.join(tweet_list)

    # Tokenize
    logging.info('Formatting training text')
    speech_token = word_tokenize(speech_file)
    tweet_token = word_tokenize(tweet_list)

    # Train trigram models
    logging.info('Setting up models')
    speech_gram, speech_format = ngram(speech_token, 3)
    tweet_gram, tweet_format = ngram(tweet_token, 3)

    # Generate responses
    cont = True
    while cont:
        response = input("Hello sir, what can I Trumpinate for you?: ")
        num_words = input("And how many words should I write?: ")
        # Print Phrases
        gen_phrase(speech_gram, int(num_words), starter_word=[response])
        print('')
        gen_phrase(tweet_gram, int(num_words), starter_word=[response])
        more = input("Would you like to generate more? (Yes, No): ")
        if more != 'Yes':
            cont = False

Author: bhagerman00, Project: bh_lant, Lines: 32, Source: trumpinator.py
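ngram() and gen_phrase() are helpers from this project and are not shown. For reference, NLTK itself can produce the trigrams such a model is trained on; a small sketch with a made-up sentence:

from nltk import trigrams
from nltk.tokenize import word_tokenize

tokens = word_tokenize("make the model generate text again")
print(list(trigrams(tokens)))
# [('make', 'the', 'model'), ('the', 'model', 'generate'),
#  ('model', 'generate', 'text'), ('generate', 'text', 'again')]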
Example 4: getBigramBeginWithNotCount

def getBigramBeginWithNotCount(sent):
    negative_keywords = ["bad", "sad", "don't", "could not", "crappy", "unfortunately", "remove", "why", "poor",
                         "bothersome", "terrible", "although", "complaints", "outrageous", "isn't", "poorly",
                         "drawback", "annoying", "against", "irritating", "wouldn't", "won't", "wasn't", "couldn't",
                         "awful", "didn't", "hasn't", "difficult", "hate", "incorrect", "junk", "trash", "removed",
                         "complain", "complained", "hated", "negative"]
    bigramPostiveCount = 0
    '''
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    for bigram in nltk.bigrams(word_tokenize(sent)):
        if bigram[0].lower() == "not" and bigram[1].lower() in negative_keywords:
            print sent
            print bigram
            print unigram_tagger.tag(word_tokenize(sent))
            bigramNotCount += 1
    '''
    for i, word in enumerate(word_tokenize(sent)):
        if word.lower() == "not":
            if word_tokenize(sent)[i + 1] in negative_keywords:  # e.g. NOT bad
                bigramPostiveCount += 1
            elif i < len(word_tokenize(sent)) - 2 and word_tokenize(sent)[i + 2] in negative_keywords:  # e.g. NOT too bad
                bigramPostiveCount += 1
            else:  # e.g. NOT good
                bigramPostiveCount -= 1
    return bigramPostiveCount

Author: seekshreyas, Project: nlp-reviews-classifier, Lines: 29, Source: extractor.py
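A lighter way to scan for the same "not + negative word" pattern is NLTK's bigrams helper, as in the commented-out block above; a small sketch with an illustrative sentence and a shortened keyword set:

import nltk
from nltk.tokenize import word_tokenize

negative_keywords = {"bad", "awful", "terrible"}
sent = "the screen is not bad at all"
hits = [b for b in nltk.bigrams(word_tokenize(sent))
        if b[0].lower() == "not" and b[1].lower() in negative_keywords]
print(hits)   # [('not', 'bad')]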
Example 5: test

def test(testAccents, testNoAccents, dictnoAccents):
    count = 0
    correct = 0
    notWord = []
    result = []
    incorrect = {}
    wordCount = 0
    nonWordCount = 0
    for i in range(len(testAccents)):
        sent = ""
        sentenceAccents = testAccents[i]
        sentenceNoAccents = testNoAccents[i]
        tokensAccents = word_tokenize(sentenceAccents)
        tokensNoAccents = word_tokenize(sentenceNoAccents)
        if len(tokensAccents) == len(tokensNoAccents):
            for j in range(len(tokensAccents)):
                tA = tokensAccents[j]
                tNA = tokensNoAccents[j]
                if tNA not in punctuation and not tNA.isdigit():
                    wordCount += 1
                    if tNA in dictnoAccents.keys():
                        newToken = max(dictnoAccents[tNA], key=dictnoAccents[tNA].get)
                        # print(newToken)
                        # print("YES")
                    else:
                        newToken = tNA
                    if newToken == tA:
                        correct += 1
                    else:
                        incorrect[newToken] = tA
                        # print(newToken)
                        # print(tA)
                    count += 1
                    # print("HI")
                    if j != 0:
                        newToken = " " + newToken
                else:
                    nonWordCount += 1
                    notWord.append(tNA)
                    newToken = tNA
                sent = sent + newToken
            result.append(sent)
    print("Le nombre de mot dans le corpus: " + str(wordCount))
    print("Le nombre de ponctuation et de nombres dans le corpus: " + str(nonWordCount))
    print("Nombre au total de changements/non changements possibles " + str(count))
    print("Nombre au total de decisions correctes " + str(correct))
    print("Accuracy: " + str(correct/count))
    return([incorrect, correct/count, wordCount, nonWordCount])

Author: Alex-Fabbri, Project: DiacriticRestoration, Lines: 60, Source: accents.py
Example 6: load_data

def load_data(loc='./data/'):
    """
    Load MSRP dataset
    """
    trainloc = os.path.join(loc, 'msr_paraphrase_train.txt')
    testloc = os.path.join(loc, 'msr_paraphrase_test.txt')

    trainA, trainB, testA, testB = [], [], [], []
    trainS, devS, testS = [], [], []

    f = open(trainloc, 'rb')
    for line in f:
        text = line.strip().split('\t')
        trainA.append(' '.join(word_tokenize(text[3])))
        trainB.append(' '.join(word_tokenize(text[4])))
        trainS.append(text[0])
    f.close()
    f = open(testloc, 'rb')
    for line in f:
        text = line.strip().split('\t')
        testA.append(' '.join(word_tokenize(text[3])))
        testB.append(' '.join(word_tokenize(text[4])))
        testS.append(text[0])
    f.close()

    trainS = [int(s) for s in trainS[1:]]
    testS = [int(s) for s in testS[1:]]

    return [trainA[1:], trainB[1:]], [testA[1:], testB[1:]], [trainS, testS]

Author: 2020zyc, Project: nlg-eval, Lines: 29, Source: eval_msrp.py
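One detail worth noting: joining word_tokenize output back with ' '.join(), as above, yields text with spaces around punctuation, a common normalization step for paraphrase corpora. A tiny illustration:

from nltk.tokenize import word_tokenize

print(' '.join(word_tokenize("A sentence, with punctuation.")))
# A sentence , with punctuation .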
Example 7: tokenize

def tokenize(s, stem=True, digit=False, stop=True, use_re=False):
    """
    :type s: str
    :type stem: bool
    :type use_re: bool
    :rtype: set(str)
    """
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    table = string.maketrans("", "")

    if use_re:
        s = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', s)

    if digit:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation + string.digits)))
    else:
        tokens = set(word_tokenize(unify_units(s).translate(table, string.punctuation)))

    if stop:
        tokens = set(word for word in tokens if word not in stop_words)
    if stem:
        tokens = set(stemmer.stem(word) for word in tokens)

    return tokens

Author: lingcheng99, Project: search-term-relevance-home-depot, Lines: 27, Source: preprocess.py
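string.maketrans("", "") and the two-argument str.translate() used above are Python 2 APIs, and unify_units() is a project helper that is not shown. A rough Python 3 equivalent of the punctuation/digit stripping step, under those assumptions:

import string
from nltk.tokenize import word_tokenize

table = str.maketrans('', '', string.punctuation + string.digits)
tokens = set(word_tokenize("3 dogs, 2 cats!".translate(table)))
print(tokens)   # {'dogs', 'cats'}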
Example 8: clean_raw_txt

def clean_raw_txt(body, headline, punct_dct=None, stopwrds_set=None):
    """Clean the body and headline to remove punctuation, stopwords, etc.

    Args:
    ----
        body: str
        headline: str
        punct_dct (optional): dict
            Translation dict resulting from a `str.maketrans()` call
        stopwrds_set (optional): set

    Return:
    ------
        (body, headline): tuple
    """
    if punct_dct:
        body = body.translate(punct_dct)
        headline = headline.translate(punct_dct)

    body_wrds = word_tokenize(body)
    headline_wrds = word_tokenize(headline)

    stopwrds_set = set() if stopwrds_set is None else stopwrds_set
    body_wrds = [wrd.lower() for wrd in body_wrds if wrd.lower() not in stopwrds_set]
    headline_wrds = [wrd.lower() for wrd in headline_wrds if wrd.lower() not in stopwrds_set]

    return (body_wrds, headline_wrds)

Author: sallamander, Project: headline-generation, Lines: 29, Source: twenty_news_gen.py
Example 9: obtaindata

def obtaindata(pos_file, neg_file):
    ## read the input files
    short_pos = open(pos_file, "r").read()
    short_neg = open(neg_file, "r").read()

    documents = []  # documents is gonna be a list of tuples that have a line of review and a class (pos or neg)
    for r in short_pos.split('\n'):
        documents.append((r, "pos"))
    for r in short_neg.split('\n'):
        documents.append((r, "neg"))

    all_words = []  # gonna contain all the words in both corpuses combined (nonunique)
    short_pos_words = word_tokenize(short_pos)
    short_neg_words = word_tokenize(short_neg)
    for w in short_pos_words:
        all_words.append(w.lower())
    for w in short_neg_words:
        all_words.append(w.lower())

    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:5000]  # gets the top 5000 most common words to use as features
    featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
    random.shuffle(featuresets)
    return featuresets

Author: akshaynavada, Project: NLP, Lines: 27, Source: sentimentPractice.py
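find_features() is a helper defined elsewhere in the same script. Note also that in NLTK 3, where FreqDist is a Counter subclass, list(all_words.keys())[:5000] is not actually frequency-ordered; most_common() is the explicit way to get the top words. A small sketch with a made-up string:

import nltk
from nltk.tokenize import word_tokenize

words = [w.lower() for w in word_tokenize("Good movie. GOOD plot. Bad ending.")]
fdist = nltk.FreqDist(words)
print(fdist.most_common(2))   # [('.', 3), ('good', 2)]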
Example 10: load_samples

def load_samples(question, prop_labels):
    samples = []
    q = word_tokenize(question)
    for label in prop_labels:
        text = word_tokenize(label.lower())
        samples.append({'qtext': ' '.join(q), 'label': 0, 'atext': ' '.join(text)})
    return samples

Author: BenjaminHess, Project: dataset-sts, Lines: 7, Source: scoring-api.py
Example 11: _doc2vec_doc_stream

def _doc2vec_doc_stream(paths, n, sentences=True):
    """
    Generator to feed sentences to the doc2vec model.
    """
    phrases = Bigram()

    i = 0
    p = Progress()
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                i += 1
                p.print_progress(i/n)

                # We do minimal pre-processing here so the model can learn
                # punctuation
                line = line.lower()

                if sentences:
                    for sent in sent_tokenize(line):
                        tokens = word_tokenize(sent)
                        yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])
                else:
                    tokens = word_tokenize(line)
                    yield LabeledSentence(phrases[tokens], ['SENT_{}'.format(i)])

Author: frnsys, Project: factory, Lines: 25, Source: doc2vec.py
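Bigram(), Progress() and LabeledSentence come from the surrounding project and from gensim (newer gensim versions use TaggedDocument in place of LabeledSentence). The core NLTK part is the sentence/word tokenization nesting, sketched here with a made-up line:

from nltk.tokenize import sent_tokenize, word_tokenize

line = "first sentence. second one!"
print([word_tokenize(s) for s in sent_tokenize(line)])
# [['first', 'sentence', '.'], ['second', 'one', '!']]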
Example 12: load_sick2014

def load_sick2014(dsfile, mode='relatedness'):
    """ load a dataset in the sick2014 tsv .txt format;
    mode='relatedness': use the sts relatedness score as label
    mode='entailment': use -1 (contr.), 0 (neutral), 1 (ent.) as label """
    s0 = []
    s1 = []
    labels = []
    with open(dsfile) as f:
        first = True
        for line in f:
            if first:
                # skip first line with header
                first = False
                continue
            line = line.rstrip()
            pair_ID, sentence_A, sentence_B, relatedness_score, entailment_judgement = line.split('\t')
            if mode == 'relatedness':
                label = float(relatedness_score)
            elif mode == 'entailment':
                if entailment_judgement == 'CONTRADICTION':
                    label = -1
                elif entailment_judgement == 'NEUTRAL':
                    label = 0
                elif entailment_judgement == 'ENTAILMENT':
                    label = +1
                else:
                    raise ValueError('invalid label on line: %s' % (line,))
            else:
                raise ValueError('invalid mode: %s' % (mode,))
            labels.append(label)
            s0.append(word_tokenize(sentence_A))
            s1.append(word_tokenize(sentence_B))
    return (s0, s1, np.array(labels))

Author: quinsulon, Project: dataset-sts, Lines: 34, Source: loader.py
Example 13: load_anssel

def load_anssel(dsfile, subsample0=3):
    """ load a dataset in the anssel csv format;
    subsample0=N denotes that only every N-th 0-labelled sample
    should be loaded; so e.g. N=3 reduces 80k negatives to 28k
    negatives in the training set (vs. 4k positives); N=10k
    gets you just 8k negatives, etc. """
    s0 = []
    s1 = []
    labels = []
    i = 0
    with open(dsfile) as f:
        c = csv.DictReader(f)
        for l in c:
            label = int(l['label'])
            if label == 0 and (i % subsample0) != 0:
                i += 1
                continue
            labels.append(label)
            try:
                qtext = l['qtext'].decode('utf8')
                atext = l['atext'].decode('utf8')
            except AttributeError:  # python3 has no .decode()
                qtext = l['qtext']
                atext = l['atext']
            s0.append(word_tokenize(qtext))
            s1.append(word_tokenize(atext))
            i += 1
    return (s0, s1, np.array(labels))

Author: quinsulon, Project: dataset-sts, Lines: 29, Source: loader.py
Example 14: testing

def testing():
    # - tokenize on sentence and word
    ex_txt = "hello there Mr. Bartuska, How are you? The weather is great and I enjoy Python. cheers!"
    print(sent_tokenize(ex_txt))
    print(word_tokenize(ex_txt, language='english'))

    # - stop words (pre-defined by nltk)
    stop_words = set(stopwords.words('english'))
    print(stop_words)
    words = word_tokenize(ex_txt)
    print(words)
    filtered_sent = []
    for w in words:
        if w not in stop_words:
            filtered_sent.append(w)
    print(filtered_sent)
    filtered_sent = [w for w in words if not w in stop_words]
    print(filtered_sent)

    # - stemming
    ps = PorterStemmer()
    example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
    # for w in example_words:
    #     print(ps.stem(w))
    new_text = "it is very important to be pothonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
    words = word_tokenize(new_text)
    for w in words:
        print(ps.stem(w))

Author: gbartusk, Project: coursera_data_science_capstone, Lines: 28, Source: capstone.py
Example 15: __init__

def __init__(self, txt_type: str, txt: str):
    self.txt_type = txt_type
    if txt_type == "paragraph":  # compare strings with ==, not the identity operator 'is'
        self.sentences = [word_tokenize(w) for w in sent_tokenize(txt)]
    else:
        self.title = word_tokenize(txt)

Author: NGrech, Project: FYP, Lines: 7, Source: indexers.py
Example 16: get_doc_abstract_query_List

def get_doc_abstract_query_List(norm):
    ranked_top_10_doc_list = map(operator.itemgetter(0), ranked_scores_top_10)
    result_query = ""
    count = 0
    synonym_words_list = []

    for docID in ranked_top_10_doc_list:
        if dir_of_docs.endswith("/"):
            docID_file_dir = dir_of_docs + docID + ".xml"
        else:
            docID_file_dir = dir_of_docs + "/" + docID + ".xml"
        xml_doc = Document(docID, docID_file_dir)
        title = xml_doc.get_title()
        result_query += title + " "

        """
        if count < 1:  # Only get abstract from top document(s)
            result_query += xml_doc.get_abstract() + " "
        """

        # Adds synonyms for the top ranked document's title to new query
        if count <= 10:
            title_words = word_tokenize(title)
            for w in title_words:
                synonym_words_list = norm.combine_list(synonym_words_list, norm.get_synonym_list(w))
        count += 1

    result_query_list = word_tokenize(result_query)
    result_query_list = norm.combine_list(result_query_list, synonym_words_list)
    normalized = norm.normalize_tokens(result_query_list)
    return normalized

Author: NatashaKSS, Project: BeefDumplings, Lines: 32, Source: search.py
Example 17: load_ace_file

def load_ace_file(textfile, fmt):
    print ' - %s' % os.path.split(textfile)[1]
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    text = open(textfile).read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' ' * (m.end() - m.start() - 6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s, e, typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = Tree('S', [])
        for (s, e, typ) in sorted(entities):
            if s < i: s = i  # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree('NE', text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = Tree('S', [])
        for (s, e, typ) in sorted(entities):
            if s < i: s = i  # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')

Author: approximatelylinear, Project: nltk, Lines: 60, Source: named_entity.py
Example 18: calculate_pmi_use_case2

def calculate_pmi_use_case2(self, schema):
    print("Calculating PMI for " + schema)
    corpus_count = 0
    text = []
    for item in self.__mongo_db.get(schema, {}):
        text += word_tokenize(item['text'], language='german')
        corpus_count += len(word_tokenize(item['text'], language='german'))
    print(corpus_count)
    counter = Counter(text)
    single_pattern_table = self.__postgre_db.get_data_from_table(schema, "bscale_single_pattern")

    # counting single pattern occurrences
    for item in single_pattern_table:
        word = item['single_pattern']
        count = counter[word]
        self.__postgre_db.update(schema, "bscale_single_pattern", "count=" + str(count), "single_pattern=" + add_quotes(word))

    # pmi calculation
    co_occ_table = self.__postgre_db.get_data_from_table(schema, "correlating_pattern")
    for item in co_occ_table:
        item_id = item['id']
        co_occ_freq = float(item['count'] / corpus_count)
        word1_id = item['pattern_a']
        word2_id = item['pattern_b']
        word1_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word1_id), "count")
        print(word1_occ)
        word2_occ = self.__postgre_db.get(schema, "bscale_single_pattern", "id=" + str(word2_id), "count")
        print(word2_occ)
        pmi = log2(co_occ_freq / (float(word1_occ / corpus_count) * float(word2_occ / corpus_count)))
        print(pmi)
        self.__postgre_db.update(schema, "correlating_pattern", "pmi=" + str(pmi), "id=" + str(item_id))

Author: mdth, Project: Masterarbeit, Lines: 30, Source: Prototype.py
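The last loop implements pointwise mutual information from raw counts: PMI(a, b) = log2( P(a, b) / (P(a) * P(b)) ), with each probability estimated as count / corpus_count. A stripped-down sketch of just that calculation, with made-up counts:

from math import log2

def pmi(co_count, count_a, count_b, corpus_count):
    p_ab = co_count / corpus_count      # joint probability estimate
    p_a = count_a / corpus_count
    p_b = count_b / corpus_count
    return log2(p_ab / (p_a * p_b))

print(pmi(co_count=20, count_a=100, count_b=50, corpus_count=10000))   # ≈ 5.32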
Example 19: write_anotations_to_file

def write_anotations_to_file(lst_annotation, file_name):
    with codecs.open(file_name, 'w', 'utf-8') as f:
        for annotation in lst_annotation:
            annotation_full_text = annotation.text
            car_name = preprocessor_text(annotation.name)
            annotation_start = annotation_full_text.find(car_name)
            annotation_end = annotation.start + len(car_name)

            full_text_before_annotation = preprocessor_text(annotation_full_text[:annotation_start].strip())
            before_tokens = word_tokenize(full_text_before_annotation)
            for token in before_tokens:
                f.write(token + u' ' + u'O' + u'\n')

            annotation_tokens = word_tokenize(car_name)
            for idx, token in enumerate(annotation_tokens):
                if idx == 0:
                    label = u'B'
                else:
                    label = u'I'
                f.write(token + u' ' + label + u'\n')

            full_text_after_annotation = preprocessor_text(annotation_full_text[annotation_end:]).strip()
            after_tokens = word_tokenize(full_text_after_annotation)
            for token in after_tokens:
                f.write(token + u' ' + u'O' + '\n')

            f.write(u'\n')

Author: EgorLakomkin, Project: clearspending, Lines: 33, Source: conll.py
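The function writes CoNLL-style BIO labels: tokens outside the annotation get O, the first token of the annotated name gets B, and the remaining ones I. A tiny sketch of the labelling rule with a hypothetical entity string (preprocessor_text is a project helper and is omitted):

from nltk.tokenize import word_tokenize

entity_tokens = word_tokenize("Ford Focus")
labels = ['B'] + ['I'] * (len(entity_tokens) - 1)
print(list(zip(entity_tokens, labels)))   # [('Ford', 'B'), ('Focus', 'I')]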
Example 20: sentence_matches

def sentence_matches(self, sentence_text):
    """Returns true iff the sentence contains this mention's upstream
    and downstream participants, and if one of the stemmed verbs in
    the sentence is the same as the stemmed action type."""
    has_upstream = False
    has_downstream = False
    has_verb = False

    # Get the first word of the action type and assume this is the verb
    # (Ex. get depends for depends on)
    actiontype_words = word_tokenize(self.mention.actiontype)
    actiontype_verb_stemmed = stem(actiontype_words[0])

    words = word_tokenize(sentence_text)

    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.upstream.lower()):
        has_upstream = True
    if self.string_matches_sans_whitespace(sentence_text.lower(),
                                           self.mention.downstream.lower()):
        has_downstream = True
    for word in words:
        if actiontype_verb_stemmed == stem(word):
            has_verb = True

    return has_upstream and has_downstream and has_verb

Author: johnbachman, Project: indra, Lines: 28, Source: find_full_text_sentence.py
Note: The nltk.tokenize.word_tokenize examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets are selected from open-source projects contributed by various developers; copyright remains with the original authors, and redistribution or use should follow each project's license. Do not reproduce without permission.