This article collects typical usage examples of the Python function nltk.word_tokenize. If you have been wondering what exactly word_tokenize does, how to call it, or how it is used in real projects, the curated examples below should help.
Twenty code examples of word_tokenize are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
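Before turning to the examples, here is a minimal usage sketch (assuming NLTK is installed and the punkt tokenizer models have been downloaded; the sample sentence and its output are illustrative only):

import nltk

nltk.download('punkt')  # one-time download of the tokenizer models

text = "NLTK makes tokenization easy. Isn't it?"
tokens = nltk.word_tokenize(text)
print(tokens)
# expected output along the lines of:
# ['NLTK', 'makes', 'tokenization', 'easy', '.', 'Is', "n't", 'it', '?']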
Example 1: load_file_without_frequency
def load_file_without_frequency(self, positif, negatif):
    tab = []
    maxs = self.nbFeatures
    phrases = []
    y = []
    with codecs.open(positif, "r", encoding='latin-1') as my_file:
        for line in my_file:
            line = line.strip().lower()  # remove the \n
            phrases.append(line)
            y.append(1)
            for mot in word_tokenize(line):
                tab.append(mot)
    with codecs.open(negatif, "r", encoding='latin-1') as my_file:
        for line in my_file:
            line = line.strip().lower()  # remove the \n
            phrases.append(line)
            y.append(0)
            for mot in word_tokenize(line):
                tab.append(mot)
    word_fd = FreqDist(tab)
    print(word_fd)
    for i in range(len(phrases)):
        mots = word_tokenize(phrases[i])
        tmp = []
        for element in mots:
            tmp.append(word_fd[element])
        if len(tmp) < maxs:
            for j in range(maxs - len(tmp)):
                tmp.append(0)
        elif len(tmp) > maxs:
            tmp = tmp[:maxs]
        phrases[i] = tmp
    return (np.array(phrases), np.array(list(set(tab))), np.array(y))
Developer: Nicolas99-9, Project: TERApprentissage, Lines: 33, Source: neural.py
Example 2: __init__
def __init__(self, title, full_text, sentence):
    self.title = title
    self.sentence = sentence
    # map of word -> number of times it appears in the full article text
    self.full_text_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(full_text))
    # map of word -> number of times it appears in the given sentence
    self.sentence_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(sentence))
Developer: jeevnayak, Project: gapfill, Lines: 7, Source: keyword_chooser.py
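As a quick aside (a sketch, not part of the project above), the FreqDist objects built this way behave like dictionaries keyed by token, so the per-word counts can be read back directly:

import nltk

freqs = nltk.FreqDist(w.lower() for w in nltk.word_tokenize("The cat sat on the mat."))
print(freqs['the'])          # 2 -- 'The' and 'the' are merged by lower()
print(freqs.most_common(3))  # the three most frequent tokens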
Example 3: vectorize
def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        [(instance_id, left_context, head, right_context, sense_id), ...]
    :param s: list of words (features) for a given lexelt: [w1, w2, w3, ...]
    :return: vectors: a dictionary with the following structure
                 { instance_id: [w_1 count, w_2 count, ...], ... }
             labels: a dictionary with the following structure
                 { instance_id: sense_id }
    '''
    vectors = {}
    labels = {}
    for (instance_id, left_context, head, right_context, sense_id) in data:
        labels[instance_id] = sense_id
        left_tokens = nltk.word_tokenize(left_context)
        right_tokens = nltk.word_tokenize(right_context)
        words = k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size)
        vectors[instance_id] = frequency_vector_from_near_words(s, words)
    return vectors, labels
Developer: williamFalcon, Project: NLP_HW3, Lines: 26, Source: A.py
Example 4: colocation
def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    # going forward
    forward = context[:pos]
    f = forward[(-windowSize/2):]
    # going backward
    backward = context[pos+1:]
    b = backward[:windowSize/2]
    for item in f:
        key = "pre" + str(len(f) - f.index(item)) + "-word"
        value = item
        dictionary[key] = value
        key = "pre" + str(len(f) - f.index(item)) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    for item in b:
        key = "fol" + str(b.index(item) + 1) + "-word"
        value = item
        dictionary[key] = value
        key = "fol" + str(b.index(item) + 1) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    return dictionary
Developer: ansuabraham, Project: cs4740_3, Lines: 26, Source: colocation.py
Example 5: __tokenize
def __tokenize(self, utter, semantic_tagged=None):
    result = None
    if semantic_tagged is None:
        result = [(word, None) for word in nltk.word_tokenize(utter)]
    else:
        parser_raw = SemanticTagParser(False)
        parser_tagged = SemanticTagParser(False)
        segmented = ' '.join(nltk.word_tokenize(utter))
        tagged = ' '.join(semantic_tagged)
        parser_raw.feed(segmented)
        parser_tagged.feed(tagged)
        raw_chr_seq = parser_raw.get_chr_seq()
        raw_space_seq = parser_raw.get_chr_space_seq()
        tagged_chr_seq = parser_tagged.get_chr_seq()
        tagged_space_seq = parser_tagged.get_chr_space_seq()
        if raw_chr_seq == tagged_chr_seq:
            merged_space_seq = [
                x or y for x, y in zip(raw_space_seq, tagged_space_seq)]
            word_seq = parser_tagged.tokenize(merged_space_seq)
            tag_seq = parser_tagged.get_word_tag_seq()
            result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]
    return result
Developer: ishalyminov, Project: dstc5, Lines: 30, Source: baseline_slu.py
Example 6: reading_level
def reading_level(full_text):
    # Clean the full_text
    full_text_clean = ""
    for char in full_text:
        if char == ".":
            full_text_clean += ". "
        else:
            full_text_clean += char
    # Language features
    import nltk
    words = nltk.word_tokenize(full_text_clean)
    n_sents = len(nltk.sent_tokenize(full_text_clean))
    n_words = len(nltk.word_tokenize(full_text_clean))
    # Count the syllables
    n_syll = 0
    for word in words:
        n_syll += syllable_count(word)
    # Calculate the reading level
    # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    grade_level = -15.59 + 0.39*(n_words/n_sents) + 11.8*(n_syll/n_words)
    return round(grade_level, 1)
Developer: ECohen16, Project: rapid_reader, Lines: 26, Source: views.py
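The grade level computed above is the Flesch-Kincaid formula: grade = -15.59 + 0.39 * (words/sentences) + 11.8 * (syllables/words). The project's syllable_count helper is not shown, so the standalone sketch below substitutes a naive vowel-group counter for it (an assumption), and counts only alphabetic tokens, which differs slightly from the code above:

import re
import nltk

def naive_syllable_count(word):
    # rough approximation: each run of vowels counts as one syllable
    return max(1, len(re.findall(r'[aeiouy]+', word.lower())))

def flesch_kincaid_grade(text):
    sents = nltk.sent_tokenize(text)
    words = [w for w in nltk.word_tokenize(text) if w.isalpha()]
    n_syll = sum(naive_syllable_count(w) for w in words)
    return -15.59 + 0.39 * (len(words) / len(sents)) + 11.8 * (n_syll / len(words))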
Example 7: update
def update(self, other):
    """Adds counts for elements in other"""
    if isinstance(other, self.__class__):
        self.n_sents += other.n_sents
        for x, n in other.items():
            self[x] += n
    else:
        for sent in other:
            self.n_sents += 1
            # import pdb; pdb.set_trace()
            if self.poscache is not None:
                if sent in self.poscache:
                    tags = self.poscache[sent]
                else:
                    self.poscache[sent] = tags = nltk.pos_tag(
                        nltk.word_tokenize(sent))
            else:
                tags = nltk.pos_tag(nltk.word_tokenize(sent))
            for x in tags:
                tok, tag = x
                self[tag] += 1
    if self.normalize:
        for x, n in self.items():
            self[x] /= float(self.n_sents)
Developer: Axighi, Project: Scripts, Lines: 27, Source: PosTagFreqVectorizer.py
Example 8: main
def main(question, article):
    ddict = {}
    counts = get_counts()
    for tok in nltk.word_tokenize(article):
        ddict[tok] = ddict.get(tok, 0) + 1
    vec = []
    for tok in nltk.word_tokenize(question):
        # count in article
        tf = ddict.get(tok, 0)
        # total articles is 108 / number that have current token
        idf = math.log(float(108)/len(filter(lambda x: tok in x.keys(), counts)) + 1)
        vec.append(tf*idf)
    largest = max(vec)
    normalized = map(lambda y: y/largest, vec)
    finDic = {}
    for i, word in enumerate(nltk.word_tokenize(question)):
        finDic[word] = normalized[i]
    print finDic
    return finDic
Developer: NLP-Project, Project: NLP-project, Lines: 25, Source: tdIDF.py
Example 9: next_note
def next_note(tokenizer):
    print 'SemEval data'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        with open(semeval_file, 'r') as f:
            st = []
            for line in f:
                st += [line.strip()]
            text = read_visit_sem(st)
            text = tokenizer.tokenize(text)
            for sent in text:
                yield nltk.word_tokenize(sent.lower())
    print 'MIMIC data'
    for notes_file in subset(notes_files, 15):  # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass
Developer: ankitkv, Project: MIMICTools, Lines: 33, Source: PhraseDetect.py
Example 10: PushDataPair
def PushDataPair(data, database):
    last = len(database['Q'].keys())
    for pair in data:
        database['Q'][last] = nltk.word_tokenize(pair['question'])
        database['A'][last] = nltk.word_tokenize(pair['answer'])
        last += 1
    return database
Developer: echoyuzhou, Project: ticktock_text_api, Lines: 7, Source: Loader.py
Example 11: build_s
def build_s(data):
    '''
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
            lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
            ...
        }
    :return: dict s with the following structure:
        {
            lexelt: [w1, w2, w3, ...],
            ...
        }
    '''
    s = {}
    # implement your code here
    for key, value in data.items():
        for i in value:
            tokens_left = nltk.word_tokenize(i[1])
            tokens_right = nltk.word_tokenize(i[3])
            left = [w for w in tokens_left if w not in string.punctuation][-window_size:]
            right = [w for w in tokens_right if w not in string.punctuation][:window_size]
            context = left + right
            if key not in s:
                s[key] = []
            for word in context:
                if word not in s[key]:
                    s[key].append(word)
    return s
Developer: jubimishra, Project: Natural-Language-Processing, Lines: 32, Source: A.py
Example 12: paragraph_features
def paragraph_features(paragraph_sents):
    global count
    count += 1
    print '\r', count,
    if FEATURE == FEAT_CONTAINS:
        paragraph_words = set(
            sents_to_words(paragraph_sents)
        )
    elif FEATURE == FEAT_LINKED_TITLES:
        paragraph_words = ' '.join(paragraph_sents)
    elif FEATURE == FEAT_FIRST_SENT:
        paragraph_words = nltk.word_tokenize(
            paragraph_sents[0]
        )
    elif FEATURE == FEAT_BEGIN_SENT:
        paragraph_words = {
            nltk.word_tokenize(sent)[0]
            for sent in paragraph_sents
        }
    else:
        paragraph_words = None
        print 'FEATURE NOT SUPPORTED'
        exit()
    features = dict()
    for word in word_features:
        features[word_features[word]] = (
            word in paragraph_words
        )
    return features
Developer: mikeholler, Project: thesis-undergrad, Lines: 32, Source: classifier.py
Example 13: synsym
def synsym(s1, s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adjectives
    jj0 = [x for x, y in ts0 if y == 'JJ' or y == 'JJR' or y == 'JJS']
    jj1 = [x for x, y in ts1 if y == 'JJ' or y == 'JJR' or y == 'JJS']
    if len(jj0) == 0 or len(jj1) == 0:
        jjps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        jjps = np.inner(v1, v2)/(LA.norm(v1)*LA.norm(v2))
    # nouns
    jj0 = [x for x, y in ts0 if y == 'NN' or y == 'NNS' or y == 'NNP' or y == 'NNPS' or y == 'DT']
    jj1 = [x for x, y in ts1 if y == 'NN' or y == 'NNS' or y == 'NNP' or y == 'NNPS' or y == 'DT']
    if len(jj0) == 0 or len(jj1) == 0:
        nps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        nps = np.inner(v1, v2)/(LA.norm(v1)*LA.norm(v2))
    # verbs
    jj0 = [x for x, y in ts0 if y == 'VB' or y == 'VBD' or y == 'VBG' or y == 'VBN' or y == 'VBP' or y == 'VBZ']
    jj1 = [x for x, y in ts1 if y == 'VB' or y == 'VBD' or y == 'VBG' or y == 'VBN' or y == 'VBP' or y == 'VBZ']
    if len(jj0) == 0 or len(jj1) == 0:
        vps = 0
    else:
        v1 = makeFeatureVec(jj0, model, 300)
        v2 = makeFeatureVec(jj1, model, 300)
        vps = np.inner(v1, v2)/(LA.norm(v1)*LA.norm(v2))
    return [jjps, nps, vps]
Developer: gtesei, Project: fast-furious, Lines: 31, Source: gensin_1.py
Example 14: build_s
def build_s(data):
    """
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
            lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
            ...
        }
    :return: dict s with the following structure:
        {
            lexelt: [w1, w2, w3, ...],
            ...
        }
    """
    s = {}
    # implement your code here
    for lexelt in data:
        words = set()
        for instance in data[lexelt]:
            left_context = word_tokenize(instance[1].strip())
            for token in left_context[-window_size:]:
                if token not in puncts:
                    words.add(token)
            right_context = word_tokenize(instance[3].strip())
            for token in right_context[:window_size]:
                if token not in puncts:
                    words.add(token)
        s[lexelt] = list(words)
    return s
Developer: keyu-lai, Project: NLP, Lines: 35, Source: A.py
Example 15: parseFile
def parseFile(file):
    """ Parse the header and source files for the class, and return the bindings dictionary,
    which contains tag data (and other pertinent information about the file).
    """
    # print file
    bindings = []
    # Load header file
    tokens = []
    if file['header'] != '':
        with open(file['header'], 'r') as f:
            # Tokenize
            for line in f.readlines():
                tokens += nltk.word_tokenize(line)
        # Parse tokens
        bindings += parseTokens(tokens, file, 'header')
    # Load source file
    tokens = []
    if file['source'] != '':
        with open(file['source'], 'r') as f:
            # Tokenize
            for line in f.readlines():
                tokens += nltk.word_tokenize(line)
        # Parse tokens
        bindings += parseTokens(tokens, file, 'source')
    return bindings
Developer: jarrettchisholm, Project: pyliteserializer, Lines: 33, Source: pyliteserializer.py
Example 16: nltk_filter
def nltk_filter(sent):
    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = pos_tag(tokens)
    filtered_sent = ' '
    for token in tokens:
        filtered_sent += '1' + token + ' '
    # for pos_t in pos_tags:
    #     if pos_t[1] in filterList:
    #         # filtered_sent += stemmer.stem(pos_t[0]) + ' '
    #         filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '
    # note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = pos_tag(tokens)
    # filtered_sent = ' '
    # for pos_t in pos_tags:
    #     if pos_t[1] in filterList:
    #         # filtered_sent += stemmer.stem(pos_t[0]) + ' '
    #         filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '
    for token in tokens:
        filtered_sent += '2' + token + ' '
    return filtered_sent
Developer: gthandavam, Project: Recipes, Lines: 30, Source: builder.py
Example 17: read_liveqa
def read_liveqa(prefix='../data/qalab-liveqa/dataset/qrels/', train='LiveQA2015-ver2.qrels', tokenize=True):
    import nltk
    f = open_file(prefix + train)
    np.random.seed(0)
    data_split = {0: [], 1: [], 2: []}
    ref_split = {0: [], 1: [], 2: []}
    for i, line in enumerate(f):
        l = line.strip().split('\t')
        if l[2] == '':
            first = " ? ".join(l[3].strip().split("?"))
            second = " . ".join(first.strip().split("."))
            q = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ')
            split_id = np.random.choice([0, 0, 0, 1, 2])
            continue
        label = int(l[2]) >= 3
        first = " ? ".join(l[3].strip().split("?"))
        second = " . ".join(first.strip().split("."))
        a = " ".join(nltk.word_tokenize(second.strip())).lower().split(' ')
        data_split[split_id] += [(q, a, label, '', '')]
        ref_split[split_id] += [(l[0], '0', l[0]+'_'+l[1]+'_'+str(i), str(int(label)))]
    return data_split[0], data_split[1], data_split[2], (ref_split[0], ref_split[1], ref_split[2])
Developer: wolet, Project: 11797-project, Lines: 26, Source: prepare_data.py
Example 18: stanford_corenlp_filter
def stanford_corenlp_filter(sent):
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar', encoding=encoding)

    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    b1 = b1.lower()
    tokens = word_tokenize(b1)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '
    # note: 1 concat stemmer(word) == stemmer(1 concat word)

    b2 = b2.lower()
    tokens = word_tokenize(b2)
    pos_tags = posTagger.tag(tokens)
    filtered_sent = ' '
    for pos_t in pos_tags:
        if pos_t[1] in filterList:
            # filtered_sent += stemmer.stem(pos_t[0]) + ' '
            filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '
    return filtered_sent
Developer: gthandavam, Project: Recipes, Lines: 32, Source: builder.py
Example 19: extract_pos_pair
def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1 = ""
    extent1 = ""
    trigger2 = ""
    extent2 = ""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1 = one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2 = one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1 = one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2 = one_extent[0].text

    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1 = one_pair[1]
            break

    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2 = one_pair[1]
            break

    return (pos1, pos2)
Developer: wtl-zju, Project: KBP2015, Lines: 26, Source: coref_feature_extraction.py
Example 20: checkTypeWordCount
def checkTypeWordCount(answer, question):
    count = 0
    status = ''
    sum = 0
    status1 = 'false'
    # count words in the answer, skipping punctuation tokens
    punctuation_marks = set('.,\'":;?/\\|][}{()*&^%$#@!`~-_=+')
    for word1 in word_tokenize(answer):
        if word1 in punctuation_marks:
            print 'error'
        else:
            sum = sum + 1
            # print word1
    print sum
    words_ans = word_tokenize(answer)
    words_qus = word_tokenize(question)
    if words_ans[0] == "NOTICE" or words_ans[0] == "Notice":
        print "Correct"
        count = count + 0.25
    else:
        status = "Wrong"
    for word in words_qus:
        if en.is_number(word) and words_qus[words_qus.index(word)+1] == 'words':
            if sum >= int(word):
                print word
                count = count + 0.25
                status1 = 'true'
    if status1 == 'false':
        count = count + 0.25
    return count, status
Developer: amilamadhushanka, Project: englishbuddy, Lines: 32, Source: notice.py
Note: The nltk.word_tokenize examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the license of the corresponding project. Do not reproduce without permission.