This article collects typical usage examples of the Python function nltk.tokenize.wordpunct_tokenize. If you are unsure how to call wordpunct_tokenize or what it does in practice, the curated code samples below may help.
The following 20 code examples of wordpunct_tokenize are shown, sorted by popularity by default. Voting for the examples you find useful helps the system surface better Python code samples.
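Before the collected examples, here is a minimal, self-contained sketch of what wordpunct_tokenize itself does: it splits text on the regular expression \w+|[^\w\s]+, so runs of word characters and runs of punctuation become separate tokens. The sample sentence is illustrative only.

from nltk.tokenize import wordpunct_tokenize

text = "Good muffins cost $3.88 in New York. Please buy me two, thanks."
print(wordpunct_tokenize(text))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
#  'Please', 'buy', 'me', 'two', ',', 'thanks', '.']

Note how "$3.88" is broken into '$', '3', '.', '88' — unlike word_tokenize, this tokenizer never keeps punctuation attached to numbers or contractions, which is why several of the examples below use it to strip or count punctuation tokens.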
Example 1: formatting_features
def formatting_features(obj):
    question = obj['question_text'].strip()
    topics = [t['name'] for t in obj['topics']]
    tokens = [w for w in wordpunct_tokenize(question) if not re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]', w)]
    punct = [p for p in wordpunct_tokenize(question) if re.match(r'[\'\"\.\?\!\,\/\\\(\)\`]', p)]
    top_toks = set([w.lower() for t in obj['topics']
                    for w in wordpunct_tokenize(t['name'])])
    qn_toks = set(tokens)
    #qn_topic_words = len(top_toks & qn_toks)
    qn_mark = 1 if "?" in question else -1
    start_cap = 1 if re.match(r'^[A-Z]', question) else -1
    if tokens:
        qn_type = [sum(1.0 for w in tokens if w in qws)
                   for qws in qn_type_words]
        nm_pres = sum(1.0 for w in tokens if w.lower() in names
                      and re.match(r'^[A-Z]', w))
        pl_pres = sum(1.0 for w in tokens if w.lower() in places
                      and re.match(r'^[A-Z]', w))
    else:
        qn_type = [0.0] * len(qn_type_words)
        nm_pres = -1.0
        pl_pres = -1.0
    # qn_somewhere = 1 if sum(qn_type) and (re.match(r'\?$',question)
    #                                       or re.match(r'\?\s*[A-Z]',question)) else -1
    total_words = len(tokens)
    dict_words = sum(1 for w in tokens if w.lower() in eng_words)
    correct_form_count = sum(1.0 for w in tokens
                             if (w.lower() in eng_words and not re.match(r'^[A-Z]+$', w))
                             or re.match(r'^[A-Z]', w))
    question_form = 1 if '?' in punct and sum(1 for w in tokens if w in qn_words) else -1
    correct_form_ratio = correct_form_count / float(total_words + 1)
    #topic_word_ratio = qn_topic_words/float(total_words+1)
    name_ratio = (nm_pres + pl_pres) / float(total_words + 1)
    punctuation_ratio = len(punct) / float(total_words + 1)
    result = [
        # 1 if nm_pres else 0,
        nm_pres,
        # 1 if pl_pres else 0,
        pl_pres,
        qn_mark,
        start_cap,
        # qn_somewhere,
        correct_form_ratio,
        #len(punct),
        punctuation_ratio,
        math.log(len(topics) + 1),
        #len(topics),
        name_ratio,
        # topic_word_ratio,
        dict_words,
        # qn_topic_words,
        # correct_form_count,
        # math.log(total_words+1),
        total_words,
    ] + qn_type
    return result
Author: shawntan | Project: quora-codesprint-2013 | Lines: 60 | Source: qn1.py
Example 2: getResult
def getResult(textFile, ind1, ind2, outFile, outFile2):
    fout = open(outFile, "w")
    fout2 = open(outFile2, "w")
    #probs = []
    for line in open(textFile):
        hyp1 = wordpunct_tokenize(line.strip().split("|||")[ind1].strip().decode("utf-8"))
        hyp2 = wordpunct_tokenize(line.strip().split("|||")[ind2].strip().decode("utf-8"))
        f = open("temp.txt", "w")
        f.write("%s\n" % " ".join([x.encode("utf-8") for x in hyp1]))
        f.close()
        os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
        prob1 = getProb("temp_out.txt")
        f = open("temp.txt", "w")
        f.write("%s\n" % " ".join([x.encode("utf-8") for x in hyp2]))
        f.close()
        os.system("~/Course/AMMML/project/FeatureAugmentedRNNToolkit/rnnlm -rnnlm ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/model -test temp.txt -features-matrix ~/Course/AMMML/project/FeatureAugmentedRNNToolkit/feature.txt -independent > temp_out.txt")
        prob2 = getProb("temp_out.txt")
        #probs.append([prob1,prob2])
        fout.write("%f\t%f\n" % (prob1, prob2))
        fout2.write("%f\t%f\n" % (prob1 / float(len(hyp1)), prob2 / float(len(hyp2))))
    fout.close()
    fout2.close()
Author: sshiang | Project: sp2016.11-731 | Lines: 27 | Source: rnnlm.py
Example 3: text_to_sentences
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    print "text_to_sentence"
    #from nltk.tokenize import wordpunct_tokenize
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    #
    text = text.decode("utf8")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    #raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print "finish tokenize sentence", len(raw_sentences)
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        #print "sentence:", raw_sentence
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            #sentences.append( text_to_wordlist( raw_sentence, \
            #    remove_stopwords ))
            #print removePunctuation(raw_sentence).lower().split()
            print raw_sentence
            sentences.append(wordpunct_tokenize(raw_sentence))  #raw_sentence.split()
            print wordpunct_tokenize(raw_sentence)
            #print text_to_wordlist( raw_sentence, remove_stopwords )
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
Author: billy322 | Project: BioNLP-2016 | Lines: 32 | Source: utilities.py
Example 4: getFormattingFeatures
def getFormattingFeatures(obj):
    question = obj["question_text"].strip()
    topics = [t["name"] for t in obj["topics"]]
    tokens = [w for w in wordpunct_tokenize(question) if not re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", w)]
    punct = [p for p in wordpunct_tokenize(question) if re.match(r"[\'\"\.\?\!\,\/\\\(\)\`]", p)]
    top_toks = set([w.lower() for t in obj["topics"] for w in wordpunct_tokenize(t["name"])])
    qn_toks = set(tokens)
    qn_topic_words = len(top_toks & qn_toks)
    start_cap = 1 if re.match(r"^[A-Z]", question) else 0
    if len(tokens) > 0:
        qn_type = [1 if sum(1.0 for w in tokens if w in qws) else 0 for qws in qn_type_words]
    else:
        # penalize having no token words
        qn_type = [-1.0] * len(qn_type_words)
    total_words = len(tokens)
    correct_form_count = sum(1.0 for w in tokens if (not re.match(r"^[A-Z]+$", w)) or re.match(r"^[A-Z]", w))
    topic_word_ratio1 = max(0, qn_topic_words - 2) / float(total_words + 1)
    topic_word_ratio2 = max(0, 2 - qn_topic_words) / float(total_words + 1)
    topic_word_ratio = qn_topic_words / float(total_words + 1)
    punctuation_ratio = len(punct) / float(total_words + 1)
    word_overshoot = max(0, total_words - 10.1)
    word_undershoot = max(0, 10.1 - total_words)
    result = [
        start_cap,
        punctuation_ratio,
        math.log(len(topics) + 1),
        topic_word_ratio1,
        topic_word_ratio2,
        topic_word_ratio,
        word_overshoot,
        word_undershoot,
    ] + qn_type
    return result
Author: ChandanBP | Project: quora | Lines: 33 | Source: interest_solution.py
Example 5: check_len_stats
def check_len_stats(std_dev):
    fraction = 0
    for i in range(1, 5):
        fraction += 0.25
        count1 = 0
        count2 = 0
        mcount = 0
        ncount = 0
        threshold = fraction * std_dev
        print threshold
        with open(infile, 'r') as f:
            for line in f:
                mem_len = 0
                nonmem_len = 0
                if (line.strip().split('\t')[1] == 'M'):
                    mem_len += len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    mcount += 1
                    if (float(mem_len) < threshold):
                        count1 += 1
                else:
                    nonmem_len += len(wordpunct_tokenize(line.strip().split('\t')[0]))
                    ncount += 1
                    if (float(nonmem_len) < threshold):
                        count2 += 1
        f.close()
        print "iteration-", i
        print "memorable quotes below threshold-", count1
        print "total memorable quotes-", mcount
        print "non-memorable quotes below threshold-", count2
        print "non memorable quotes-", ncount
Author: anushabala | Project: memorability | Lines: 30 | Source: get_average_length.py
Example 6: jaccard_sentence
def jaccard_sentence(sentence1, sentence2):
    """
    Determines jaccard value of two sentences
    :param sentence1:
    :param sentence2:
    :return: jaccard value
    """
    return jaccard(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
Author: bphenriques | Project: NLPMiniProjects | Lines: 9 | Source: SimilarityUtil.py
Example 7: dice_sentence
def dice_sentence(sentence1, sentence2):
    """
    Determines the Dice value of two sentences
    :param sentence1:
    :param sentence2:
    :return: dice value
    """
    return dice(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2))
Author: bphenriques | Project: NLPMiniProjects | Lines: 9 | Source: SimilarityUtil.py
Example 8: common_words
def common_words(sent1, sent2):
    # remove stop words, stem and return a normalised count of common words
    porter = PorterStemmer()
    #stop = stopwords.words('english')
    s1_words = [porter.stem(i.lower()) for i in wordpunct_tokenize(sent1)]
    s2_words = [porter.stem(i.lower()) for i in wordpunct_tokenize(sent2)]
    s1 = set(s1_words)
    s2 = set(s2_words)
    return len(s1.intersection(s2)) / ((len(s1) + 0.1 + len(s2)) / 2.0)  # normalised
Author: lavanyats | Project: iMATCH | Lines: 9 | Source: wordnet_utils.py
Example 9: load_memes
def load_memes(self, filenames):
    for filename in filenames:
        f = open(filename, 'r')
        contents = f.readlines()
        for entry in contents:
            fields = [s.strip() for s in entry.split("|")]
            meme_type = fields[0]
            top_text = wordpunct_tokenize(fields[1].lower())
            bottom_text = wordpunct_tokenize(fields[2].lower())
            self.memes[meme_type].append((top_text, bottom_text))
Author: AlexeyMK | Project: DATASS | Lines: 11 | Source: SentimentAnalysis.py
Example 10: generate_vocabulary
def generate_vocabulary(self, review_summary_file):
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values
    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)
    # Now store the "" empty string as the last word of the vocabulary
    self.map[""] = len(self.map)
    self.revmap[len(self.map)] = ""
Author: githubgzc | Project: deep-summarization | Lines: 12 | Source: data2tensor.py
Example 11: features_from_dump
def features_from_dump(infile, variant, embeddings, bowfilter):
    frame = read_dump(infile)
    refstatements = [wordpunct_tokenize(st) for st in list(frame.Ref)]
    targetstatements = [wordpunct_tokenize(st) for st in list(frame.Target)]
    featuredicts = []
    for i in range(len(refstatements)):
        sp = StatementPair(i, refstatements[i], targetstatements[i], 0)
        commonwords, onlyref, onlytarget = sp._word_venn_diagram()
        trainingbow.update(onlyref)
        featuredicts.append(sp.featurize(variant, embeddings, bowfilter))
    return featuredicts
Author: hectormartinez | Project: verdisandbox | Lines: 13 | Source: classify_dga_dump.py
Example 12: med_sentence
def med_sentence(sentence1, sentence2, c1=1, c2=1, c3=1):
    """
    Determines minimum edit distance of two sentences.
    :param sentence1: first sentence
    :param sentence2: second sentence
    :param c1: optional weight
    :param c2: optional weight
    :param c3: optional weight
    :return: integer, minimum edit distance
    """
    return med(wordpunct_tokenize(sentence1), wordpunct_tokenize(sentence2), c1, c2, c3)
Author: bphenriques | Project: NLPMiniProjects | Lines: 13 | Source: SimilarityUtil.py
Example 13: main
def main():
    # related_words = {
    #     'art':['art', 'arts', , 'op art', 'pop art', 'art deco', 'art form', 'art house', 'art-house', 'clip art', 'fine art', 'art gallery', 'art nouveau', 'art therapy', 'kinetic art', 'martial art', 'art director', 'conceptual art', "objet d'art", 'performance art', 'work of art', 'state-of-the-art', 'the black art', 'thou art', 'noble art', 'craft', 'craftsmanship', 'ingenuity', 'mastery', 'artistry', 'imagination', 'Biedermeier', 'Parian', 'Queen Anne', 'annulate', 'anomphalous', 'banded', 'chryselephantine', 'aperture', 'collared', 'artificial', 'condensed', 'camera', 'copied'],
    #     'sport':['athletcis', 'recreation', 'candidacy', 'championship', 'clash', 'contention', 'event', 'fight', 'game', 'match', 'race', 'rivalry', 'run', 'sport', 'sports', 'struggle', 'tournament', 'trial', 'basketball', 'football', 'soccer', 'badminton', 'archery', 'tennis', 'swim']
    # }
    result = dict()
    clubs = list(Club.objects.all())
    print len(clubs)
    for club in clubs:
        score = 0
        # try:
        if club.introduction:
            intro = club.introduction
        else:
            intro = ""
        name = club.name
        max_score = 0
        max_cat = None
        for category in CATEGORIES:
            all_words = wordpunct_tokenize(intro.lower())
            all_name_words = wordpunct_tokenize(name.lower())
            score = 0
            for word in determinstic_words[category]:
                score += all_words.count(word) * 2
                score += all_name_words.count(word) * 10
            if score > max_score:
                max_cat = category
                max_score = score
        if max_cat and max_score > 2:
            category = Category.objects.get(name=max_cat)
            club.categories.add(category)
            club.save()
            try:
                # print name, max_cat, max_score
                result[max_cat].append(name)
            except KeyError:
                result[max_cat] = [name]
    for category in CATEGORIES:
        print category
        try:
            for club in result[category]:
                print club
        except:
            pass
        print "\n"
Author: hpec | Project: rateyourclub | Lines: 51 | Source: categorize.py
Example 14: hypernym_count
def hypernym_count(sent1, sent2):
    s1_words = [i.lower() for i in wordpunct_tokenize(sent1)]
    s2_words = [i.lower() for i in wordpunct_tokenize(sent2)]
    s1_all = []
    s2_all = []
    for w in s1_words:
        s1_all.extend(get_hypernyms(w))
    for w in s2_words:
        s2_all.extend(get_hypernyms(w))
    w1_hypernym = len(set(s1_words).intersection(set(s2_all)))
    w2_hypernym = len(set(s2_words).intersection(set(s1_all)))
    return w1_hypernym - w2_hypernym
Author: lavanyats | Project: iMATCH | Lines: 14 | Source: wordnet_utils.py
Example 15: best_dressed
def best_dressed(year):
    if year not in yearMap.keys():
        prep_year(year)
    strings = yearMap[year]['strings']
    dressPattern = re.compile(r'(dress)|(red carpet)|(redcarpet)', re.IGNORECASE)
    posPattern = re.compile(r'(best)|(beautiful)|(stun)|(love)', re.IGNORECASE)
    negPattern = re.compile(r'(worst)|(bad)|(ugly)|(hate)', re.IGNORECASE)
    namePattern = re.compile(r'[A-Z]\w* [A-Z]\w*')
    stoplist = ['new', 'red', 'carpet', 'redcarpet', 'globes', 'golden', 'best', 'worst', 'movie', 'motion', 'picture', 'film', 'drama', 'comedy', 'musical', 'cecil', 'demille', 'award', 'tv', 'performance', 'actress', 'actor', 'television', 'feature', 'foreign', 'language', 'supporting', 'role', 'director', 'original', 'series']
    dress_mentions = Counter()
    dress_mentions_neg = Counter()
    dress_mentions_pos = Counter()
    for tweet in strings:
        if re.search(dressPattern, tweet):
            matches = re.findall(namePattern, tweet)
            matches = (w.lower() for w in matches)
            for match in matches:
                match_words = wordpunct_tokenize(match)
                if match_words[0] not in stoplist and match_words[1] not in stoplist:
                    dress_mentions[match] += 1
                    if re.search(posPattern, tweet):
                        dress_mentions_pos[match] += 1
                    if re.search(negPattern, tweet):
                        dress_mentions_neg[match] += 1
    discussed_dress = dress_mentions.most_common(1)
    best_dress = dress_mentions_pos.most_common(1)
    worst_dress = dress_mentions_neg.most_common(1)
    return best_dress[0][0], worst_dress[0][0], discussed_dress[0][0]
Author: irabkina | Project: gg-project-master-2016 | Lines: 34 | Source: gg_api.py
Example 16: pick_top
def pick_top(number, sortedLst, ratio):
    unigrams = []
    bigramsplus = []
    for element in sortedLst:
        tokens = wordpunct_tokenize(element[0])
        if len(tokens) == 1:
            unigrams.append(element)
        else:
            bigramsplus.append(element)
    # will be a list of the top *number* strings
    topList = []
    unigramIndex = 0
    bigramIndex = 0
    while len(topList) < number:
        if unigramIndex == len(unigrams):
            if bigramIndex == len(bigramsplus):
                break
            else:
                topList.append(bigramsplus[bigramIndex][0])
                bigramIndex += 1
        elif bigramIndex == len(bigramsplus):
            topList.append(unigrams[unigramIndex][0])
            unigramIndex += 1
        else:
            if unigrams[unigramIndex][1] * ratio < bigramsplus[bigramIndex][1]:
                topList.append(bigramsplus[bigramIndex][0])
                bigramIndex += 1
            else:
                topList.append(unigrams[unigramIndex][0])
                unigramIndex += 1
    return topList
Author: irabkina | Project: gg-project-master-2016 | Lines: 33 | Source: gg_api.py
Example 17: tokenStem
def tokenStem(words):
    words = words.strip('[').strip(']').lower()  # remove brackets and lowercase
    words = re.sub('[(){}<>:,.!?\'"]', '', words)
    stemmer = PorterStemmer()
    stops = stopwords.words('english')
    output = [stemmer.stem(token) for token in wordpunct_tokenize(words) if token not in stops]  # stem words
    return " ".join(output)  # merge into strings
Author: dingchaoz | Project: machine_learning | Lines: 7 | Source: tokenStemRemove.py
Example 18: tokenize
def tokenize(directory, exclude_files):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        if exclude_files and (_file in exclude_files):
            continue
        with open(directory + _file, 'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):
                #     print sentence
                #     time.sleep(2)
                #     disp_count -= 1
                #     if not disp_count:
                #         print '*'*100
                #         break
                # else:
                #     print '#'
    return wordpunct_tokenize(full_content.lower())
Author: hashbangCoder | Project: Word2Vec | Lines: 27 | Source: tokenized_text.py
Example 19: preprocess
def preprocess(line, is_lmz=False):
    line = wordpunct_tokenize(line.strip())
    if is_lmz:
        lemmatizer = WordNetLemmatizer()
        line = [lemmatizer.lemmatize(word) for word in line]
    return line
Author: lngvietthang | Project: imageqa | Lines: 7 | Source: quest2num.py
Example 20: split_sentence_from_document
def split_sentence_from_document(document):
    max_counts = 0
    for sent in tokenize.sent_tokenize(document):
        max_counts = max(max_counts, len(tokenize.wordpunct_tokenize(sent)))
    # if max_counts > 4000:
    #     print(document)
    return max_counts
Author: xiabofei | Project: python_details | Lines: 7 | Source: probe_sentence.py
Note: The nltk.tokenize.wordpunct_tokenize examples above were compiled by 纯净天空 from source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; for redistribution and use, consult the license of the corresponding project. Do not republish without permission.