This page collects typical usage examples of the Python function nltk.util.ngrams. If you have been wondering what ngrams does, how to call it, and what real-world code that uses it looks like, the curated examples below should help.
The following shows 20 code examples of the ngrams function, ordered by popularity.
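Before the examples, here is a minimal sketch of the function itself (input invented for illustration): nltk.util.ngrams takes any sequence plus an order n and lazily yields the consecutive n-tuples.

from nltk.util import ngrams

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat']
print(list(ngrams(tokens, 2)))
# [('the', 'cat'), ('cat', 'sat'), ('sat', 'on'), ('on', 'the'), ('the', 'mat')]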
Example 1: modified_precision

def modified_precision(candidate, references, n):
    # Collect the candidate's n-grams into a list so it can be reused.
    candidate_ngrams = list(ngrams(candidate, n))
    if len(candidate_ngrams) == 0:
        return 0
    c_words = set(candidate_ngrams)
    for word in c_words:
        # Add-one smoothed count of this n-gram in the candidate.
        count_w = candidate_ngrams.count(word) + 1
        # Highest add-one smoothed count across all references.
        count_max = 0
        for reference in references:
            reference_ngrams = list(ngrams(reference, n))
            count = reference_ngrams.count(word) + 1
            if count > count_max:
                count_max = count
    # Note: only the counts from the loop's final iteration reach this point;
    # compare with the clipped-count version in Example 18.
    return min(count_w, count_max) / (len(candidate) + len(c_words))

Author: ab93 | Project: Text-Summarization | Lines: 34 | Source: evalSummary.py
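A hedged usage sketch for the function above (sentences invented). The add-one counts and the len(candidate) + len(c_words) denominator make this variant differ from textbook BLEU precision, so treat the output as project-specific:

candidate = 'the cat sat on the mat'.split()
references = ['there is a cat on the mat'.split()]
print(modified_precision(candidate, references, 2))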
Example 2: getTrainData

def getTrainData(corpus, embedsize, ngramsize, m):
    f = open(corpus)
    datap = []
    for line in f:
        data = line.strip().split('\t')
        s1 = data[0]
        s2 = data[1]
        label = data[2]
        # n-gram sets of both sentences; keep every n-gram from either side.
        s1ng = set(ngrams(s1.split(' '), ngramsize))
        s2ng = set(ngrams(s2.split(' '), ngramsize))
        all_ngrams = s1ng.union(s2ng)
        for ng in all_ngrams:
            datap.append([ng, label])
    X = np.zeros((len(datap), ngramsize, embedsize))
    Y = np.zeros((len(datap), 3))
    wildcard = np.array([0.0] * embedsize)
    for i in range(0, len(datap)):
        item = datap[i]
        ngram = item[0]
        label = item[1]
        vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
        labels = getLabels(label)
        X[i] = vectors
        Y[i] = labels
    return X, Y

Author: ghpaetzold | Project: phd-backup | Lines: 28 | Source: Run_NN_MLP.py
Example 3: str_common_grams

def str_common_grams(str1, str2, length=3):
    '''Return how many times the n-grams (of the given length) of str1
    appear in str2.
    '''
    grams1 = list(ngrams(str1, length))
    grams2 = list(ngrams(str2, length))
    return sum(grams2.count(gram) for gram in grams1)

Author: KhaoticMind | Project: kaggle-homedepot | Lines: 7 | Source: homedepot.py
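Because a Python string is itself a sequence, passing it to ngrams yields character n-grams, which is what makes this overlap count work. For example (inputs invented):

print(str_common_grams('wooden table', 'wooden chair'))
# 5 -- the shared character trigrams are 'woo', 'ood', 'ode', 'den', 'en '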
Example 4: getTestData

def getTestData(corpus, embedsize, ngramsize, m):
    f = open(corpus)
    datap = []
    for line in f:
        data = line.strip().split('\t')
        s1 = data[0]
        s2 = data[1]
        label = data[2]  # unused at test time; kept from the data format
        # n-gram sets of both sentences; keep every n-gram from either side.
        s1ng = set(ngrams(s1.split(' '), ngramsize))
        s2ng = set(ngrams(s2.split(' '), ngramsize))
        all_ngrams = s1ng.union(s2ng)
        datap.append(list(all_ngrams))
    Xs = []
    wildcard = np.array([0.0] * embedsize)
    for ngs in datap:
        X = np.zeros((len(ngs), ngramsize, embedsize))
        for i in range(0, len(ngs)):
            ngram = ngs[i]
            vectors = getEmbedVectors(ngramsize, embedsize, ngram, m, wildcard)
            X[i] = vectors
        Xs.append(X)
    return Xs

Author: ghpaetzold | Project: phd-backup | Lines: 25 | Source: Run_NN_MLP.py
Example 5: extract_terms_features

def extract_terms_features(terms, separateGrams=False):
    # Drop empty tokens.
    while '' in terms:
        terms.remove('')
    # Joined bigrams and trigrams of the term sequence.
    g2j = [' '.join(gterms) for gterms in ngrams(terms, 2)]
    g3j = [' '.join(gterms) for gterms in ngrams(terms, 3)]
    vec1 = {}
    vec2 = {}
    vec3 = {}
    # Count unigrams, bigrams and trigrams separately. (The original tested
    # membership in the empty dict `vector` here, so every count stayed at 1;
    # the test must be against the dict actually being updated.)
    for t in terms:
        vec1[t] = vec1.get(t, 0) + 1
    for t in g2j:
        vec2[t] = vec2.get(t, 0) + 1
    for t in g3j:
        vec3[t] = vec3.get(t, 0) + 1
    # dict(a.items() + b.items()) is Python 2 only; merge explicitly instead.
    vector = dict(list(vec1.items()) + list(vec2.items()) + list(vec3.items()))
    if separateGrams:
        return (vector, vec1, vec2, vec3)
    else:
        return vector

Author: klyc0k | Project: EDSFilter | Lines: 60 | Source: twitter_methods.py
Example 6: format_text

def format_text(entries, LSTM_shape=True):
    THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__)))
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    decoded = base64.b64decode(entries)
    decoded = str(decoded)   # bytes -> "b'...'" string representation
    decoded = decoded[2:]    # strip the leading "b'"
    decoded = decoded[:-1]   # strip the trailing "'"
    decoded = decoded.split(".")
    for entry in decoded:
        token_sentences = tokenizer.tokenize(entry)
        for sentence in token_sentences:
            sentences.append(sentence)
    tokenized_sentences = []
    remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'  # string.punctuation
    stop_words = set(stopwords.words('english'))
    tweet_tknzr = TweetTokenizer()
    for sentence in sentences:
        tokens = tweet_tknzr.tokenize(sentence)
        tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens))
        tokenized_sentences.append(tokens)
    # Load the saved n-gram-to-index dicts. (Recent NumPy versions need
    # allow_pickle=True here.) Once the model gets updated with good data,
    # ngrams.py needs to be changed/updated too.
    all_ngrams1 = np.load(THIS_FOLDER + '/ngrams1.npy').item()
    all_ngrams2 = np.load(THIS_FOLDER + '/ngrams2.npy').item()
    all_ngrams3 = np.load(THIS_FOLDER + '/ngrams3.npy').item()
    # Binary bag-of-n-grams features: unigram, bigram and trigram blocks.
    X = np.zeros((len(sentences), len(all_ngrams1) + len(all_ngrams2) + len(all_ngrams3)))
    for i in range(len(tokenized_sentences)):
        sentence = tokenized_sentences[i]
        for gram in ngrams(sentence, 1):
            if gram in all_ngrams1:
                X[i][all_ngrams1[gram]] = 1
        for gram in ngrams(sentence, 2):
            if gram in all_ngrams2:
                X[i][len(all_ngrams1) + all_ngrams2[gram]] = 1
        for gram in ngrams(sentence, 3):
            if gram in all_ngrams3:
                X[i][len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram]] = 1
    if LSTM_shape:
        X = np.reshape(X, (X.shape[0], 1, X.shape[1]))
    else:
        X = np.reshape(X, (X.shape[0], X.shape[1]))
    return X

Author: mit-teaching-systems-lab | Project: threeflows | Lines: 60 | Source: calculate_emotion.py
Example 7: getNgramProbs

def getNgramProbs(file):
    f = open(file, 'r')
    unigramList = []
    for line in f.read().split():
        unigramList.append(line)
    bigramList = ngrams(unigramList, 2)
    trigramList = ngrams(unigramList, 3)
    # Dictionaries of unigram, bigram and trigram counts.
    unigramDict = dict()
    bigramDict = dict()
    trigramDict = dict()
    # Counts for unigrams
    countUni = 0
    for item in unigramList:
        countUni += 1
        if item not in unigramDict:
            unigramDict[item] = 1
        else:
            unigramDict[item] += 1
    # Counts for bigrams
    for item in bigramList:
        if item not in bigramDict:
            bigramDict[item] = 1
        else:
            bigramDict[item] += 1
    # Counts for trigrams
    for item in trigramList:
        if item not in trigramDict:
            trigramDict[item] = 1
        else:
            trigramDict[item] += 1
    # Turn counts into probabilities: P(w3 | w1 w2), P(w2 | w1), P(w).
    for key in trigramDict:
        trigramDict[key] /= float(bigramDict[(key[0], key[1])])
    for key in bigramDict:
        bigramDict[key] /= float(unigramDict[key[0]])
    for key in unigramDict:
        unigramDict[key] /= float(countUni)
    return [unigramDict, bigramDict, trigramDict]

Author: soumyasanyal | Project: NLPTermProject | Lines: 60 | Source: kldiv.py
Example 8: scoreScopeOverlap

def scoreScopeOverlap(self, scopeHyp, scopeRef):
    totalScore = 0
    # For every hypothesis scope, keep its best overlap with any reference scope.
    for scope_h in scopeHyp:
        bestScore = 0
        for scope_r in scopeRef:
            if scope_r == [] or scope_h == []:
                partialScore = 0
            else:
                ngram_range = range(1, len(scope_h) + 1)
                # Weight longer n-gram matches more heavily; the weights sum to ~1.
                total = sum(ngram_range)
                score_weights = [round(x / total, 4) for x in ngram_range]
                partialScore = 0.0
                for i in ngram_range:
                    hyp = ngrams(scope_h, i)
                    ref = ngrams(scope_r, i)
                    partialScore += len(set(hyp).intersection(set(ref))) * score_weights[i - 1]
            if partialScore > bestScore:
                bestScore = partialScore
        totalScore += bestScore
    return totalScore

Author: wilkeraziz | Project: chisel-features | Lines: 32 | Source: main.py
Example 9: create_candidate_list

def create_candidate_list(sentence):
    tokens = nltk.tokenize.word_tokenize(sentence)
    candidates_lists = create_candidates_lists(tokens)
    # Create list of 1-grams.
    candidates = []
    for l in candidates_lists:
        candidates += l
    # Remove irrelevant stop words in 1-grams.
    res = [token for token in candidates if token not in ENGLISH_STOPWORDS]
    # Create list of bigrams.
    bigrams = []
    for l in candidates_lists:
        bigrams += ngrams(l, 2)
    # Create list of trigrams.
    trigrams = []
    for l in candidates_lists:
        trigrams += ngrams(l, 3)
    # Create list of 4-grams.
    fourgrams = []
    for l in candidates_lists:
        fourgrams += ngrams(l, 4)
    res += [' '.join(a) for a in bigrams]
    res += [' '.join(a) for a in trigrams]
    res += [' '.join(a) for a in fourgrams]
    return res

Author: srom | Project: ensu | Lines: 34 | Source: select_aliases.py
Example 10: calc_ngram

def calc_ngram(htokens, etokens):
    features = []
    for n in range(1, 5):
        hgrams = nltk.FreqDist(ngrams(htokens, n))
        egrams = nltk.FreqDist(ngrams(etokens, n))
        # n-gram precision: fraction of hypothesis n-grams found in the reference.
        prec = 0
        num = 0
        for k in hgrams:
            if k in egrams:
                prec = prec + hgrams[k]
            num = num + hgrams[k]
        if num > 0:
            prec = float(prec) / num
        features.append(prec)
        # n-gram recall: fraction of reference n-grams found in the hypothesis.
        recall = 0
        num = 0
        for k in egrams:
            if k in hgrams:
                recall = recall + egrams[k]
            num = num + egrams[k]
        if num > 0:
            recall = float(recall) / num
        features.append(recall)
        features.append(calc_f1(prec, recall))
    return features

Author: da03 | Project: sp2016.11-731 | Lines: 25 | Source: generate_feature.py
Example 11: rouge_s

def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):
    rouge_s_list = []
    k_c = len(candidate) if d_skip is None else d_skip
    cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate), n=2, k=k_c))
    for ref in references:
        k_ref = len(ref) if d_skip is None else d_skip
        ref_skip_list = list(skipgrams(tokenizer.tokenize(ref), n=2, k=k_ref))
        count = 0
        for bigram in cand_skip_list:
            if bigram in ref_skip_list:
                count = count + 1
        if not smoothing:
            r_skip = count / len(ref_skip_list)
            p_skip = count / len(cand_skip_list)
        else:
            # Smooth by adding unigram overlap to the skip-bigram counts.
            cand_ungm = list(ngrams(tokenizer.tokenize(candidate), n=1))
            ref_ungm = list(ngrams(tokenizer.tokenize(ref), n=1))
            for ungm in cand_ungm:
                if ungm in ref_ungm:
                    count += 1
            r_skip = count / (len(ref_skip_list) + len(ref_ungm))
            p_skip = count / (len(cand_skip_list) + len(cand_ungm))
        score = Rouge.get_score(r_skip, p_skip, beta)
        rouge_s_list.append(score)
    return Rouge.jacknifing(rouge_s_list, averaging=averaging)

Author: 53X | Project: NLP-Metrics | Lines: 30 | Source: rouge.py
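This example leans on nltk.util.skipgrams, which generalises ngrams by allowing up to k tokens to be skipped inside each n-gram. A minimal sketch of its behaviour (input invented):

from nltk.util import skipgrams

print(list(skipgrams(['the', 'cat', 'sat'], n=2, k=1)))
# [('the', 'cat'), ('the', 'sat'), ('cat', 'sat')]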
Example 12: char_ngram_similarity

def char_ngram_similarity(doc1, doc2, n, top=100):
    """
    Gives a positive dissimilarity score of two documents with respect to the
    distributions of their most frequent character n-grams. A score of 0 means
    the documents are identical (or at least share an identical top character
    n-gram distribution).
    :param doc1: first document (string)
    :param doc2: second document (string)
    :param n: the n-gram length
    :param top: only use the `top` most frequent n-grams from each document
    :return: a positive dissimilarity score; 0 means identical profiles
    """
    ngrams1 = Counter(ngrams(doc1, n))
    ngrams2 = Counter(ngrams(doc2, n))
    profile1 = [g[0] for g in ngrams1.most_common(top)]
    profile2 = [g[0] for g in ngrams2.most_common(top)]
    # Normalise the two n-gram distributions.
    total1 = np.sum(list(ngrams1.values()))
    for key in ngrams1:
        ngrams1[key] /= total1
    total2 = np.sum(list(ngrams2.values()))
    for key in ngrams2:
        ngrams2[key] /= total2
    # Calculate the global dissimilarity score.
    score = 0
    for g in set(profile1 + profile2):
        f1 = ngrams1[g]
        f2 = ngrams2[g]
        score += ((2 * (f1 - f2)) / (f1 + f2)) ** 2
    return score

Author: rug-compling | Project: glad | Lines: 34 | Source: glad-main.py
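A hedged usage sketch (documents invented; the absolute score only carries meaning when compared across pairs):

d1 = 'the quick brown fox jumps over the lazy dog'
d2 = 'the quick brown fox leaps over the sleepy cat'
print(char_ngram_similarity(d1, d2, n=3))  # positive: the profiles differ
print(char_ngram_similarity(d1, d1, n=3))  # 0.0: identical profiles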
Example 13: jaccardIdx

def jaccardIdx(w1, w2):
    # Jaccard distance over the sets of character bigrams of the two words.
    w1ngrams = set(ngrams(w1, 2))
    w2ngrams = set(ngrams(w2, 2))
    union = w1ngrams.union(w2ngrams)
    intersect = w1ngrams.intersection(w2ngrams)
    return 1.0 - float(len(intersect)) / float(len(union))

Author: weezel | Project: ITIS13 | Lines: 8 | Source: russiannames.py
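For instance (words invented), the character-bigram sets of 'night' {ni, ig, gh, ht} and 'nacht' {na, ac, ch, ht} share one of seven distinct bigrams, so the distance is 1 - 1/7:

print(jaccardIdx('night', 'nacht'))  # 0.8571428571428572
print(jaccardIdx('night', 'night'))  # 0.0 for identical words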
Example 14: count_word

def count_word(self, doc, unigram=True, bigram=False, binary=False):
    # `str` in the original shadowed the builtin; renamed to `tokens`.
    tokens = word_tokenize(self.remove_non_ascii(doc))
    doc_voc = {}
    if unigram:
        uni = ngrams(tokens, 1)
        self.count_word_sub(doc_voc, uni, binary)
    if bigram:
        bi = ngrams(tokens, 2)
        self.count_word_sub(doc_voc, bi, binary)

Author: akshaynavada | Project: NLP | Lines: 10 | Source: NB.py
Example 15: trainModel

def trainModel(self, listOfFilenames):
    # Dictionaries of unigram, bigram and trigram counts.
    unigramDict = dict()
    bigramDict = dict()
    trigramDict = dict()
    # Total counts of unigrams, bigrams, trigrams.
    countUni = 0
    countBi = 0
    countTri = 0
    i = 1
    # Iterate over the list of files.
    for fileName in listOfFilenames:
        print("Reading", i)
        i += 1
        stag = STagger(fileName)
        stag.find_unigrams(True, False)
        for item in stag.unigrams:
            countUni += 1
            if item not in unigramDict:
                unigramDict[item] = 1
            else:
                unigramDict[item] += 1
        codeBigrams = ngrams(stag.unigrams, 2)
        codeTrigrams = ngrams(stag.unigrams, 3)
        for item in codeBigrams:
            countBi += 1
            if item not in bigramDict:
                bigramDict[item] = 1
            else:
                bigramDict[item] += 1
        for item in codeTrigrams:
            countTri += 1
            if item not in trigramDict:
                trigramDict[item] = 1
            else:
                trigramDict[item] += 1
    # Write the n-gram counts to a file.
    outputFile = open('corpus.txt', 'w')
    outputFile.write(str(countUni) + "\n")
    for key, x in unigramDict.items():
        outputFile.write(str(key) + " " + str(x) + "\n")
    outputFile.write(str(countBi) + "\n")
    for key, x in bigramDict.items():
        outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(x) + "\n")
    outputFile.write(str(countTri) + "\n")
    for key, x in trigramDict.items():
        outputFile.write(str(key[0]) + " " + str(key[1]) + " " + str(key[2]) + " " + str(x) + "\n")
    outputFile.close()

Author: soumyasanyal | Project: NLPTermProject | Lines: 55 | Source: extract2.py
Example 16: ngram_similarity

def ngram_similarity(str1, str2, n=3):
    str1 = str1.split()
    str2 = str2.split()
    ngram1 = []
    ngram2 = []
    # Collect all n-grams of order n down to 1 for both strings.
    for i in range(n):
        ngram1 = ngram1 + list(ngrams(str1, n - i))
    for i in range(n):
        ngram2 = ngram2 + list(ngrams(str2, n - i))
    return jaccard_dis(set(ngram1), set(ngram2))

Author: kunrenzhilu | Project: dmproject_updated | Lines: 11 | Source: random_forestscript_v2.py
Example 17: get_ngrams

def get_ngrams(self, tokens):
    tokens.insert(0, '<START>')
    unigrams = ngrams(tokens, 1)
    # The key for unigrams is ('word',), not just the 'word' string.
    for item in unigrams:
        self.fdist1[item] += 1
    bigrams = ngrams(tokens, 2)
    for item in bigrams:
        self.fdist2[item] += 1
    trigrams = ngrams(tokens, 3)
    for item in trigrams:
        self.fdist3[item] += 1

Author: ylmeng | Project: generate_captions | Lines: 11 | Source: getNgrams.py
Example 18: modified_precision

def modified_precision(candidate, references, n):
    """Calculate modified ngram precision.

    >>> BLEU.modified_precision(
    ...     'the the the the the the the'.split(),
    ...     ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...     n=1,
    ... )
    0.28...

    >>> BLEU.modified_precision(
    ...     'the the the the the the the'.split(),
    ...     ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()],
    ...     n=2,
    ... )
    0.0

    >>> BLEU.modified_precision(
    ...     'of the'.split(),
    ...     [
    ...         'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...         'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...         'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...     ],
    ...     n=1,
    ... )
    1.0

    >>> BLEU.modified_precision(
    ...     'of the'.split(),
    ...     [
    ...         'It is a guide to action that ensures that the military will forever heed Party commands.'.split(),
    ...         'It is the guiding principle which guarantees the military forces always being under the command of the Party.'.split(),
    ...         'It is the practical guide for the army always to heed the directions of the party'.split(),
    ...     ],
    ...     n=2,
    ... )
    1.0
    """
    counts = Counter(ngrams(candidate, n))
    if not counts:
        return 0
    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])
    clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())
    return sum(clipped_counts.values()) / sum(counts.values())

Author: 52nlp | Project: Text-Summarization | Lines: 54 | Source: bleu.py
Example 19: uni_bi_gram

def uni_bi_gram(self, doc, unigram, bigram):
    ret_list = []
    if unigram:
        for gram in ngrams(doc, 1):
            ret_list.append(gram)
    if bigram:
        for gram in ngrams(doc, 2):
            ret_list.append(gram)
    return ret_list

Author: akshaynavada | Project: NLP | Lines: 11 | Source: NB.py
Example 20: create_model

def create_model(tokenized_data):
    tokens_list = [tokens for ndata in tokenized_data for tokens in ndata]
    cfreq_data_bigram = nltk.ConditionalFreqDist(nltk.bigrams(tokens_list))
    # ngrams() returns lazy generators; see the note below. (The original
    # assigned the orders 3-6 to throwaway variables n, z, m, n.)
    trigrams = ngrams(tokens_list, 3)
    fourgram = ngrams(tokens_list, 4)
    fivegram = ngrams(tokens_list, 5)
    sixgram = ngrams(tokens_list, 6)
    return cfreq_data_bigram, trigrams, fourgram, fivegram, sixgram

Author: nusebac | Project: Akosha | Lines: 12 | Source: Main.py
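One caveat about the function above: ngrams returns single-use generators, so each returned object can only be iterated once. Callers that need the n-grams more than once should materialise them first, e.g.:

from nltk.util import ngrams

tri = ngrams(['a', 'b', 'c', 'd'], 3)
print(list(tri))  # [('a', 'b', 'c'), ('b', 'c', 'd')]
print(list(tri))  # [] -- the generator is already exhausted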
Note: the nltk.util.ngrams examples on this page were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.