This article collects typical usage examples of the ngram.NGram class in Python. If you have been wondering what the Python NGram class does and how to use it, the curated class examples below should help.
The article presents 20 code examples of the NGram class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code examples.
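Before the examples, here is a minimal sketch of the core python-ngram API that most of the snippets below rely on (constructing an index, fuzzy search, pairwise comparison, and raw n-gram extraction); the sample strings are invented for illustration:

from ngram import NGram

# Build an index over an iterable of items (here: plain strings).
index = NGram(["joe", "joseph", "jon", "kim"], N=3)

# Fuzzy search returns (item, similarity) pairs, best match first.
print(index.search("jo"))

# add() inserts a new item into the index.
index.add("job")

# compare() scores two strings directly, without building an index.
print(NGram.compare("sdfeff", "sdfeff"))  # 1.0

# ngrams() yields the raw n-grams of a string (no padding).
print(list(NGram(N=3).ngrams("banana")))  # ['ban', 'ana', 'nan', 'ana']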
Example 1: build_multiclusters
def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {}
        icluster[iline] = -1
        iModel = NGram(iString)
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum([y for x, y in results]) / len(results) \
                if len(results) > 0 else 0.0
            print(score)
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
Developer: mayhewsw, Project: HMMClustering, Lines: 35, Source: clusteralgorithm.py
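A quick usage sketch for the function above; the sample lines are invented but follow the " :::: "-delimited format the function splits on:

from ngram import NGram  # required by build_multiclusters above

lines = [
    "the cat sat :::: DT NN VBD :::: doc1 :::: extra",
    "the cat sat down :::: DT NN VBD RP :::: doc2 :::: extra",
    "unrelated content here :::: JJ NN RB :::: doc3 :::: extra",
]
for cluster in build_multiclusters(lines, threshold=0.05, N=4):
    print(list(cluster))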
Example 2: filterByOp
def filterByOp(self, clone):
    opStr1 = ""
    opStr2 = ""
    indx1, start1, end1 = clone[1]
    indx2, start2, end2 = clone[2]
    for i in range(start1, end1 + 1):
        opStr1 += str(self.op1_hash.get(i, -1))
    for i in range(start2, end2 + 1):
        opStr2 += str(self.op2_hash.get(i, -1))
    if config.DEBUG is True:
        print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
        print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))
    # if ((self.hasChanged(opStr1) is False) or
    #         (self.hasChanged(opStr2) is False)):
    if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
        return None
    idx = NGram(N=config.NGRAM)
    ngram1 = list(idx.ngrams(opStr1))
    ngram2 = list(idx.ngrams(opStr2))
    metric = self.compareList(ngram1, ngram2)
    return metric
Developer: Shraddha512, Project: FSE-2012-REPERTOIRE, Lines: 26, Source: operation_filter.py
Example 3: map
def map(self, phrase):
    for term in phrase:
        if len(term) > 4:
            continue
        for word in self.corpus:
            z = set(term) & set(word)
            matches = []
            if len(z) > 0 and len(z) < len(term):
                # term and word share some, but not all, characters
                g = NGram(z - set(term))
                # matches = g.search(term)
            else:
                # At this point we assume context is not informative.
                # In the event of context not being informative, we resort
                # to fuzzy lookup.
                g = NGram(word)
                # matches = g.search(term)
                g.remove(term)
            matches = g.search(term)
            key = None
            value = None
            if len(matches) > 0:
                matches = list(matches[0])
                Pz_ = len(matches) / self.size
                Px_ = fuzz.ratio(term, matches[0]) / 100
                if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
                    key = term
                    value = [matches[0], Pz_, Px_, 1]
            self.emit(key, value)
Developer: weiyixia, Project: CSV-file-repair, Lines: 33, Source: context.py
Example 4: select_translation
def select_translation(sentence, idx, word, translations):
    # make sure the subject pronoun is in subject form
    # heuristic: if it's the first word or the previous word is punctuation
    # or a conjunction, it's considered a subject
    if word[1] == 'r' and word[0] in subject_pronoun:
        if idx == 0 or sentence[idx-1][1] in ['x', 'c']:
            return (subject_pronoun[word[0]], 'pron')
    # handle special case: <digits>/m 日/m
    if word[1] == 'm':
        if DIGITS_PATTERN.match(word[0]):
            if idx + 1 < len(sentence) and sentence[idx+1][0] == u'日':
                # return a proper date string
                return (translate_date(int(word[0])), 'n')
            else:
                # return the digits directly
                return (word[0], 'n')
        elif word[0] == u'日':
            # symmetric case
            if idx > 0 and DIGITS_PATTERN.match(sentence[idx-1][0]):
                return ('', '')
    # construct a list of translations with the same POS as word
    same_pos_translations = list(filter(lambda t: match_pos(word[1], t[1]), translations))
    ng = NGram()
    if len(same_pos_translations) > 0:
        max_unigram_trans = max(same_pos_translations, key=lambda t: ng.get(t[0]))
        return max_unigram_trans
    return translations[0]
Developer: scottcheng, Project: mt, Lines: 32, Source: translate.py
Example 5: main
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Developer: gpoulter, Project: python-ngram, Lines: 25, Source: csvjoin.py
Example 6: simtitle
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare( article.title, nearest.title ) < 0.7:
                results.append( article )
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
            n.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
Developer: mrmonkington, Project: channelfunnel, Lines: 25, Source: views.py
Example 7: test
def test():
    filter = opFilter()
    opStr1 = "nnn+"
    opStr2 = "nn+"
    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))
    print(filter.compareList(l1, l2))
Developer: Shraddha512, Project: FSE-2012-REPERTOIRE, Lines: 11, Source: operation_filter.py
Example 8: ngram_similarity
def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub(' +', ' ', str(row))  # collapse runs of spaces
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
Developer: MysteriousMagics, Project: NLPCareerTrajectory, Lines: 13, Source: univ_lookup.py
Example 9: main
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join
    >>> _ = open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')
    >>> _ = open('right.csv', 'w').write('''ID,NAME
    ... ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ...      right_path='right.csv', right_column=1, outfile='out.csv',
    ...      titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print(open('out.csv').read())  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Developer: Rafiot, Project: python-ngram, Lines: 49, Source: csvjoin.py
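To exercise the doctest above (now updated for Python 3), here is a minimal runner sketch, assuming the function is saved in a module named csvjoin (a hypothetical name matching the source file):

import doctest
import csvjoin  # hypothetical module name, matching the source file above

doctest.testmod(csvjoin, verbose=True)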
Example 10: get_distr
def get_distr(strlist, n_len):
    alphabet = ['A', 'C', 'G', 'T', 'N']
    n = NGram(N=n_len, pad_len=0)
    all_ngrams = 0
    grams = init_grams_dict(n_len, alphabet)
    for item in strlist:
        if item == '':
            continue
        ngram_list = list(n._split(item))  # _split is a private ngram.NGram helper
        for ng in ngram_list:
            if ng in grams:
                grams[ng] += 1.0
            all_ngrams += 1
    for item in grams.keys():
        grams[item] /= all_ngrams
    return grams
Developer: al-indigo, Project: transcriptome-assemblies-refiner, Lines: 16, Source: validation_checker.py
Example 11: verify
def verify(self, text_compare):
    results = []
    dictio = []
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        if linea2 != '\n':
            dictio += [self.ng.items_sharing_ngrams(linea2)]
            compares = 0.0
            for parrafo in self.lsn:
                comp = NGram.compare(parrafo, linea2)
                if compares < comp:
                    compares = comp
            results += [compares]
        linea2 = file2.readline()
    file2.close()
    major_ocurrences = []
    for d in dictio:
        major = 0
        for val in d.values():
            if major < val:
                major = val
        major_ocurrences += [major]
    avg_perc = 0.0
    for r in results:
        avg_perc += r
    avg_perc = avg_perc / len(results)
    print("Highest number of occurrences per paragraph of the copied text: " + repr(major_ocurrences))
    print("Similarity percentage: " + repr(avg_perc))
Developer: elard28, Project: plagiarism-ngram, Lines: 33, Source: init.py
Example 12: main
def main():
    questions_path, answers_path = sys.argv[1:]
    print("Reading Corpus:")
    train_sentences = read_corpus('train_data', disp=True)
    print('\nTraining on Corpus')
    model = NGram.train_model(train_sentences, disp=True)
    with open(answers_path, 'r') as answer_file:
        answers = get_sentences(untokenized_text=answer_file.read(),
                                is_tokenized=True,
                                token_start_end=('<s>', '</s>'))
    dev_sentences = answers[:520]
    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    lambdas = optimize_lambdas(model)
    with open(questions_path, 'r') as question_file:
        questions = get_sentences(untokenized_text=question_file.read(),
                                  is_tokenized=True,
                                  token_start_end=('<s>', '</s>'))
    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)
    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as out_file:
        for i, perplexity in enumerate(sentences_perplexity):
            out_file.write('{}\t{}\n'.format(' '.join(questions[i]).replace('<s0> <s1>', '<s>'), perplexity))
Developer: EthanWelsh, Project: N-Grams, Lines: 33, Source: extrinsic.py
Example 13: test_count_1gram
def test_count_1gram(self):
    ngram = NGram(1, self.sents)
    counts = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    for gram, c in counts.items():
        self.assertEqual(ngram.count(gram), c)
Developer: Mallku2, Project: PLN-2015, Lines: 17, Source: test_ngram.py
Example 14: compare_ngrams
def compare_ngrams(left, right, N=2, pad_len=0):
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for a one-letter comparison, even if
        # the letters are equal.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
Developer: digideskio, Project: addok, Lines: 8, Source: text.py
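A small self-contained check of the wrapper above. Note that `ascii` here is addok's transliteration helper, not the Python builtin; the identity stand-in below is an assumption sufficient for ASCII inputs:

from ngram import NGram

def ascii(s):
    # stand-in for addok's transliteration helper (assumption: identity on ASCII)
    return s

print(compare_ngrams('a', 'a'))   # 1.0 via the single-letter special case
print(compare_ngrams('a', 'b'))   # 0.0
print(compare_ngrams('paris', 'parys'))  # bigram similarity with the default N=2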
Example 15: test_ngram_search
def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the
    rewrite still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'), [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Developer: DavidBrear, Project: python-ngram, Lines: 18, Source: test_ngram.py
Example 16: backoff_score_strings
def backoff_score_strings(iline, jline, N, T=0.0):
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    score = -1
    while score <= T and N >= 1:
        score = NGram.compare(iString, jString, N=N)
        N = N - 1
    return score
Developer: mayhewsw, Project: HMMClustering, Lines: 10, Source: clusteralgorithm.py
Example 17: cumulative_score_strings
def cumulative_score_strings(iline, jline, N):
    iString = " ".join(iline.split(" :::: ")[:3])
    jString = " ".join(jline.split(" :::: ")[:3])
    score = 0
    while N >= 1:
        score += (NGram.compare(iString, jString, N=N))  # * N)
        N = N - 1
    return score
Developer: mayhewsw, Project: HMMClustering, Lines: 10, Source: clusteralgorithm.py
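A quick comparison of the two scorers above on invented " :::: "-delimited lines: backoff_score_strings lowers N until the NGram.compare score clears the threshold T, while cumulative_score_strings sums the scores over every order from N down to 1:

from ngram import NGram

iline = "the cat sat :::: DT NN VBD :::: doc1"
jline = "the cat sat down :::: DT NN VBD RP :::: doc2"

print(backoff_score_strings(iline, jline, N=4, T=0.1))
print(cumulative_score_strings(iline, jline, N=4))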
Example 18: __init__
def __init__(self, text):
    self.ng = NGram()
    file = open(text, "r")
    linea = file.readline()
    while linea != '':
        if linea != '\n':
            self.ng.add(linea)
        linea = file.readline()
    self.lsn = list(self.ng)
    file.close()
Developer: elard28, Project: plagiarism-ngram, Lines: 10, Source: init.py
Example 19: wordsoccurrences
def wordsoccurrences(self, words_list, option='ortony'):
    frequencies = FreqDist(words_list)
    ordered_unigrams = frequencies.most_common()
    if option == 'ortony':
        lexicon = self.ortony_list
    else:
        lexicon = self.profane_words
    count = 0
    three_grams = NGram(lexicon)  # build the index once, outside the loop
    for t_word, count_w in ordered_unigrams:
        lower_word = t_word.lower()
        likely_words = three_grams.search(lower_word)
        if len(likely_words) > 0:
            # if lower_word in lexicon:
            count += 1 * count_w
        if lower_word in lexicon:
            count += 1
    return count
Developer: ARGHZ, Project: ClassifTweets, Lines: 19, Source: execute_xperiment.py
Example 20: verify
def verify(self, text_compare):
    results = []
    texto = []
    '''
    file2 = open(text_compare, "r")
    for linea2 in file2.readlines():
        texto += linea2.split(" ")
    tng = NGram(texto)
    file2.close()
    '''
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        texto += linea2.split(" ")
        linea2 = file2.readline()
    tng = NGram(texto)
    file2.close()
    for ngs in self.ng:
        count = 0
        for word in list(ngs):
            for porc in tng.search(word):
                if porc[1] > 0.3:
                    count += 1
        results += [count]
    print(list(results))
    pos = 0
    count = 0
    i = 0
    for res in results:
        if count < res:
            count = res
            pos = i
        i += 1
    if results[pos] > 2:
        print("Most likely topic of the text: " + repr(self.topic[pos]))
    else:
        print("Could not determine what the text is about")
    print("")
Developer: elard28, Project: TopicDetector, Lines: 42, Source: topic.py
Note: The ngram.NGram class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets are drawn from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors. Consult the corresponding project's License before distributing or using the code. Do not reproduce without permission.