This article collects and summarizes typical usage examples of the edit_distance function from nltk.metrics.distance in Python. If you are wondering what edit_distance does, how to call it, or what real-world uses look like, the hand-picked code samples below should help.
The following presents 20 code examples of the edit_distance function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code samples.
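Before the project-level examples, here is a minimal standalone sketch of how edit_distance is typically imported and called. The input strings and token lists are illustrative assumptions, not taken from any of the projects below; the function computes the Levenshtein distance between two sequences and accepts an optional transpositions flag.

from nltk.metrics.distance import edit_distance

# Character-level Levenshtein distance (insertions, deletions, substitutions)
print(edit_distance("kitten", "sitting"))                              # 3

# Count adjacent transpositions ("ab" <-> "ba") as a single edit
print(edit_distance("abc", "bac", transpositions=True))                # 1

# The arguments may be any sequences, e.g. lists of tokens for word-level distance
print(edit_distance("the cat sat".split(), "the cat sits".split()))    # 1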
Example 1: process_rel_candidate_for_drop_led
def process_rel_candidate_for_drop_led(relnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_rel):
    simple_sentence = " ".join(simple_sentences)

    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())

    temp_nodeset, temp_filtered_mod_pos = boxer_graph.drop_relation(nodeset, relnode_candidate, filtered_mod_pos)
    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, temp_filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())

    isDrop = compare_edit_distance(opr_drop_rel, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
Developer: shashiongithub, Project: Sentence-Simplification-ACL14, Lines: 12, File: methods_training_graph.py
Example 2: process_ood_candidate_for_drop_led
def process_ood_candidate_for_drop_led(oodnode_candidate, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_ood):
    simple_sentence = " ".join(simple_sentences)

    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())

    temp_nodeset = nodeset[:]
    temp_nodeset.remove(oodnode_candidate)
    sentence_after_drop = boxer_graph.extract_main_sentence(temp_nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())

    isDrop = compare_edit_distance(opr_drop_ood, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
Developer: shashiongithub, Project: Sentence-Simplification-ACL14, Lines: 13, File: methods_training_graph.py
Example 3: make_compatible
def make_compatible(input_str):
    for i in range(len(rer_out['taglist'])):
        if(rer_out['taglist'][i] == "Org"):
            for j in allprods:
                if(dist.edit_distance(rer_out['wordlist'][i], j) < 2):
                    rer_out['wordlist'][i] = j
                    break
        if(rer_out['taglist'][i] == "Family"):
            for j in allprods:
                for k in allprods[j]:
                    if(dist.edit_distance(rer_out['wordlist'][i], k) < 4):
                        rer_out['wordlist'][i] = k
                        break
Developer: rashrag, Project: nlp-eval-day2, Lines: 13, File: qgen_sanj.py
Example 4: process_mod_candidate_for_drop_led
def process_mod_candidate_for_drop_led(modcand_to_process, filtered_mod_pos, nodeset, simple_sentences, main_sent_dict, boxer_graph, opr_drop_mod):
    simple_sentence = " ".join(simple_sentences)

    sentence_before_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, filtered_mod_pos)
    edit_dist_before_drop = edit_distance(sentence_before_drop.split(), simple_sentence.split())

    modcand_position_to_process = modcand_to_process[0]
    temp_filtered_mod_pos = filtered_mod_pos[:] + [modcand_position_to_process]
    sentence_after_drop = boxer_graph.extract_main_sentence(nodeset, main_sent_dict, temp_filtered_mod_pos)
    edit_dist_after_drop = edit_distance(sentence_after_drop.split(), simple_sentence.split())

    isDrop = compare_edit_distance(opr_drop_mod, edit_dist_after_drop, edit_dist_before_drop)
    return isDrop
Developer: shashiongithub, Project: Sentence-Simplification-ACL14, Lines: 13, File: methods_training_graph.py
Example 5: one2ArrEditDistance
def one2ArrEditDistance(self, sen, arr):
    score = []
    for l in arr:
        score.append(edit_distance(sen, l))
    if len(score) != 0:
        return sum(score)*1.0/len(score)
    return 0
Developer: siyuqtt, Project: URL_twitter, Lines: 7, File: paraphrase.py
Example 6: mean_char_edit_distance
def mean_char_edit_distance(candidates, references):
    total_distance = 0
    total_target_length = 0
    for y, t in zip(candidates, references):
        total_distance += edit_distance(y, t)
        total_target_length += len(t)
    return total_distance/total_target_length
Developer: Styrke, Project: master-code, Lines: 7, File: performancemetrics.py
Example 7: levenshtein_sort
def levenshtein_sort(self, keyword, domains):
    """
    Sort domains by Levenshtein edit-distance

    :param keyword: str input source
    :param domains: domains list
    :rtype: list
    :return: sorted names list
    """
    # distance counter
    # transpositions - ab == ba
    distance = lambda s, d: edit_distance(s, d, transpositions=True)

    # remove zone
    get_str = lambda domain: re.sub('([.][a-z]{2,4})+$', '', domain)
    domains = map(get_str, domains)

    # Sorter
    for i in range(len(domains)):
        for j in range(len(domains) - 1):
            if (distance(keyword, get_str(domains[j])) >
                    distance(keyword, get_str(domains[j + 1]))):
                tmp = copy(domains[j + 1])
                domains[j + 1] = domains[j]
                domains[j] = tmp
    return domains
Developer: ksantr, Project: DomainSuggester, Lines: 26, File: suggester.py
Example 8: select_anagrams
def select_anagrams(token, structures):
    """Select possible anagrams for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Possible anagrams (keys) along with their score (values)
    """
    anagrams = {}
    focus_alphabet = generate_alphabet_from_word(token[1])
    token_hash = anagram_hash(token)

    hash_list = []
    for c in structures["alphabet"]:
        for f in focus_alphabet:
            hash_list.append(token_hash + c - f)

    hash_counter = Counter(hash_list)  # Counting retrieval occurence

    for h in set(hash_counter.keys()).intersection(set(structures["anagrams"].keys())):
        count = hash_counter[h]
        anag_list = [anag for anag in structures["anagrams"][h] if edit_distance(anag, token) <= 3]

        for anag in anag_list:
            anag_score = rate_anagram(structures["occurence_map"], token, anag, count)

            if anag_score > 0:
                anagrams[anag] = anag_score

    return anagrams
Developer: pdessauw, Project: ocr-pipeline, Lines: 32, File: utils.py
Example 9: __init__
def __init__(self):
    self.stemmer = LancasterStemmer()
    self.stem_mapping = {}
    self.stemmed_trie = TrieNode()
    self.trie = TrieNode()
    self.singles_lst = []
    self.black_listed_stems = set([])

    loaded = cPickle.load(open(DICTIONARY, 'r'))
    print len(loaded)
    loaded += CUSTOM
    loaded = set(loaded)
    most_common = cPickle.load(open(MOST_COMMON, 'r'))

    for word in most_common:
        self.black_listed_stems.add(self.stem(word))
    #print self.black_listed_stems

    for word in loaded:
        word = word.lower()
        if word not in most_common[:TOP_K_FILTER]:
            self.trie.insert(word)
            stemmed_word = self.stem(word)
            if stemmed_word in self.stem_mapping:
                previous = self.stem_mapping[stemmed_word]
                edist = distance.edit_distance(word, previous)
                if edist > 2:
                    pass
                    #print 'warning: %s dropped in favor of %s' % (word, previous)
            else:
                if stemmed_word not in self.black_listed_stems:
                    self.stem_mapping[stemmed_word] = word
                    self.stemmed_trie.insert(stemmed_word)
Developer: sergeyk, Project: csrec, Lines: 30, File: freebase_interest_disambiguator.py
Example 10: get_candidates
def get_candidates(self, word, D=1):
    """If word is in lexicon returns [(word, 1.0)].
    Otherwise returns a list with all the words in lexicon that have
    a distance less than or equal to D (D is the Levenshtein edit-distance).
    If there is no such word, returns [(word, 0.0)]
    """
    word = word.lower()
    if word in self.fdist:
        return [(word, 1.0)]

    candidates = []
    counts = []
    for w, c in self.fdist.iteritems():
        if edit_distance(w, word) <= D:
            candidates.append(w)
            counts.append(c)

    if len(candidates) == 0:
        candidates.append(word)
        counts.append(0)

    probs = [float(c) / self.wcount for c in counts]
    return sorted(zip(candidates, probs), key=lambda x: x[1], reverse=True)
Developer: paglian, Project: afip-query, Lines: 25, File: spellchecker.py
Example 11: replace
def replace(self, word):
    suggestions = self.spell_dict.suggest(word)

    if suggestions:
        for suggestion in suggestions:
            if edit_distance(word, suggestion) <= self.max_dist:
                return suggestions[0]

    return word
Developer: HarryCordewener, Project: NLP_Walter_Harry, Lines: 9, File: run.py
Example 12: get_geonames_code
def get_geonames_code(m):
    lat = session.scalar(m._geo_ponto.y)
    lon = session.scalar(m._geo_ponto.x)
    places = geonames_reverse(lat, lon)
    for place in places:
        nome1 = m.nome.strip().lower()
        nome2 = place[u'name'].strip().lower()
        if edit_distance(nome1, nome2) < 2:
            return int(place[u'geonameId'])
Developer: dadosgovbr, Project: api-siconv, Lines: 9, File: geonames.py
Example 13: eval
def eval(references):
    string_distances = {'siddharthan':[], 'bayes_no_variation':[], 'bayes_variation':[]}
    jaccard_distances = {'siddharthan':[], 'bayes_no_variation':[], 'bayes_variation':[]}
    for reference in references:
        print reference
        string_distances['siddharthan'].append(edit_distance(reference['original'], reference['siddharthan']))
        string_distances['bayes_no_variation'].append(edit_distance(reference['original'], reference['bayes_no_variation']))
        string_distances['bayes_variation'].append(edit_distance(reference['original'], reference['bayes_variation']))

        # jaccard_distances['siddharthan'].append(jaccard_distance(reference['original'], reference['siddharthan']))
        # jaccard_distances['bayes_no_variation'].append(jaccard_distance(reference['original'], reference['bayes_no_variation']))
        # jaccard_distances['bayes_variation'].append(jaccard_distance(reference['original'], reference['bayes_variation']))

    print 'String distances: '
    print 'siddharthan: ', mean_confidence_interval(string_distances['siddharthan'])
    print 'bayes_no_variation: ', mean_confidence_interval(string_distances['bayes_no_variation'])
    print 'bayes_variation: ', mean_confidence_interval(string_distances['bayes_variation'])
    print 10 * '-'
Developer: ThiagoCF05, Project: ProperName, Lines: 19, File: intr_eval.py
Example 14: close_enough_buckets
def close_enough_buckets(first_bucket, second_bucket, dist):
    if first_bucket == second_bucket:
        return False
    elif edit_distance(first_bucket, second_bucket) <= dist:
        return True
    else:
        return False
Developer: tbonza, Project: data_mining, Lines: 10, File: clusters.py
Example 15: strip_synonyms
def strip_synonyms(output_set, exclude_set):
    # Remove synonyms that have Levenshtein distance of 1, AFTER removing plurals.
    for word in output_set:
        for synset in wn.synsets(word):
            for synonym in synset.lemma_names():
                if edit_distance(word, synonym) == 1:
                    exclude_set.add(synonym)
    output_set.difference_update(exclude_set)
    return output_set, exclude_set
Developer: sampablokuper, Project: dwgen, Lines: 10, File: normalize.py
Example 16: _match_by_edit_distance
def _match_by_edit_distance(full_text, text_to_match):
    text_to_match = text_to_match.replace("-LRB-", "(").replace("-RRB-", ")")
    text_to_match = text_to_match.replace("-LCB-", "{").replace("-RCB-", "}")
    text_to_match = re.sub(r'\[\\\]\\\)\]$', ')', text_to_match)

    try:
        end_point = (text_to_match.index(" ") if " " in text_to_match else len(text_to_match))
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in
                             re.finditer(re.escape(text_to_match[0:end_point]), full_text, re.U | re.I)]
    except:
        import sys
        print(full_text)
        print()
        print(text_to_match)
        sys.exit(1)

    if len(potential_matches) == 0:
        potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in
                             re.finditer(re.escape(text_to_match[0]), full_text, re.U)]

        if len(potential_matches) == 0:
            text_to_match = text_to_match.replace("(", "[")
            potential_matches = [full_text[m.start():(m.start() + len(text_to_match) + 1)] for m in
                                 re.finditer(re.escape(text_to_match[0]), full_text, re.U)]

    potential_matches = [(p[0:p.rindex(text_to_match[-1])+1]
                          if text_to_match[-1] in p and len(p) > len(text_to_match)
                          else p)
                         for p in potential_matches]

    if len(potential_matches) == 0:
        # No idea why this would ever happen, but it does
        return text_to_match

    match_with_lowest_edit_distance = ""
    lowest_edit_distance = -1
    for match in potential_matches:
        e_d = edit_distance(match, text_to_match)
        if lowest_edit_distance == -1 or e_d <= lowest_edit_distance:
            lowest_edit_distance = e_d
            match_with_lowest_edit_distance = match

    result = match_with_lowest_edit_distance.strip()

    if text_to_match[-1] in result:
        while result[-1] != text_to_match[-1]:
            result = result[0:-1]
    elif text_to_match[-1] == '"' and re.search(r'["”\u201d]', result):
        while result[-1] not in ['"', '”', "\u201d"]:
            result = result[0:-1]
    elif text_to_match[-1] not in [']', '}', ')'] and text_to_match[-2:] != "..":
        while result[-1] != text_to_match[-1]:
            result += full_text[full_text.index(result) + len(result)][-1]

    return result
Developer: EducationalTestingService, Project: match, Lines: 54, File: Match.py
Example 17: get_X
def get_X(lines, features, cache):
    if cache == None:
        cache = {}
    tokenizer = RegexpTokenizer(r'[a-z]+')
    X = []
    for line1 in lines:
        vector = []
        for line2 in lines:
            vector.append(edit_distance(line1, line2)/max(len(line1), len(line2)))
        max_v = max(vector)
        for i in range(len(vector)):
            vector[i] = vector[i] / max_v

        syn_dist = {}
        for word in features:
            syn_dist[word] = 1

        for word in set(tokenizer.tokenize(line1.lower())):
            if word in stopwords.words('english'):
                continue
            for word2 in features:
                if (len(wn.synsets(word)) == 0 or len(wn.synsets(word2)) == 0):
                    continue
                else:
                    if (word not in cache):
                        cache[word] = {}
                    if (word2 not in cache[word]):
                        similarity = [w1.wup_similarity(w2)
                                      for w1 in wn.synsets(word, pos=wn.NOUN)
                                      + wn.synsets(word, pos=wn.VERB)
                                      for w2 in wn.synsets(word2, pos=wn.NOUN)
                                      + wn.synsets(word2, pos=wn.VERB)]
                        similarity = [s for s in similarity if s]
                        if (len(similarity) != 0):
                            cache[word][word2] = max(similarity)
                        else:
                            cache[word][word2] = None
                        #cache[word][word2] = wn.synsets(word)[0].path_similarity(wn.synsets(word2)[0])
                        #cache[word][word2] = wn.synsets(word)[0].wup_similarity(wn.synsets(word2)[0])
                    if (not cache[word][word2]):
                        continue
                    dist = 1 - cache[word][word2]
                    if (dist < syn_dist[word2]):
                        syn_dist[word2] = dist

        for word in features:
            vector.append(syn_dist[word])
        X.append(vector)

    return X, cache
Developer: cmaclell, Project: word_clustering, Lines: 54, File: altusewn.py
Example 18: model_evaluate
def model_evaluate(model, d, gt):
    model_h.add_data(d, trunc=25)
    inf = model_h.states_list[0].stateseq
    inf = list(inf)

    dist = edit_distance(gt, inf)

    s_gt, s_inf = set(gt), set(inf)
    iou = len(s_gt.intersection(s_inf)) / len(s_gt.union(s_inf))

    return dist, iou
Developer: YangWun, Project: lip-reading, Lines: 11, File: evaluate_model.py
Example 19: probs_metric
def probs_metric(inverse=False):
    rand_p = Vec2(random()*table.width+table.min_point.x, random()*table.height+table.min_point.y)
    try:
        bestmeaning, bestsentence = generate_sentence(rand_p, False, scene, speaker, usebest=True, golden=inverse, printing=printing)
        sampled_landmark, sampled_relation = bestmeaning.args[0], bestmeaning.args[3]
        golden_posteriors = get_all_sentence_posteriors(bestsentence, meanings, golden=(not inverse), printing=printing)

        # lmk_prior = speaker.get_landmark_probability(sampled_landmark, landmarks, PointRepresentation(rand_p))[0]
        all_lmk_probs = speaker.all_landmark_probs(landmarks, Landmark(None, PointRepresentation(rand_p), None))
        all_lmk_probs = dict(zip(landmarks, all_lmk_probs))

        lmk_prior = all_lmk_probs[sampled_landmark]
        head_on = speaker.get_head_on_viewpoint(sampled_landmark)
        rel_prior = speaker.get_probabilities_points( np.array([rand_p]), sampled_relation, head_on, sampled_landmark)
        lmk_post = golden_posteriors[sampled_landmark]
        rel_post = golden_posteriors[sampled_relation]

        ps = np.array([golden_posteriors[lmk]*golden_posteriors[rel] for lmk, rel in meanings])
        rank = None
        for i, p in enumerate(ps):
            lmk, rel = meanings[i]
            # logger( '%f, %s' % (p, m2s(lmk,rel)))
            head_on = speaker.get_head_on_viewpoint(lmk)
            # ps[i] *= speaker.get_landmark_probability(lmk, landmarks, PointRepresentation(rand_p))[0]
            ps[i] *= all_lmk_probs[lmk]
            ps[i] *= speaker.get_probabilities_points( np.array([rand_p]), rel, head_on, lmk)
            if lmk == sampled_landmark and rel == sampled_relation:
                idx = i

        ps += epsilon
        ps = ps/ps.sum()
        prob = ps[idx]
        rank = sorted(ps, reverse=True).index(prob)
        entropy = entropy_of_probs(ps)
    except (ParseError, RuntimeError) as e:
        logger( e )
        lmk_prior = 0
        rel_prior = 0
        lmk_post = 0
        rel_post = 0
        prob = 0
        rank = len(meanings)-1
        entropy = 0
        distances = [[None]]

    head_on = speaker.get_head_on_viewpoint(sampled_landmark)
    all_descs = speaker.get_all_meaning_descriptions(trajector, scene, sampled_landmark, sampled_relation, head_on, 1)

    distances = []
    for desc in all_descs:
        distances.append([edit_distance( bestsentence, desc ), desc])
    distances.sort()

    return lmk_prior,rel_prior,lmk_post,rel_post,\
           prob,entropy,rank,distances[0][0],type(sampled_relation)
Developer: arebgun, Project: bolt, Lines: 53, File: object_correction_testing.py
Example 20: validate_password_dictionary
def validate_password_dictionary(value):
    """
    Ensures that the password is not too similar to a defined set of dictionary words
    """
    password_max_edit_distance = getattr(settings, "PASSWORD_DICTIONARY_EDIT_DISTANCE_THRESHOLD", None)
    password_dictionary = getattr(settings, "PASSWORD_DICTIONARY", None)

    if password_max_edit_distance and password_dictionary:
        for word in password_dictionary:
            distance = edit_distance(value, word)
            if distance <= password_max_edit_distance:
                raise ValidationError(_("Too similar to a restricted dictionary word."), code="dictionary_word")
Developer: TeachAtTUM, Project: edx-platform, Lines: 12, File: password_policy_validators.py
Note: The nltk.metrics.distance.edit_distance examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective authors, and copyright of the source code remains with the original authors; please consult the corresponding project's license before redistributing or reusing it. Do not reproduce this article without permission.