This article collects typical usage examples of the Python function nltk.metrics.edit_distance. If you have been wondering what exactly edit_distance does, how to call it, or where to find real usage examples, the curated code samples below may help.
Shown below are 20 code examples of the edit_distance function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
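Before the project-specific examples, here is a minimal, self-contained sketch of what the function computes; the sample strings are illustrative only and do not come from any of the projects below.

from nltk.metrics import edit_distance

# Levenshtein distance: the minimum number of single-character
# insertions, deletions, and substitutions needed to turn one
# string into the other.
print(edit_distance('kitten', 'sitting'))  # 3
print(edit_distance('rain', 'shine'))      # 3

# With transpositions=True, swapping two adjacent characters also
# counts as a single edit (Damerau-Levenshtein).
print(edit_distance('abc', 'acb', transpositions=True))  # 1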
Example 1: get_abstract_by_title

from nltk.metrics import edit_distance
import math
import metapub

def get_abstract_by_title(title):
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>')
    print('searching entry with title: ' + title)
    fetch = metapub.PubMedFetcher()
    pmids = fetch.pmids_for_query(title)
    if len(pmids) == 0:
        print('warning: no entry retrieved for given title')
        return None, ''
    elif len(pmids) == 1:
        article = fetch.article_by_pmid(pmids[0])
        # accept the entry if its title is within 10% of the query title's
        # length in edit distance and it actually has an abstract
        if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract is not None:
            print('successfully matched title: ' + article.title)
            return article.title, article.abstract
        else:
            print('warning: found one entry but not a match')
            return None, ''
    else:
        print('warning: retrieved more than one entry for given title')
        for i in range(min(20, len(pmids))):
            article = fetch.article_by_pmid(pmids[i])
            if edit_distance(article.title, title) <= math.ceil(len(title) * 0.1) and article.abstract is not None:
                print('successfully matched title: ' + article.title)
                return article.title, article.abstract
        print('warning: no entry is a match')
        return None, ''

Developer: tacitia, Project: ThoughtFlow, Lines: 25, Source: PubMedQuerier.py
Example 2: get_related_evidence

from nltk.metrics import edit_distance
import metapub
# _merge_related_pmids is defined elsewhere in the same module

def get_related_evidence(title):
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>')
    try:
        print('given title: ' + title)
        # TODO: fix this...
    except UnicodeEncodeError:
        print('title cannot be printed - it raised a unicode encode error')
        return [], {}, 0
    fetch = metapub.PubMedFetcher()
    pmids = fetch.pmids_for_query(title)
    if len(pmids) == 1:
        article = fetch.article_by_pmid(pmids[0])
        # accept the entry if its title is within 10% of the query title's length
        if edit_distance(article.title, title) <= len(title) * 0.1:
            print('matched title: ' + article.title)
            related_pmids = fetch.related_pmids(pmids[0])
            return _merge_related_pmids(pmids[0], related_pmids, fetch)
    elif len(pmids) > 1:
        for i in range(min(20, len(pmids))):
            article = fetch.article_by_pmid(pmids[i])
            if edit_distance(article.title, title) <= len(title) * 0.1:
                print('matched title: ' + article.title)
                related_pmids = fetch.related_pmids(pmids[i])
                return _merge_related_pmids(pmids[i], related_pmids, fetch)
    print('no match found')
    return [], {}, 0

Developer: tacitia, Project: ThoughtFlow, Lines: 26, Source: PubMedQuerier.py
Example 3: matches_author

def matches_author(self, string, fuzzy=False, distance_threshold=3):
    """
    Retrieves from the KnowledgeBase the possible authors that match the search string.
    None is returned if no matches are found.
    :param string: the string to be matched
    :param fuzzy: whether exact or fuzzy string matching should be applied
    :param distance_threshold: the maximum edit distance threshold (ignored if `fuzzy==False`)
    :return: a list of tuples, ordered by distance between the search and the matching string, where:
        tuple[0] contains the id (i.e. CTS URN) of the matching author
        tuple[1] contains a label of the matching author
        tuple[2] is the distance, measured in characters, between the search string and the matching string
        or None if no match is found.
    """
    # string = string.lower()
    author_matches, abbr_matches = [], []
    if not fuzzy:
        # exact matching via the word indexes; the recorded "distance" is
        # simply the length difference between match and query
        author_matches = [(id.split("$$")[0],
                           self._author_names[id],
                           len(self._author_names[id]) - len(string))
                          for id in self._author_idx.searchAllWords(string)]
        abbr_matches = [(id.split("$$")[0],
                         self._author_abbreviations[id],
                         len(self._author_abbreviations[id]) - len(string))
                        for id in self._author_abbr_idx.searchAllWords(string)]
    else:
        # fuzzy matching: keep abbreviations within the edit distance threshold
        abbr_matches = [(id.split("$$")[0],
                         self._author_abbreviations[id],
                         edit_distance(string, self._author_abbreviations[id]))
                        for id in self._author_abbreviations
                        if edit_distance(string, self._author_abbreviations[id]) <= distance_threshold]
        abbr_matches = sorted(abbr_matches, key=itemgetter(2))
        author_matches = []
        for id in self._author_names:
            if string.endswith("."):
                # the query looks like an abbreviated name
                if string.replace(".", "") in self._author_names[id]:
                    if len(string) > (len(self._author_names[id]) // 2):
                        try:
                            assert abbr_matches[0][2] == 0
                            distance = len(self._author_names[id]) - len(string)
                            if distance < 0:
                                distance = 1
                            author_matches.append((id.split("$$")[0], self._author_names[id], distance))
                        except Exception:
                            author_matches.append((id.split("$$")[0], self._author_names[id], 0))
                else:
                    if edit_distance(string, self._author_names[id]) <= distance_threshold:
                        author_matches.append((id.split("$$")[0], self._author_names[id],
                                               edit_distance(string, self._author_names[id])))
            else:
                if edit_distance(string, self._author_names[id]) <= distance_threshold:
                    author_matches.append((id.split("$$")[0], self._author_names[id],
                                           edit_distance(string, self._author_names[id])))

Developer: mromanello, Project: CitationExtractor, Lines: 59, Source: matchers.py
Example 4: searchEvidenceByTitle

def searchEvidenceByTitle(request):
    if request.method == 'POST':
        data = json.loads(request.body)
        collection_id = data['collection_id']
        title = data['title']
        result_limit = data['result_limit']
        include_personal = data['include_personal']
        user_id = data['user_id']
        # DONE: we can alternatively change this to treat the given title as a series of separate terms
        title_terms = title.split(' ')
        print(title_terms)
        # require every term of the title to appear in the evidence title
        # (on Python 3 this needs `from functools import reduce`)
        evidence = Evidence.objects.filter(Q(created_by=collection_id) & reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms]))
        if include_personal:
            personal_evidence = Evidence.objects.filter(Q(created_by=user_id) & reduce(lambda x, y: x & y, [Q(title__icontains=word) for word in title_terms]))
            evidence = chain(evidence, personal_evidence)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        pprint.pprint(evidence)
        # rank candidates by edit distance to the query title
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        print('result limit:', result_limit)
        evidence = sorted(evidence, key=lambda e: e['dist'])[:result_limit]
        for e in evidence:
            e['topic'] = -1
            try:
                e['topic'] = EvidenceTopic.objects.get(evidence=e['id']).primary_topic
            except ObjectDoesNotExist:
                if len(e['abstract']) > 50:
                    name = Collection.objects.get(collection_id=collection_id).collection_name
                    topic_dist, primary_topic_terms = TopicModeler.get_document_topics(e['abstract'], name)
                    primary_topic_tuple = max(topic_dist, key=lambda x: x[1])
                    e['topic'] = primary_topic_tuple[0]
                else:
                    print('warning: evidence with no topic')
        return HttpResponse(json.dumps(evidence), status=status.HTTP_200_OK)
    elif request.method == 'GET':
        collection_id = 13
        title = 'UpSet: Visualization of Intersecting Sets'
        evidence = Evidence.objects.filter(created_by=collection_id)
        serialized_json = serializers.serialize('json', evidence)
        evidence_json = flattenSerializedJson(serialized_json)
        evidence = json.loads(evidence_json)
        for e in evidence:
            e['dist'] = edit_distance(title, e['title'])
        evidence = sorted(evidence, key=lambda e: e['dist'])
        return HttpResponse(json.dumps(evidence[:20]), status=status.HTTP_200_OK)

Developer: tacitia, Project: ThoughtFlow, Lines: 49, Source: views_service.py
Example 5: string_matching

from nltk.metrics import edit_distance
import sys
import traceback

def string_matching(label1, label2):  # by Maedchen and Staab
    """ (string, string) -> float
    Return the coefficient of similarity between two strings, based on
    the Levenshtein distance (edit distance). It equals 1 for an exact
    match and 0 for no similarity.
    >>> string_matching('power','power')
    1.0
    >>> string_matching('power','abba')
    0.0
    """
    # similarity relative to the shorter label; can go negative when the
    # edit distance exceeds the shorter label's length, hence the clamp
    sm = float(min(len(label1), len(label2)) -
               edit_distance(label1, label2)) / min(len(label1), len(label2))
    try:
        if sm < 0:
            return 0.0
        else:
            return sm
    except Exception:
        print("Error found:")
        traceback.print_exc(file=sys.stdout)
        return 0

Developer: Schiessl, Project: tese, Lines: 25, Source: lixo.py
Example 6: replace

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    distance = [edit_distance(word, suggested) for suggested in suggestions]
    print(suggestions)
    print(distance)
    ret_val = word
    # guard against an empty suggestion list before calling min()
    if distance and min(distance) <= self.max_dist:
        # default to the closest suggestion, but prefer the first closest
        # suggestion that has the same length as the original word
        ret_val = suggestions[distance.index(min(distance))]
        length_matched = False
        for i, ed in enumerate(distance):
            if ed == min(distance):
                if len(word) == len(suggestions[i]) and not length_matched:
                    ret_val = suggestions[i]
                    length_matched = True
    return ret_val

Developer: ashwinbansod, Project: Data_Cleaning, Lines: 32, Source: Enchant.py
Example 7: _GetScore
def _GetScore(self, query, match):
"""Custom edit-distance based scoring."""
str_query = str(query)
str_candidate = str(match.key)
dist = float(edit_distance(str_query, str_candidate))
max_len = float(max(len(str_query), len(str_candidate)))
return (max_len - dist) / max_len
Developer: issfangks, Project: milo-lab, Lines: 7, Source: approximate_matcher.py
Example 8: replace_word

def replace_word(self, word):
    if self.dictionary.check(word):
        return word
    suggestions = self.dictionary.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    # fall back to the original word instead of implicitly returning None
    return word

Developer: finde, Project: NLP1Emoticon, Lines: 7, Source: SpellingCheck.py
Example 9: replace

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    distance = [edit_distance(word, suggested) for suggested in suggestions]
    ret_val = word
    # guard against an empty suggestion list before calling min()
    if distance and min(distance) <= self.max_dist:
        # default to the closest suggestion, but prefer the first closest
        # suggestion that has the same length as the original word
        ret_val = suggestions[distance.index(min(distance))]
        for i, ed in enumerate(distance):
            if ed == min(distance) and len(word) == len(suggestions[i]):
                ret_val = suggestions[i]
                break
    return ret_val

Developer: ashwinbansod, Project: Data_Cleaning, Lines: 26, Source: clean.py
Example 10: spellChecker

from nltk.metrics import edit_distance
import csv
import enchant

def spellChecker(sentences, file_name_s):
    dict_name = 'en_GB'
    spell_dict = enchant.Dict(dict_name)
    max_dist = 3
    corrected = []
    # on Python 3, csv files are opened in text mode with newline=''
    csv_writer = csv.writer(open(file_name_s, 'w', newline=''))
    # csv_writer.writerow(HEADER2)
    for sentence in sentences:
        corrected_sent = ''
        sentence = str(sentence)
        # strip bracket and quote characters before splitting into words
        sc = set(["[", "]", "'", '"'])
        words = ''.join([c for c in sentence if c not in sc])
        words = words.split()
        for word in words:
            print(word)
            suggestions = spell_dict.suggest(word)
            # take the top suggestion if it is within the allowed edit distance
            if suggestions and edit_distance(word, suggestions[0]) <= max_dist:
                corrected_sent = corrected_sent + " " + suggestions[0]
            else:
                corrected_sent = corrected_sent + " " + word
        # str.replace returns a new string, so the result must be reassigned
        corrected_sent = corrected_sent.replace("[", "").replace("]", "").replace("'", "")
        corrected.append(corrected_sent)
        csv_writer.writerow([corrected_sent])
    print(corrected)

Developer: Nik0l, Project: UTemPro, Lines: 31, Source: UserLocation.py
Example 11: fuzzy_comparison

from nltk import metrics
import logging

def fuzzy_comparison(tokens_1, tokens_2, max_dist=1):
    """ compares the tokens based on fuzzy match """
    # init_term_1 / init_term_2 are not defined in this snippet; from the
    # arithmetic they appear to be the initial lengths of tokens_1 and
    # tokens_2 (module-level values in the original project)
    matched = 0
    matched_len_1 = init_term_1 - len(tokens_1)
    matched_len_2 = init_term_2 - len(tokens_2)
    for token in reversed(tokens_1):
        # skip and drop very short tokens
        if len(token) <= 2:
            tokens_1.remove(token)
            continue
        for tkn in reversed(tokens_2):
            if len(tkn) <= 2:
                tokens_2.remove(tkn)
                continue
            # count a match when the two tokens are within max_dist edits
            if metrics.edit_distance(token, tkn) <= max_dist:
                matched = matched + 1
                logging.debug("Match found for:" + token + " - " + tkn)
                tokens_2.remove(tkn)
                tokens_1.remove(token)
                break
    logging.info("Fuzzy match count:" + str(matched))
    score_1 = (matched_len_1 + matched) / float(init_term_1)
    score_2 = (matched_len_2 + matched) / float(init_term_2)
    return score_1, score_2

Developer: pratyush912, Project: dokkom, Lines: 25, Source: compare.py
Example 12: replace

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    # use the top suggestion only if it is within the allowed edit distance
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    else:
        return word

Developer: divyaduraisamy, Project: Fake-Review-Detection, Lines: 8, Source: replacers.py
Example 13: spell_check

def spell_check(r, a, s, scores, weight=1):
    # normalized edit-distance similarity between r and a, scaled by weight
    change = weight * (1 - (edit_distance(r, a) / float(max(len(r), len(a)))))
    if s in scores:
        # penalty for returning multiple of the same result when
        # one instance is incorrectly spelled
        return (scores[s] + change) / 2.0
    else:
        return change

Developer: gatesporter8, Project: EECS-337-Golden-Globes, Lines: 8, Source: autograder.py
Example 14: check_replace_word
def check_replace_word(word):
if spell_dict.check(word):
return word
suggestions = spell_dict.suggest(word)
if suggestions and edit_distance(word, suggestions[0]) < 2:
return suggestions[0]
else:
return word
Developer: rafaelpiresm, Project: yelp_challenge, Lines: 8, Source: udfs.py
Example 15: ordered_content_distance
def ordered_content_distance(self, sentence, normalized=True):
"""Normalized levenshtein distance on (ordered) content words
between `self` and `sentence`."""
self_content_words = self.content_words
sentence_content_words = sentence.content_words
distance = edit_distance(self_content_words, sentence_content_words)
norm = max(len(self_content_words), len(sentence_content_words))
return distance / norm if normalized else distance
Developer: interpretation-experiment, Project: analysis, Lines: 9, Source: linguistics.py
Example 16: get_string_similarity

def get_string_similarity(p_token, h_token):
    distance = edit_distance(h_token, p_token)
    max_length = max(len(h_token), len(p_token))
    score = 0
    if max_length > 2:
        score = 1 - (distance / (max_length - 1.99999999999999))
        # if score > 1:
        #     logging.warning('score > 1 for %s, %s' % (p_token, h_token))
    return max(0, score)

Developer: imclab, Project: entailment-api, Lines: 9, Source: Edit_featurizer.py
Example 17: raw_distance
def raw_distance(self, sentence, normalized=True):
"""Normalized levenshtein distance between `self.text` and
`sentence.text`."""
self_text = self.text
sentence_text = sentence.text
distance = edit_distance(self_text, sentence_text)
norm = max(len(self_text), len(sentence_text))
return distance / norm if normalized else distance
Developer: interpretation-experiment, Project: analysis, Lines: 9, Source: linguistics.py
Example 18: spell_correct

def spell_correct(unigrams, Dict):
    for i, raword in enumerate(unigrams):
        # skip empty tokens, mentions, and hashtags
        if not (raword == "" or raword[0] == '@' or raword[0] == '#'):
            suggestions = Dict.suggest(raword)
            if suggestions and not Dict.check(raword):
                if edit_distance(suggestions[0], raword) < 2:
                    # assign back into the list; rebinding the loop
                    # variable alone would leave `unigrams` unchanged
                    unigrams[i] = suggestions[0]
    return unigrams

Developer: memedum90, Project: TurkuazTurchese-, Lines: 10, Source: TT_prep.py
Example 19: similar

def similar(self, word):
    names = self.table_names() + self.column_names() + self.row_names()
    best = 100
    best_word = None
    # find the known name with the smallest edit distance to `word`
    for name in names:
        dist = edit_distance(name, word)
        if dist <= best:
            best, best_word = dist, name
    # print("Best word: " + best_word + " for " + word + ". Distance: " + str(dist))
    return best_word

Developer: sapresearch, Project: fuzzy_adventure, Lines: 10, Source: term_selector.py
Example 20: chug

def chug():
    # dir_ocr, ocr, lines, output, and p (a helper module providing depunc)
    # are defined at module level in the original project
    for title in dir_ocr:
        with open(ocr + title, "r") as o_open, open(lines, "r") as l_open:
            # lists of lines for each doc
            o_open_r = o_open.readlines()
            l_open_r = l_open.readlines()
            o_line = 0
            for o in o_open_r:
                d = {}
                # strip the ocr line of punctuation/whitespace
                o_1 = p.depunc(o)
                l_line = 0
                o_line += 1
                for l in l_open_r:
                    # strip the 'known' line of punctuation/whitespace
                    l_1 = p.depunc(l)
                    l_line += 1
                    # ignore ocr lines with few characters, but still count the line
                    if len(o_1) < 4:
                        continue
                    # skip ocr lines shorter than half or longer than 1.5x the
                    # reference 'known' line (does this improve performance?)
                    if len(o_1) < 0.5 * len(l_1) or len(o_1) > 1.5 * len(l_1):
                        continue
                    # compare ocr and known lines: similarity between 0 (not similar)
                    # and 1 (exact match); float() avoids integer division
                    x = len(o_1) + len(l_1)
                    dist = (x - metrics.edit_distance(o_1, l_1)) / float(x)
                    key = ('"' + title + '| ' + str(o_line) + '","' + o.rstrip("\n")
                           + '","' + 'line: ' + str(l_line) + '","' + l.rstrip("\n") + '"')
                    d[key] = dist
                # keep only the top-scoring known line for each ocr line;
                # append it to the output file if it scores above 0.85
                if len(d) > 0 and max(d.values()) > 0.85:
                    m = max(d, key=d.get)
                    with open(output, "a") as f:
                        f.write(str(m) + "," + str(max(d.values())) + "\n")
                    print(str(m) + ",", max(d.values()))

Developer: pleiovn, Project: ocr, Lines: 55, Source: nlp_chug.py
Note: the nltk.metrics.edit_distance examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by many programmers, and copyright of the source code remains with the original authors. Before distributing or reusing the code, please consult the License of the corresponding project; do not repost without permission.