在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
维基百科对字符串相似度(Damerau–Levenshtein distance)的定义是:
简单翻译下,两个字符序列的DL距离,就是从一个变换到另一个的最小操作次数。这个变换包括插入一个字符、删除一个字符、替换一个字符、或互换两个相邻字符。 而所谓“编辑距离(edit distance,或叫Levenshtein distance)”,并不包含互换两个相邻字符。 主要应用是在字符拼写检查上,当然也可以用在其他地方,比方不少输入法就提供类似的校正功能(搜狗拼音输入法即实现了此功能)。 看起来简单,实现还是有一定困难的,好在有牛人已经做好相应的函数,如 Kambiz 在 How to match two strings approximately 中提供了两个函数: 计算DL距离的函数DamerauLevenshteinDistance(Str1, Str2) function DamerauLevenshteinDistance(const Str1, Str2: string): Integer; var LenStr1, LenStr2: Integer; I, J, T, Cost, Minimum: Integer; pStr1, pStr2, S1, S2: PChar; D, RowPrv2, RowPrv1, RowCur, Temp: PIntegerArray; begin LenStr1 := Length(Str1); LenStr2 := Length(Str2); // to save some space, make sure the second index points to the shorter string if LenStr1 < LenStr2 then begin T := LenStr1; LenStr1 := LenStr2; LenStr2 := T; pStr1 := PChar(Str2); pStr2 := PChar(Str1); end else begin pStr1 := PChar(Str1); pStr2 := PChar(Str2); end; // to save some time and space, look for exact match while (LenStr2 <> 0) and (pStr1^ = pStr2^) do begin Inc(pStr1); Inc(pStr2); Dec(LenStr1); Dec(LenStr2); end; // when one string is empty, length of the other is the distance if LenStr2 = 0 then begin Result := LenStr1; Exit; end; // calculate the edit distance T := LenStr2 + 1; GetMem(D, 3 * T * SizeOf(Integer)); FillChar(D^, 2 * T * SizeOf(Integer), 0); RowCur := D; RowPrv1 := @D[T]; RowPrv2 := @D[2 * T]; S1 := pStr1; for I := 1 to LenStr1 do begin Temp := RowPrv2; RowPrv2 := RowPrv1; RowPrv1 := RowCur; RowCur := Temp; RowCur[0] := I; S2 := pStr2; for J := 1 to LenStr2 do begin Cost := Ord(S1^ <> S2^); Minimum := RowPrv1[J - 1] + Cost; // substitution T := RowCur[J - 1] + 1; // insertion if T < Minimum then Minimum := T; T := RowPrv1[J] + 1; // deletion if T < Minimum then Minimum := T; if (I <> 1) and (J <> 1) and (S1^ = (S2 - 1)^) and (S2^ = (S1 - 1)^) then begin T := RowPrv2[J - 2] + Cost; // transposition if T < Minimum then Minimum := T; end; RowCur[J] := Minimum; Inc(S2); end; Inc(S1); end; Result := RowCur[LenStr2]; FreeMem(D); end; 还有计算字符串相似度的函数 StringSimilarityRatio(Str1, Str2, IgnoreCase): Double; 返回值在0到1之间,0表示不相似,1表示完全相似。 function StringSimilarityRatio(const Str1, Str2: string; IgnoreCase: Boolean): Double; var MaxLen: Integer; Distance: Integer; begin Result := 1.0; if Length(Str1) > Length(Str2) then MaxLen := Length(Str1) else MaxLen := Length(Str2); if MaxLen <> 0 then begin if IgnoreCase then Distance := DamerauLevenshteinDistance(LowerCase(Str1), LowerCase(Str2)) else Distance := DamerauLevenshteinDistance(Str1, Str2); Result := Result - (Distance / MaxLen); end; end; 后来data man 参考一个德国人的ApproxStrUtils单元(该单元计算的是L距离,不是DL距离),给出一个据说效率更高的DL距离函数,遗憾的是调用它会有“Invalid Pointer Operation”的报错,还没有Debug出问题所在,所以暂时先用前一个版本吧。 function DamerauLevenshteinDistance2(const Str1, Str2: string): Integer; function Min(const A, B, C: Integer): Integer; inline; begin Result := A; if B < A then Result := B; if C < Result then Result := C; end; var LenStr1, LenStr2: Integer; I, J, T, Cost, PrevCost: Integer; pStr1, pStr2, S1, S2: PChar; D: PIntegerArray; begin LenStr1 := Length(Str1); LenStr2 := Length(Str2); // to save some space, make sure the second index points to the shorter string if LenStr1 < LenStr2 then begin T := LenStr1; LenStr1 := LenStr2; LenStr2 := T; pStr1 := PChar(Str2); pStr2 := PChar(Str1); end else begin pStr1 := PChar(Str1); pStr2 := PChar(Str2); end; // to save some time and space, look for exact match while (LenStr2 <> 0) and (pStr1^ = pStr2^) do begin Inc(pStr1); Inc(pStr2); Dec(LenStr1); Dec(LenStr2); end; while (LenStr2 <> 0) and ((pStr1 + LenStr1 - 1)^ = (pStr2 + LenStr2 - 1)^) do begin Dec(LenStr1); Dec(LenStr2); end; if LenStr2 = 0 then begin Result := LenStr1; Exit; end; // calculate the edit distance T := LenStr2 + 1; GetMem(D, T * SizeOf(Integer)); for I := 0 to T do D[I] := I; S1 := pStr1; for I := 1 to LenStr1 do begin PrevCost := I - 1; Cost := I; S2 := pStr2; for J := 1 to LenStr2 do begin if (S1^ = S2^) or ((I > 1) and (J > 1) and (S1^ = (S2 - 1)^) and (S2^ = (S1 - 1)^)) then Cost := PrevCost else Cost := 1 + min(Cost, PrevCost, D[J]); PrevCost := D[J]; D[J] := Cost; Inc(S2); end; Inc(S1); end; Result := D[LenStr2]; FreeMem(D); end; 参考文献:
Match
|
2023-10-27
2022-08-15
2022-08-17
2022-09-23
2022-08-13
请发表评论