TheAlgorithms-Python
78 строк · 2.1 Кб
1"""https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance"""
2
3
4def jaro_winkler(str1: str, str2: str) -> float:5"""6Jaro–Winkler distance is a string metric measuring an edit distance between two
7sequences.
8Output value is between 0.0 and 1.0.
9
10>>> jaro_winkler("martha", "marhta")
110.9611111111111111
12>>> jaro_winkler("CRATE", "TRACE")
130.7333333333333334
14>>> jaro_winkler("test", "dbdbdbdb")
150.0
16>>> jaro_winkler("test", "test")
171.0
18>>> jaro_winkler("hello world", "HeLLo W0rlD")
190.6363636363636364
20>>> jaro_winkler("test", "")
210.0
22>>> jaro_winkler("hello", "world")
230.4666666666666666
24>>> jaro_winkler("hell**o", "*world")
250.4365079365079365
26"""
27
28def get_matched_characters(_str1: str, _str2: str) -> str:29matched = []30limit = min(len(_str1), len(_str2)) // 231for i, l in enumerate(_str1):32left = int(max(0, i - limit))33right = int(min(i + limit + 1, len(_str2)))34if l in _str2[left:right]:35matched.append(l)36_str2 = f"{_str2[0:_str2.index(l)]} {_str2[_str2.index(l) + 1:]}"37
38return "".join(matched)39
40# matching characters41matching_1 = get_matched_characters(str1, str2)42matching_2 = get_matched_characters(str2, str1)43match_count = len(matching_1)44
45# transposition46transpositions = (47len([(c1, c2) for c1, c2 in zip(matching_1, matching_2) if c1 != c2]) // 248)49
50if not match_count:51jaro = 0.052else:53jaro = (54155/ 356* (57match_count / len(str1)58+ match_count / len(str2)59+ (match_count - transpositions) / match_count60)61)62
63# common prefix up to 4 characters64prefix_len = 065for c1, c2 in zip(str1[:4], str2[:4]):66if c1 == c2:67prefix_len += 168else:69break70
71return jaro + 0.1 * prefix_len * (1 - jaro)72
73
74if __name__ == "__main__":75import doctest76
77doctest.testmod()78print(jaro_winkler("hello", "world"))79