TheAlgorithms-Python
95 строк · 3.2 Кб
1"""
2The Jaccard similarity coefficient is a commonly used indicator of the
3similarity between two sets. Let U be a set and A and B be subsets of U,
4then the Jaccard index/similarity is defined to be the ratio of the number
5of elements of their intersection and the number of elements of their union.
6
7Inspired from Wikipedia and
8the book Mining of Massive Datasets [MMDS 2nd Edition, Chapter 3]
9
10https://en.wikipedia.org/wiki/Jaccard_index
11https://mmds.org
12
13Jaccard similarity is widely used with MinHashing.
14"""
15
16
17def jaccard_similarity(18set_a: set[str] | list[str] | tuple[str],19set_b: set[str] | list[str] | tuple[str],20alternative_union=False,21):22"""23Finds the jaccard similarity between two sets.
24Essentially, its intersection over union.
25
26The alternative way to calculate this is to take union as sum of the
27number of items in the two sets. This will lead to jaccard similarity
28of a set with itself be 1/2 instead of 1. [MMDS 2nd Edition, Page 77]
29
30Parameters:
31:set_a (set,list,tuple): A non-empty set/list
32:set_b (set,list,tuple): A non-empty set/list
33:alternativeUnion (boolean): If True, use sum of number of
34items as union
35
36Output:
37(float) The jaccard similarity between the two sets.
38
39Examples:
40>>> set_a = {'a', 'b', 'c', 'd', 'e'}
41>>> set_b = {'c', 'd', 'e', 'f', 'h', 'i'}
42>>> jaccard_similarity(set_a, set_b)
430.375
44>>> jaccard_similarity(set_a, set_a)
451.0
46>>> jaccard_similarity(set_a, set_a, True)
470.5
48>>> set_a = ['a', 'b', 'c', 'd', 'e']
49>>> set_b = ('c', 'd', 'e', 'f', 'h', 'i')
50>>> jaccard_similarity(set_a, set_b)
510.375
52>>> set_a = ('c', 'd', 'e', 'f', 'h', 'i')
53>>> set_b = ['a', 'b', 'c', 'd', 'e']
54>>> jaccard_similarity(set_a, set_b)
550.375
56>>> set_a = ('c', 'd', 'e', 'f', 'h', 'i')
57>>> set_b = ['a', 'b', 'c', 'd']
58>>> jaccard_similarity(set_a, set_b, True)
590.2
60>>> set_a = {'a', 'b'}
61>>> set_b = ['c', 'd']
62>>> jaccard_similarity(set_a, set_b)
63Traceback (most recent call last):
64...
65ValueError: Set a and b must either both be sets or be either a list or a tuple.
66"""
67
68if isinstance(set_a, set) and isinstance(set_b, set):69intersection_length = len(set_a.intersection(set_b))70
71if alternative_union:72union_length = len(set_a) + len(set_b)73else:74union_length = len(set_a.union(set_b))75
76return intersection_length / union_length77
78elif isinstance(set_a, (list, tuple)) and isinstance(set_b, (list, tuple)):79intersection = [element for element in set_a if element in set_b]80
81if alternative_union:82return len(intersection) / (len(set_a) + len(set_b))83else:84# Cast set_a to list because tuples cannot be mutated85union = list(set_a) + [element for element in set_b if element not in set_a]86return len(intersection) / len(union)87raise ValueError(88"Set a and b must either both be sets or be either a list or a tuple."89)90
91
92if __name__ == "__main__":93set_a = {"a", "b", "c", "d", "e"}94set_b = {"c", "d", "e", "f", "h", "i"}95print(jaccard_similarity(set_a, set_b))96