TheAlgorithms-Python

Форк
0
/
jaccard_similarity.py 
95 строк · 3.2 Кб
1
"""
2
The Jaccard similarity coefficient is a commonly used indicator of the
3
similarity between two sets. Let U be a set and A and B be subsets of U,
4
then the Jaccard index/similarity is defined to be the ratio of the number
5
of elements of their intersection and the number of elements of their union.
6

7
Inspired from Wikipedia and
8
the book Mining of Massive Datasets [MMDS 2nd Edition, Chapter 3]
9

10
https://en.wikipedia.org/wiki/Jaccard_index
11
https://mmds.org
12

13
Jaccard similarity is widely used with MinHashing.
14
"""
15

16

17
def jaccard_similarity(
18
    set_a: set[str] | list[str] | tuple[str],
19
    set_b: set[str] | list[str] | tuple[str],
20
    alternative_union=False,
21
):
22
    """
23
    Finds the jaccard similarity between two sets.
24
    Essentially, its intersection over union.
25

26
    The alternative way to calculate this is to take union as sum of the
27
    number of items in the two sets. This will lead to jaccard similarity
28
    of a set with itself be 1/2 instead of 1. [MMDS 2nd Edition, Page 77]
29

30
    Parameters:
31
        :set_a (set,list,tuple): A non-empty set/list
32
        :set_b (set,list,tuple): A non-empty set/list
33
        :alternativeUnion (boolean): If True, use sum of number of
34
        items as union
35

36
    Output:
37
        (float) The jaccard similarity between the two sets.
38

39
    Examples:
40
    >>> set_a = {'a', 'b', 'c', 'd', 'e'}
41
    >>> set_b = {'c', 'd', 'e', 'f', 'h', 'i'}
42
    >>> jaccard_similarity(set_a, set_b)
43
    0.375
44
    >>> jaccard_similarity(set_a, set_a)
45
    1.0
46
    >>> jaccard_similarity(set_a, set_a, True)
47
    0.5
48
    >>> set_a = ['a', 'b', 'c', 'd', 'e']
49
    >>> set_b = ('c', 'd', 'e', 'f', 'h', 'i')
50
    >>> jaccard_similarity(set_a, set_b)
51
    0.375
52
    >>> set_a = ('c', 'd', 'e', 'f', 'h', 'i')
53
    >>> set_b = ['a', 'b', 'c', 'd', 'e']
54
    >>> jaccard_similarity(set_a, set_b)
55
    0.375
56
    >>> set_a = ('c', 'd', 'e', 'f', 'h', 'i')
57
    >>> set_b = ['a', 'b', 'c', 'd']
58
    >>> jaccard_similarity(set_a, set_b, True)
59
    0.2
60
    >>> set_a = {'a', 'b'}
61
    >>> set_b = ['c', 'd']
62
    >>> jaccard_similarity(set_a, set_b)
63
    Traceback (most recent call last):
64
        ...
65
    ValueError: Set a and b must either both be sets or be either a list or a tuple.
66
    """
67

68
    if isinstance(set_a, set) and isinstance(set_b, set):
69
        intersection_length = len(set_a.intersection(set_b))
70

71
        if alternative_union:
72
            union_length = len(set_a) + len(set_b)
73
        else:
74
            union_length = len(set_a.union(set_b))
75

76
        return intersection_length / union_length
77

78
    elif isinstance(set_a, (list, tuple)) and isinstance(set_b, (list, tuple)):
79
        intersection = [element for element in set_a if element in set_b]
80

81
        if alternative_union:
82
            return len(intersection) / (len(set_a) + len(set_b))
83
        else:
84
            # Cast set_a to list because tuples cannot be mutated
85
            union = list(set_a) + [element for element in set_b if element not in set_a]
86
            return len(intersection) / len(union)
87
    raise ValueError(
88
        "Set a and b must either both be sets or be either a list or a tuple."
89
    )
90

91

92
if __name__ == "__main__":
93
    set_a = {"a", "b", "c", "d", "e"}
94
    set_b = {"c", "d", "e", "f", "h", "i"}
95
    print(jaccard_similarity(set_a, set_b))
96

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.