llama-index

Форк
0
116 строк · 3.4 Кб
1
"""DeepLake reader."""
2

3
from typing import List, Optional, Union
4

5
import numpy as np
6

7
from llama_index.legacy.readers.base import BaseReader
8
from llama_index.legacy.schema import Document
9

10
# Maps a metric name to a callable ``f(query, data)`` that scores ``query``
# against every row of ``data``. The "l2", "l1" and "max" entries are
# distances (smaller is closer); "cos" and "dot" are similarities (larger is
# closer).
distance_metric_map = {
    "l2": lambda a, b: np.linalg.norm(a - b, axis=1, ord=2),
    "l1": lambda a, b: np.linalg.norm(a - b, axis=1, ord=1),
    "max": lambda a, b: np.linalg.norm(a - b, axis=1, ord=np.inf),
    "cos": lambda a, b: np.dot(a, b.T)
    / (np.linalg.norm(a) * np.linalg.norm(b, axis=1)),
    "dot": lambda a, b: np.dot(a, b.T),
}

# Metrics whose score grows with closeness and therefore rank descending.
_SIMILARITY_METRICS = frozenset({"cos", "dot"})


def vector_search(
    query_vector: Union[List, np.ndarray],
    data_vectors: np.ndarray,
    distance_metric: str = "l2",
    limit: Optional[int] = 4,
) -> List:
    """Naive (brute-force) nearest-neighbour search for a single query vector.

    Args:
        query_vector: The query, either a flat list of numbers or an array
            holding one vector (1-D, or 2-D with a single row).
        data_vectors: 2-D array with one candidate vector per row.
        distance_metric: Key of ``distance_metric_map``, matched
            case-insensitively: "l2" (Euclidean), "l1" (Manhattan), "max"
            (l-infinity), "cos" (cosine similarity) or "dot" (dot product).
        limit: Maximum number of indices to return; ``None`` returns all.

    Returns:
        Row indices into ``data_vectors`` ordered from best to worst match,
        as a flat list of ints.

    Raises:
        KeyError: If ``distance_metric`` is not a known metric name.
    """
    metric = distance_metric.lower()
    score_fn = distance_metric_map[metric]

    query = np.asarray(query_vector)
    if query.ndim == 1:
        query = query.reshape(1, -1)

    # Zero-norm vectors make the cosine score 0/0; silence the warning here
    # and let the resulting NaN be ranked last below.
    with np.errstate(divide="ignore", invalid="ignore"):
        scores = np.asarray(score_fn(query, data_vectors), dtype=float)

    # "cos" and "dot" produce a (1, n) row for a (1, d) query; collapse it so
    # that argsort and the limit slice act on candidates, not on that row axis.
    if scores.ndim == 2 and scores.shape[0] == 1:
        scores = scores[0]

    if metric in _SIMILARITY_METRICS:
        # Sort on the negated score instead of reversing an ascending sort:
        # argsort places NaN at the end, so undefined scores stay last.
        ranked = np.argsort(-scores)
    else:
        ranked = np.argsort(scores)

    return ranked[:limit].tolist()
51

52

53
class DeepLakeReader(BaseReader):
    """DeepLake reader.

    Retrieve documents from existing DeepLake datasets.

    Args:
        token: Optional token kept on the reader and forwarded to
            ``deeplake.load`` whenever a dataset is opened.
    """

    def __init__(
        self,
        token: Optional[str] = None,
    ):
        """Check that ``deeplake`` is importable and remember the token.

        Raises:
            ImportError: If the ``deeplake`` package is not installed.
        """
        import_err_msg = (
            "`deeplake` package not found, please run `pip install deeplake`"
        )
        try:
            import deeplake  # noqa
        except ImportError as err:
            # Same exception type and message as before; the original
            # failure is attached as the explicit cause.
            raise ImportError(import_err_msg) from err
        self.token = token

    def load_data(
        self,
        query_vector: List[float],
        dataset_path: str,
        limit: int = 4,
        distance_metric: str = "l2",
    ) -> List[Document]:
        """Load the documents closest to a query vector from DeepLake.

        Args:
            query_vector (List[float]): Query vector.
            dataset_path (str): Path passed to ``deeplake.load``.
            limit (int): Number of results to return.
            distance_metric (str): Metric name handed to ``vector_search``.

        Returns:
            List[Document]: A list of documents, in the order returned by
            ``vector_search``.

        Raises:
            TensorDoesNotExistError: If reading the dataset's ``embedding``
                tensor fails for any reason.
        """
        import deeplake
        from deeplake.util.exceptions import TensorDoesNotExistError

        dataset = deeplake.load(dataset_path, token=self.token)

        try:
            embeddings = dataset.embedding.numpy(fetch_chunks=True)
        except Exception as err:
            # Callers see the same exception type as before; chaining keeps
            # the underlying failure visible in the traceback.
            raise TensorDoesNotExistError("embedding") from err

        indices = vector_search(
            query_vector, embeddings, distance_metric=distance_metric, limit=limit
        )

        documents = []
        for idx in indices:
            # Index the dataset once per hit and read both tensors from it.
            sample = dataset[idx]
            documents.append(
                Document(
                    text=str(sample.text.numpy().tolist()[0]),
                    id_=sample.ids.numpy().tolist()[0],
                )
            )

        return documents
117

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.