"""DeepLake vector store index.

An index that is built within DeepLake.

"""

import logging
from typing import Any, List, Optional, cast

from llama_index.legacy.bridge.pydantic import PrivateAttr
from llama_index.legacy.schema import BaseNode, MetadataMode
from llama_index.legacy.vector_stores.types import (
    BasePydanticVectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index.legacy.vector_stores.utils import (
    metadata_dict_to_node,
    node_to_metadata_dict,
)

try:
    from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore

    DEEPLAKE_INSTALLED = True
except ImportError:
    DEEPLAKE_INSTALLED = False

logger = logging.getLogger(__name__)


class DeepLakeVectorStore(BasePydanticVectorStore):
    """The DeepLake Vector Store.

    In this vector store we store the text, its embedding, and
    a few pieces of its metadata in a deeplake dataset. This implementation
    allows the use of an already existing deeplake dataset if it was created by
    this vector store. It also supports creating a new one if the dataset doesn't
    exist or if `overwrite` is set to True.
    """

    stores_text: bool = True
    flat_metadata: bool = True

    ingestion_batch_size: int
    num_workers: int
    token: Optional[str]
    read_only: Optional[bool]
    dataset_path: str

    _embedding_dimension: int = PrivateAttr()
    _ttl_seconds: Optional[int] = PrivateAttr()
    _deeplake_db: Any = PrivateAttr()
    _deeplake_db_collection: Any = PrivateAttr()
    _vectorstore: "VectorStore" = PrivateAttr()
    _id_tensor_name: str = PrivateAttr()

    def __init__(
        self,
        dataset_path: str = "llama_index",
        token: Optional[str] = None,
        read_only: Optional[bool] = False,
        ingestion_batch_size: int = 1024,
        ingestion_num_workers: int = 4,
        overwrite: bool = False,
        exec_option: Optional[str] = None,
        verbose: bool = True,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            dataset_path (str): Path to the deeplake dataset, where data will
                be stored. Defaults to "llama_index".
            overwrite (bool, optional): Whether to overwrite an existing
                dataset with the same name. Defaults to False.
            token (str, optional): The deeplake token that grants you proper
                access to the dataset. Defaults to None.
            read_only (bool, optional): Whether to open the dataset in
                read-only mode. Defaults to False.
            ingestion_batch_size (int): Batch size used when ingesting data
                into the deeplake dataset. Defaults to 1024.
            ingestion_num_workers (int): Number of workers to use during data
                ingestion. Defaults to 4.
            exec_option (str): Default method for search execution. It could be
                either ``"python"``, ``"compute_engine"`` or ``"tensor_db"``.
                Defaults to ``"python"``.
                - ``python`` - Pure-python implementation that runs on the
                    client and can be used for data stored anywhere. WARNING:
                    using this option with big datasets is discouraged because
                    it can lead to memory issues.
                - ``compute_engine`` - Performant C++ implementation of the
                    Deep Lake Compute Engine that runs on the client and can be
                    used for any data stored in or connected to Deep Lake. It
                    cannot be used with in-memory or local datasets.
                - ``tensor_db`` - Performant and fully-hosted Managed Tensor
                    Database that is responsible for storage and query
                    execution. Only available for data stored in the Deep Lake
                    Managed Database. Store datasets in this database by
                    specifying runtime = {"tensor_db": True} during dataset
                    creation.
            verbose (bool): Specify if verbose output is enabled. Default is
                True.
            **kwargs (Any): Additional keyword arguments.

        Raises:
            ImportError: Unable to import `deeplake`.
        """
        super().__init__(
            dataset_path=dataset_path,
            token=token,
            read_only=read_only,
            ingestion_batch_size=ingestion_batch_size,
            num_workers=ingestion_num_workers,
        )

        if not DEEPLAKE_INSTALLED:
            raise ImportError(
                "Could not import deeplake python package. "
                "Please install it with `pip install deeplake`."
            )

        self._vectorstore = VectorStore(
            path=dataset_path,
            ingestion_batch_size=ingestion_batch_size,
            num_workers=ingestion_num_workers,
            token=token,
            read_only=read_only,
            exec_option=exec_option,
            overwrite=overwrite,
            verbose=verbose,
            **kwargs,
        )
        # Datasets may expose the id tensor as either "ids" or "id";
        # use whichever one actually exists in this dataset.
        self._id_tensor_name = "ids" if "ids" in self._vectorstore.tensors() else "id"
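
    # Construction sketch (the local path below is hypothetical; a Deep Lake
    # cloud path such as "hub://org/dataset" together with `token` should work
    # the same way):
    #
    #     store = DeepLakeVectorStore(
    #         dataset_path="./my_deeplake_dataset",
    #         overwrite=True,
    #     )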

    @property
    def client(self) -> Any:
        """Get client.

        Returns:
            Any: DeepLake vectorstore dataset.
        """
        return self._vectorstore.dataset

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add the embeddings and their nodes into DeepLake.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings
                to insert.

        Returns:
            List[str]: List of ids inserted.
        """
        embedding = []
        metadata = []
        id_ = []
        text = []

        for node in nodes:
            embedding.append(node.get_embedding())
            metadata.append(
                node_to_metadata_dict(
                    node, remove_text=False, flat_metadata=self.flat_metadata
                )
            )
            id_.append(node.node_id)
            text.append(node.get_content(metadata_mode=MetadataMode.NONE))

        kwargs = {
            "embedding": embedding,
            "metadata": metadata,
            self._id_tensor_name: id_,
            "text": text,
        }

        return self._vectorstore.add(
            return_ids=True,
            **kwargs,
        )
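
    # Insertion sketch: `add` expects nodes whose embeddings are already set.
    # The values below are hypothetical; normally an embedding model fills
    # them in before ingestion:
    #
    #     from llama_index.legacy.schema import TextNode
    #     node = TextNode(text="hello world", embedding=[0.1, 0.2, 0.3])
    #     ids = store.add([node])  # returns the ids assigned in the dataset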

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        self._vectorstore.delete(filter={"metadata": {"doc_id": ref_doc_id}})
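
    # Deletion sketch (hypothetical ref_doc_id): removes every node whose
    # metadata records "doc-1" as its source document:
    #
    #     store.delete(ref_doc_id="doc-1")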

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): VectorStoreQuery class input, it has
                the following attributes:
                1. query_embedding (List[float]): query embedding
                2. similarity_top_k (int): top k most similar nodes
            deep_memory (bool): Whether to use deep memory for query execution.
                Read from ``kwargs``.

        Returns:
            VectorStoreQueryResult
        """
        query_embedding = cast(List[float], query.query_embedding)
        exec_option = kwargs.get("exec_option")
        deep_memory = kwargs.get("deep_memory")
        data = self._vectorstore.search(
            embedding=query_embedding,
            exec_option=exec_option,
            k=query.similarity_top_k,
            filter=query.filters,
            deep_memory=deep_memory,
        )

        similarities = data["score"]
        ids = data[self._id_tensor_name]
        metadatas = data["metadata"]
        nodes = []
        for metadata in metadatas:
            nodes.append(metadata_dict_to_node(metadata))

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
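

# End-to-end usage sketch. The dataset path and embedding values are
# hypothetical, and `TextNode` would need to be imported from
# llama_index.legacy.schema; in practice embeddings come from an embedding
# model rather than being hard-coded:
#
#     store = DeepLakeVectorStore(dataset_path="./llama_index", overwrite=True)
#     store.add([TextNode(text="hello", embedding=[0.1, 0.2, 0.3])])
#     result = store.query(
#         VectorStoreQuery(query_embedding=[0.1, 0.2, 0.3], similarity_top_k=1)
#     )
#     print(result.ids, result.similarities)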