# llama-index — legacy DeepLake vector store (221 lines · 7.8 KB)
1"""DeepLake vector store index.
2
3An index that is built within DeepLake.
4
5"""
6
7import logging8from typing import Any, List, Optional, cast9
10from llama_index.legacy.bridge.pydantic import PrivateAttr11from llama_index.legacy.schema import BaseNode, MetadataMode12from llama_index.legacy.vector_stores.types import (13BasePydanticVectorStore,14VectorStoreQuery,15VectorStoreQueryResult,16)
17from llama_index.legacy.vector_stores.utils import (18metadata_dict_to_node,19node_to_metadata_dict,20)
21
# Optional dependency: `deeplake` supplies the VectorStore backend used by
# this module.  The import is guarded so the module itself can be imported
# without deeplake installed; DeepLakeVectorStore.__init__ checks the flag
# and raises ImportError with install instructions when it is False.
try:
    from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore

    DEEPLAKE_INSTALLED = True
except ImportError:
    DEEPLAKE_INSTALLED = False

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
31
class DeepLakeVectorStore(BasePydanticVectorStore):
    """Vector store backed by a Deep Lake dataset.

    Node text, embeddings, and a subset of node metadata are persisted as
    tensors of a deeplake dataset.  A dataset previously created by this
    vector store can be reopened in place; a fresh one is created when the
    path does not exist yet or when ``overwrite`` is True.
    """

    stores_text: bool = True
    flat_metadata: bool = True

    ingestion_batch_size: int
    num_workers: int
    token: Optional[str]
    read_only: Optional[bool]
    dataset_path: str

    _embedding_dimension: int = PrivateAttr()
    _ttl_seconds: Optional[int] = PrivateAttr()
    _deeplake_db: Any = PrivateAttr()
    _deeplake_db_collection: Any = PrivateAttr()
    _vectorstore: "VectorStore" = PrivateAttr()
    _id_tensor_name: str = PrivateAttr()

    def __init__(
        self,
        dataset_path: str = "llama_index",
        token: Optional[str] = None,
        read_only: Optional[bool] = False,
        ingestion_batch_size: int = 1024,
        ingestion_num_workers: int = 4,
        overwrite: bool = False,
        exec_option: Optional[str] = None,
        verbose: bool = True,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            dataset_path (str): Path where the deeplake dataset lives.
                Defaults to "llama_index".
            token (str, optional): Deep Lake token granting access to the
                dataset. Defaults to None.
            read_only (bool, optional): Whether to open the dataset in
                read-only mode. Defaults to False.
            ingestion_batch_size (int): Batch size used while ingesting
                data into the dataset. Defaults to 1024.
            ingestion_num_workers (int): Number of workers used during
                ingestion. Defaults to 4.
            overwrite (bool): Whether to overwrite an existing dataset of
                the same name. Defaults to False.
            exec_option (str): Default method for search execution, one of
                ``"python"``, ``"compute_engine"`` or ``"tensor_db"``.
                Defaults to ``"python"``.
                - ``python`` - pure-python implementation that runs on the
                  client and works with data stored anywhere. WARNING:
                  discouraged for big datasets (memory pressure).
                - ``compute_engine`` - performant C++ Deep Lake Compute
                  Engine running on the client; works with any data stored
                  in or connected to Deep Lake, but not with in-memory or
                  local datasets.
                - ``tensor_db`` - fully-hosted Managed Tensor Database that
                  handles both storage and query execution; only available
                  for datasets created with runtime = {"tensor_db": True}.
            verbose (bool): Whether verbose output is enabled. Default is
                True.
            **kwargs (Any): Forwarded to the underlying ``VectorStore``.

        Raises:
            ImportError: Unable to import `deeplake`.
        """
        # Populate the pydantic fields before touching anything else.
        super().__init__(
            dataset_path=dataset_path,
            token=token,
            read_only=read_only,
            ingestion_batch_size=ingestion_batch_size,
            num_workers=ingestion_num_workers,
        )

        if not DEEPLAKE_INSTALLED:
            raise ImportError(
                "Could not import deeplake python package. "
                "Please install it with `pip install deeplake`."
            )

        self._vectorstore = VectorStore(
            path=dataset_path,
            ingestion_batch_size=ingestion_batch_size,
            num_workers=ingestion_num_workers,
            token=token,
            read_only=read_only,
            exec_option=exec_option,
            overwrite=overwrite,
            verbose=verbose,
            **kwargs,
        )
        # Dataset schema versions disagree on the id tensor's name:
        # prefer "ids" when present, otherwise fall back to "id".
        tensor_names = self._vectorstore.tensors()
        self._id_tensor_name = "ids" if "ids" in tensor_names else "id"

    @property
    def client(self) -> Any:
        """Get client.

        Returns:
            Any: DeepLake vectorstore dataset.
        """
        return self._vectorstore.dataset

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add the embeddings and their nodes into DeepLake.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings to
                insert.

        Returns:
            List[str]: List of ids inserted.
        """
        embeddings = [node.get_embedding() for node in nodes]
        metadatas = [
            node_to_metadata_dict(
                node, remove_text=False, flat_metadata=self.flat_metadata
            )
            for node in nodes
        ]
        node_ids = [node.node_id for node in nodes]
        texts = [node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes]

        return self._vectorstore.add(
            return_ids=True,
            embedding=embeddings,
            metadata=metadatas,
            text=texts,
            # The id tensor name varies between dataset versions; see __init__.
            **{self._id_tensor_name: node_ids},
        )

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete every node whose metadata references the given document.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        self._vectorstore.delete(filter={"metadata": {"doc_id": ref_doc_id}})

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): Carries ``query_embedding``
                (List[float]) and ``similarity_top_k`` (int).
            **kwargs: Optionally supplies ``exec_option`` (str) to override
                the search backend and ``deep_memory`` (bool) to enable
                deep-memory query execution.

        Returns:
            VectorStoreQueryResult
        """
        results = self._vectorstore.search(
            embedding=cast(List[float], query.query_embedding),
            exec_option=kwargs.get("exec_option"),
            k=query.similarity_top_k,
            filter=query.filters,
            deep_memory=kwargs.get("deep_memory"),
        )

        return VectorStoreQueryResult(
            nodes=[metadata_dict_to_node(md) for md in results["metadata"]],
            similarities=results["score"],
            ids=results[self._id_tensor_name],
        )