llama-index
202 строки · 6.5 Кб
1import logging2from abc import ABC, abstractmethod3from typing import Any, Dict, List, Optional, Type4
5import numpy as np6
7from llama_index.legacy.bridge.pydantic import Field8from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode9from llama_index.legacy.vector_stores.types import (10VectorStore,11VectorStoreQuery,12VectorStoreQueryResult,13)
14from llama_index.legacy.vector_stores.utils import (15legacy_metadata_dict_to_node,16metadata_dict_to_node,17node_to_metadata_dict,18)
19
# Module-level logger named after this module, so its output can be filtered
# or configured independently through the standard logging hierarchy.
logger = logging.getLogger(__name__)
22
class DocArrayVectorStore(VectorStore, ABC):
    """DocArray Vector Store Base Class.

    This is an abstract base class for creating a DocArray vector store.
    The subclasses should implement _init_index and _find_docs_to_be_removed methods.
    """

    # for mypy. will get initialized by the subclass.
    _index: Any
    _schema: Any
    _ref_docs: Dict[str, List[str]]

    stores_text: bool = True
    flat_metadata: bool = False

    def _update_ref_docs(self, docs) -> None:  # type: ignore[no-untyped-def]
        """Hook called by ``add`` with the documents that were just indexed.

        The base implementation does nothing. It is only invoked when
        ``self._ref_docs`` is set to something other than ``None``.

        Args:
            docs: The documents that ``add`` passed to the index.
        """

    @abstractmethod
    def _init_index(self, **kwargs: Any):  # type: ignore[no-untyped-def]
        """Initializes the index.

        This method should be overridden by the subclasses.
        """

    @abstractmethod
    def _find_docs_to_be_removed(self, doc_id: str) -> List[str]:
        """Finds the documents to be removed from the vector store.

        Args:
            doc_id (str): Document ID that should be removed.

        Returns:
            List[str]: List of document IDs to be removed.

        This is an abstract method and needs to be implemented in any concrete subclass.
        """

    @property
    def client(self) -> Any:
        """Get client.

        Returns:
            Any: Always ``None`` in this base class.
        """
        return None

    def num_docs(self) -> int:
        """Retrieves the number of documents in the index.

        Returns:
            int: The number of documents in the index.
        """
        return self._index.num_docs()

    @staticmethod
    def _get_schema(**embeddings_params: Any) -> Type:
        """Fetches the schema for DocArray indices.

        Args:
            **embeddings_params: Keyword arguments forwarded to ``Field`` for
                the ``embedding`` field of the schema.

        Returns:
            DocArraySchema: Schema for a DocArray index.
        """
        # Imported lazily so the module can be imported without docarray.
        from docarray import BaseDoc
        from docarray.typing import ID, NdArray

        class DocArraySchema(BaseDoc):
            id: Optional[ID] = None
            text: Optional[str] = None
            metadata: Optional[dict] = None
            embedding: NdArray = Field(**embeddings_params)

        return DocArraySchema

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Adds nodes to the vector store.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings.
            **add_kwargs (Any): Accepted for interface compatibility; unused here.

        Returns:
            List[str]: List of document IDs added to the vector store.
        """
        from docarray import DocList

        # check to see if empty document list was passed
        if not nodes:
            return []

        docs = DocList[self._schema](  # type: ignore[name-defined]
            self._schema(
                id=node.node_id,
                metadata=node_to_metadata_dict(node, flat_metadata=self.flat_metadata),
                text=node.get_content(metadata_mode=MetadataMode.NONE),
                embedding=node.get_embedding(),
            )
            for node in nodes
        )
        self._index.index(docs)
        logger.info(f"Successfully added {len(docs)} documents to the index")
        # ``_ref_docs`` is only declared at class level; a subclass that never
        # assigns it must not fail here, after the documents were already indexed.
        if getattr(self, "_ref_docs", None) is not None:
            self._update_ref_docs(docs)
        return [doc.id for doc in docs]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Deletes a document from the vector store.

        Logs a warning and returns without touching the index when
        ``_find_docs_to_be_removed`` yields nothing for ``ref_doc_id``.

        Args:
            ref_doc_id (str): Document ID to be deleted.
            **delete_kwargs (Any): Accepted for interface compatibility; unused here.
        """
        docs_to_be_removed = self._find_docs_to_be_removed(ref_doc_id)
        if not docs_to_be_removed:
            logger.warning(f"Document with doc_id {ref_doc_id} not found")
            return

        del self._index[docs_to_be_removed]
        logger.info(f"Deleted {len(docs_to_be_removed)} documents from the index")

    @staticmethod
    def _doc_to_node(doc: Any) -> BaseNode:
        """Converts one stored document back into a node."""
        try:
            node = metadata_dict_to_node(doc.metadata)
            node.text = doc.text
        except Exception:
            # Deliberate best-effort fallback: any failure above is treated as
            # "metadata is in the legacy format".
            # TODO: legacy metadata support
            metadata, node_info, relationships = legacy_metadata_dict_to_node(
                doc.metadata
            )
            node = TextNode(
                id_=doc.id,
                text=doc.text,
                metadata=metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships=relationships,
            )
        return node

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Queries the vector store and retrieves the results.

        When ``query.filters`` is set, each legacy filter becomes an equality
        condition on ``metadata__<key>`` that is combined with the vector
        similarity search; otherwise a plain similarity search is run.

        Args:
            query (VectorStoreQuery): Query for the vector store.
            **kwargs (Any): Accepted for interface compatibility; unused here.

        Returns:
            VectorStoreQueryResult: Result of the query from vector store.

        Raises:
            ValueError: If ``query.query_embedding`` is ``None``.
        """
        if query.query_embedding is None:
            # np.array(None) would create a 0-d object array and only fail
            # later, deep inside the index, with an unrelated-looking error.
            raise ValueError(
                "query.query_embedding must be set to query a DocArray vector store"
            )

        # The same query document is needed by both search paths.
        query_doc = self._schema(embedding=np.array(query.query_embedding))

        if query.filters:
            # only for ExactMatchFilters
            filter_query = {
                "metadata__" + metadata_filter.key: {"$eq": metadata_filter.value}
                for metadata_filter in query.filters.legacy_filters()
            }
            combined_query = (
                self._index.build_query()  # get empty query object
                .find(
                    query=query_doc,
                    search_field="embedding",
                    limit=query.similarity_top_k,
                )  # add vector similarity search
                .filter(filter_query=filter_query)  # add filter search
                .build()  # build the query
            )

            # execute the combined query and return the results
            docs, scores = self._index.execute_query(combined_query)
        else:
            docs, scores = self._index.find(
                query=query_doc,
                search_field="embedding",
                limit=query.similarity_top_k,
            )

        nodes, ids = [], []
        for doc in docs:
            nodes.append(self._doc_to_node(doc))
            ids.append(doc.id)
        logger.info(f"Found {len(nodes)} results for the query")

        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)