# llama-index
# 340 lines · 11.4 KB
import json
import logging
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode
from llama_index.legacy.vector_stores.types import (
    MetadataFilters,
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index.legacy.vector_stores.utils import (
    metadata_dict_to_node,
    node_to_metadata_dict,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    # Needed for type annotations only; not imported at runtime.
    from zep_python.document import Document as ZepDocument


class ZepVectorStore(VectorStore):
    """Zep Vector Store for storing and retrieving embeddings.

    Zep supports both normalized and non-normalized embeddings. Cosine similarity is
    used to compute distance and the returned score is normalized to be between 0 and 1.

    Args:
        collection_name (str): Name of the Zep collection in which to store embeddings.
        api_url (str): URL of the Zep API.
        api_key (str, optional): Key for the Zep API. Defaults to None.
        collection_description (str, optional): Description of the collection.
            Defaults to None.
        collection_metadata (dict, optional): Metadata of the collection.
            Defaults to None.
        embedding_dimensions (int, optional): Dimensions of the embeddings.
            Defaults to None.
        is_auto_embedded (bool, optional): Whether the embeddings are auto-embedded.
            Defaults to False.
    """

    stores_text = True
    flat_metadata = False

    def __init__(
        self,
        collection_name: str,
        api_url: str,
        api_key: Optional[str] = None,
        collection_description: Optional[str] = None,
        collection_metadata: Optional[Dict[str, Any]] = None,
        embedding_dimensions: Optional[int] = None,
        is_auto_embedded: bool = False,
        **kwargs: Any,
    ) -> None:
        """Connect to Zep and fetch the collection, creating it if it is missing.

        Raises:
            ImportError: If the `zep-python` package is not installed.
            ValueError: If the collection does not exist and
                `embedding_dimensions` was not given.
        """
        import_err_msg = (
            "`zep-python` package not found, please run `pip install zep-python`"
        )
        try:
            import zep_python
        except ImportError as err:
            raise ImportError(import_err_msg) from err

        from zep_python import ZepClient
        from zep_python.document import DocumentCollection

        self._client = ZepClient(base_url=api_url, api_key=api_key)
        self._collection: Union[DocumentCollection, None] = None

        try:
            self._collection = self._client.document.get_collection(
                name=collection_name
            )
        except zep_python.NotFoundError as err:
            # The collection is only created when the lookup reports "not found".
            if embedding_dimensions is None:
                raise ValueError(
                    "embedding_dimensions must be specified if collection does not"
                    " exist"
                ) from err
            logger.info(
                "Collection %s does not exist, "
                "will try creating one with dimensions=%s",
                collection_name,
                embedding_dimensions,
            )

            self._collection = self._client.document.add_collection(
                name=collection_name,
                embedding_dimensions=embedding_dimensions,
                is_auto_embedded=is_auto_embedded,
                description=collection_description,
                metadata=collection_metadata,
            )

    @property
    def client(self) -> Any:
        """Get client."""
        return self._client

    def _require_collection(self) -> Any:
        """Return the active collection or raise ValueError if it is not set."""
        from zep_python.document import DocumentCollection

        if not isinstance(self._collection, DocumentCollection):
            raise ValueError("Collection not initialized")
        return self._collection

    def _prepare_documents(
        self, nodes: List[BaseNode]
    ) -> Tuple[List["ZepDocument"], List[str]]:
        """Convert nodes into Zep documents.

        Args:
            nodes (List[BaseNode]): Nodes to convert; each must have content.

        Returns:
            Tuple[List[ZepDocument], List[str]]: The documents and, in the same
            order, the node IDs used as their document IDs.

        Raises:
            ValueError: If a node has no content.
        """
        from zep_python.document import Document as ZepDocument

        docs: List["ZepDocument"] = []
        ids: List[str] = []

        for node in nodes:
            # Text is stored in the document content, so it is dropped from the
            # metadata dict.
            metadata_dict: Dict[str, Any] = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )

            if not node.get_content():
                raise ValueError("No content to add to Zep")

            docs.append(
                ZepDocument(
                    document_id=node.node_id,
                    content=node.get_content(metadata_mode=MetadataMode.NONE),
                    embedding=node.get_embedding(),
                    metadata=metadata_dict,
                )
            )
            ids.append(node.node_id)

        return docs, ids

    def _prepare_add(
        self, nodes: List[BaseNode]
    ) -> Tuple[Any, List["ZepDocument"], List[str]]:
        """Validate the collection for inserts and build the documents to add.

        Shared by `add` and `async_add`.

        Raises:
            ValueError: If the collection is not initialized, is auto-embedded,
                or a node has no content.
        """
        collection = self._require_collection()

        if collection.is_auto_embedded:
            raise ValueError("Collection is auto embedded, cannot add embeddings")

        docs, ids = self._prepare_documents(nodes)
        return collection, docs, ids

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to the collection.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings.

        Returns:
            List[str]: List of IDs of the added documents.

        Raises:
            ValueError: If the collection is not initialized, is auto-embedded,
                or a node has no content.
        """
        collection, docs, ids = self._prepare_add(nodes)
        collection.add_documents(docs)
        return ids

    async def async_add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Asynchronously add nodes to the collection.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings.

        Returns:
            List[str]: List of IDs of the added documents.

        Raises:
            ValueError: If the collection is not initialized, is auto-embedded,
                or a node has no content.
        """
        collection, docs, ids = self._prepare_add(nodes)
        await collection.aadd_documents(docs)
        return ids

    def _prepare_delete(
        self, ref_doc_id: Optional[str], delete_kwargs: Dict[str, Any]
    ) -> Tuple[Any, Any]:
        """Validate a delete request and return the collection and target UUID.

        Shared by `delete` and `adelete`.

        Raises:
            ValueError: If the collection is not initialized or no "uuid" key
                was passed.
            NotImplementedError: If a non-empty `ref_doc_id` was passed.
        """
        collection = self._require_collection()

        if ref_doc_id:
            raise NotImplementedError(
                "Delete by ref_doc_id not yet implemented for Zep."
            )

        if "uuid" not in delete_kwargs:
            raise ValueError("uuid must be specified")

        return collection, delete_kwargs["uuid"]

    def delete(
        self, ref_doc_id: Optional[str] = None, **delete_kwargs: Any
    ) -> None:  # type: ignore
        """Delete a document from the collection.

        Args:
            ref_doc_id (Optional[str]): ID of the document to delete.
                Not currently supported.
            delete_kwargs: Must contain "uuid" key with UUID of the document to delete.

        Raises:
            ValueError: If the collection is not initialized or "uuid" is missing.
            NotImplementedError: If a non-empty `ref_doc_id` is passed.
        """
        collection, uuid = self._prepare_delete(ref_doc_id, delete_kwargs)
        collection.delete_document(uuid=uuid)

    async def adelete(
        self, ref_doc_id: Optional[str] = None, **delete_kwargs: Any
    ) -> None:  # type: ignore
        """Asynchronously delete a document from the collection.

        Args:
            ref_doc_id (Optional[str]): ID of the document to delete.
                Not currently supported.
            delete_kwargs: Must contain "uuid" key with UUID of the document to delete.

        Raises:
            ValueError: If the collection is not initialized or "uuid" is missing.
            NotImplementedError: If a non-empty `ref_doc_id` is passed.
        """
        collection, uuid = self._prepare_delete(ref_doc_id, delete_kwargs)
        await collection.adelete_document(uuid=uuid)

    def _parse_query_result(
        self, results: List["ZepDocument"]
    ) -> VectorStoreQueryResult:
        """Convert Zep search results into a VectorStoreQueryResult.

        A missing score becomes 0.0 and a missing document ID becomes "".
        The result documents themselves are left unmodified.
        """
        similarities: List[float] = []
        ids: List[str] = []
        nodes: List[TextNode] = []

        for doc in results:
            node = metadata_dict_to_node(doc.metadata or {})
            node.set_content(doc.content)
            nodes.append(node)

            similarities.append(0.0 if doc.score is None else doc.score)
            ids.append("" if doc.document_id is None else doc.document_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

    def _to_zep_filters(self, filters: MetadataFilters) -> Dict[str, Any]:
        """Convert filters to Zep filters. Filters are ANDed together.

        Each filter value is compared as a string. It is written as an escaped,
        double-quoted literal so that quotes or backslashes in the value cannot
        break out of the jsonpath expression.
        """
        filter_conditions: List[Dict[str, Any]] = []

        for f in filters.legacy_filters():
            # json.dumps adds the surrounding quotes and escapes the content;
            # for values without special characters the output is unchanged.
            value_literal = json.dumps(str(f.value), ensure_ascii=False)
            filter_conditions.append(
                {"jsonpath": f"$[*] ? (@.{f.key} == {value_literal})"}
            )

        return {"where": {"and": filter_conditions}}

    def _prepare_search(self, query: VectorStoreQuery) -> Tuple[Any, Dict[str, Any]]:
        """Validate a query and build the keyword arguments for a Zep search.

        Shared by `query` and `aquery`. The `query` object is not modified.

        Raises:
            ValueError: If the collection is not initialized, or the query has
                neither a query string nor a query embedding.
        """
        collection = self._require_collection()

        if query.query_embedding is None and query.query_str is None:
            raise ValueError("query must have one of query_str or query_embedding")

        # If we have an embedding, we shouldn't use the query string
        # Zep does not allow both to be set
        text = None if query.query_embedding else query.query_str

        metadata_filters = None
        if query.filters is not None:
            metadata_filters = self._to_zep_filters(query.filters)

        search_kwargs: Dict[str, Any] = {
            "text": text,
            "embedding": query.query_embedding,
            "metadata": metadata_filters,
            "limit": query.similarity_top_k,
        }
        return collection, search_kwargs

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the index for the top k most similar nodes to the given query.

        Args:
            query (VectorStoreQuery): Query object containing either a query string
                or a query embedding.

        Returns:
            VectorStoreQueryResult: Result of the query, containing the most similar
            nodes, their similarities, and their IDs.

        Raises:
            ValueError: If the collection is not initialized, or the query has
                neither a query string nor a query embedding.
        """
        collection, search_kwargs = self._prepare_search(query)
        results = collection.search(**search_kwargs)
        return self._parse_query_result(results)

    async def aquery(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Asynchronously query the index for the top k most similar nodes to the
        given query.

        Args:
            query (VectorStoreQuery): Query object containing either a query string or
                a query embedding.

        Returns:
            VectorStoreQueryResult: Result of the query, containing the most similar
            nodes, their similarities, and their IDs.

        Raises:
            ValueError: If the collection is not initialized, or the query has
                neither a query string nor a query embedding.
        """
        collection, search_kwargs = self._prepare_search(query)
        results = await collection.asearch(**search_kwargs)
        return self._parse_query_result(results)