llama-index

zep.py
340 строк · 11.4 Кб
Перенос по словам
1
import logging
2
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
3

4
from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode
5
from llama_index.legacy.vector_stores.types import (
6
    MetadataFilters,
7
    VectorStore,
8
    VectorStoreQuery,
9
    VectorStoreQueryResult,
10
)
11
from llama_index.legacy.vector_stores.utils import (
12
    metadata_dict_to_node,
13
    node_to_metadata_dict,
14
)
15

16
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    # Needed only for static type checking of the "ZepDocument" string
    # annotations below; at runtime zep-python is imported lazily inside the
    # methods that use it.
    from zep_python.document import Document as ZepDocument
20

21

22
class ZepVectorStore(VectorStore):
    """Zep Vector Store for storing and retrieving embeddings.

    Zep supports both normalized and non-normalized embeddings. Cosine similarity is
    used to compute distance and the returned score is normalized to be between 0 and 1.

    Args:
        collection_name (str): Name of the Zep collection in which to store embeddings.
        api_url (str): URL of the Zep API.
        api_key (str, optional): Key for the Zep API. Defaults to None.
        collection_description (str, optional): Description of the collection.
            Defaults to None.
        collection_metadata (dict, optional): Metadata of the collection.
            Defaults to None.
        embedding_dimensions (int, optional): Dimensions of the embeddings.
            Defaults to None.
        is_auto_embedded (bool, optional): Whether the embeddings are auto-embedded.
            Defaults to False.
    """

    stores_text = True
    flat_metadata = False

    def __init__(
        self,
        collection_name: str,
        api_url: str,
        api_key: Optional[str] = None,
        collection_description: Optional[str] = None,
        collection_metadata: Optional[Dict[str, Any]] = None,
        embedding_dimensions: Optional[int] = None,
        is_auto_embedded: bool = False,
        **kwargs: Any,
    ) -> None:
        """Connect to Zep and fetch the collection, creating it if it is missing.

        The description, metadata, dimensions and auto-embed flag are only used
        when the collection has to be created. Extra ``kwargs`` are accepted
        and ignored.

        Raises:
            ImportError: If the ``zep-python`` package is not installed.
            ValueError: If the collection does not exist and
                ``embedding_dimensions`` was not given.
        """
        import_err_msg = (
            "`zep-python` package not found, please run `pip install zep-python`"
        )
        try:
            import zep_python
        except ImportError as err:
            raise ImportError(import_err_msg) from err

        from zep_python import ZepClient
        from zep_python.document import DocumentCollection

        self._client = ZepClient(base_url=api_url, api_key=api_key)
        self._collection: Optional[DocumentCollection] = None

        try:
            self._collection = self._client.document.get_collection(
                name=collection_name
            )
        except zep_python.NotFoundError:
            # The collection is missing: create it, which needs the dimensions.
            if embedding_dimensions is None:
                raise ValueError(
                    "embedding_dimensions must be specified if collection does not"
                    " exist"
                )
            logger.info(
                "Collection %s does not exist, "
                "will try creating one with dimensions=%s",
                collection_name,
                embedding_dimensions,
            )

            self._collection = self._client.document.add_collection(
                name=collection_name,
                embedding_dimensions=embedding_dimensions,
                is_auto_embedded=is_auto_embedded,
                description=collection_description,
                metadata=collection_metadata,
            )

    @property
    def client(self) -> Any:
        """Get client."""
        return self._client

    def _require_collection(self) -> Any:
        """Return the backing Zep collection.

        Raises:
            ValueError: If ``self._collection`` is not a ``DocumentCollection``.
        """
        from zep_python.document import DocumentCollection

        if not isinstance(self._collection, DocumentCollection):
            raise ValueError("Collection not initialized")
        return self._collection

    def _require_writable_collection(self) -> Any:
        """Return the collection after checking it accepts supplied embeddings.

        Raises:
            ValueError: If the collection is not initialized or is auto-embedded.
        """
        collection = self._require_collection()
        if collection.is_auto_embedded:
            raise ValueError("Collection is auto embedded, cannot add embeddings")
        return collection

    def _prepare_documents(
        self, nodes: List[BaseNode]
    ) -> Tuple[List["ZepDocument"], List[str]]:
        """Convert nodes into Zep documents.

        Args:
            nodes (List[BaseNode]): Nodes to convert; each must have content.

        Returns:
            Tuple[List[ZepDocument], List[str]]: The documents and the matching
                node IDs, in input order.

        Raises:
            ValueError: If a node has no content.
        """
        from zep_python.document import Document as ZepDocument

        docs: List["ZepDocument"] = []
        ids: List[str] = []

        for node in nodes:
            # The text is stored in the document's `content` field, so it is
            # stripped from the serialized metadata.
            metadata_dict: Dict[str, Any] = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )

            if not node.get_content():
                raise ValueError("No content to add to Zep")

            docs.append(
                ZepDocument(
                    document_id=node.node_id,
                    content=node.get_content(metadata_mode=MetadataMode.NONE),
                    embedding=node.get_embedding(),
                    metadata=metadata_dict,
                )
            )
            ids.append(node.node_id)

        return docs, ids

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to the collection.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings.

        Returns:
            List[str]: List of IDs of the added documents.

        Raises:
            ValueError: If the collection is not initialized, is auto-embedded,
                or a node has no content.
        """
        collection = self._require_writable_collection()
        docs, ids = self._prepare_documents(nodes)
        collection.add_documents(docs)
        return ids

    async def async_add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Asynchronously add nodes to the collection.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings.

        Returns:
            List[str]: List of IDs of the added documents.

        Raises:
            ValueError: If the collection is not initialized, is auto-embedded,
                or a node has no content.
        """
        collection = self._require_writable_collection()
        docs, ids = self._prepare_documents(nodes)
        await collection.aadd_documents(docs)
        return ids

    @staticmethod
    def _uuid_for_delete(
        ref_doc_id: Optional[str], delete_kwargs: Dict[str, Any]
    ) -> Any:
        """Validate delete arguments and return the document UUID to delete.

        Raises:
            NotImplementedError: If a non-empty ``ref_doc_id`` is given.
            ValueError: If ``delete_kwargs`` has no "uuid" key.
        """
        if ref_doc_id:
            raise NotImplementedError(
                "Delete by ref_doc_id not yet implemented for Zep."
            )
        if "uuid" not in delete_kwargs:
            raise ValueError("uuid must be specified")
        return delete_kwargs["uuid"]

    def delete(
        self, ref_doc_id: Optional[str] = None, **delete_kwargs: Any
    ) -> None:  # type: ignore
        """Delete a document from the collection.

        Args:
            ref_doc_id (Optional[str]): ID of the document to delete.
                Not currently supported.
            delete_kwargs: Must contain "uuid" key with UUID of the document to delete.

        Raises:
            ValueError: If the collection is not initialized or "uuid" is missing.
            NotImplementedError: If a non-empty ``ref_doc_id`` is given.
        """
        collection = self._require_collection()
        uuid = self._uuid_for_delete(ref_doc_id, delete_kwargs)
        collection.delete_document(uuid=uuid)

    async def adelete(
        self, ref_doc_id: Optional[str] = None, **delete_kwargs: Any
    ) -> None:  # type: ignore
        """Asynchronously delete a document from the collection.

        Args:
            ref_doc_id (Optional[str]): ID of the document to delete.
                Not currently supported.
            delete_kwargs: Must contain "uuid" key with UUID of the document to delete.

        Raises:
            ValueError: If the collection is not initialized or "uuid" is missing.
            NotImplementedError: If a non-empty ``ref_doc_id`` is given.
        """
        collection = self._require_collection()
        uuid = self._uuid_for_delete(ref_doc_id, delete_kwargs)
        await collection.adelete_document(uuid=uuid)

    def _parse_query_result(
        self, results: List["ZepDocument"]
    ) -> VectorStoreQueryResult:
        """Convert Zep search results into a ``VectorStoreQueryResult``.

        A missing score becomes ``0.0`` and a missing document ID becomes ``""``.
        The Zep result objects themselves are left unmodified.
        """
        similarities: List[float] = []
        ids: List[str] = []
        nodes: List[BaseNode] = []

        for doc in results:
            node = metadata_dict_to_node(doc.metadata or {})
            node.set_content(doc.content)
            nodes.append(node)

            similarities.append(0.0 if doc.score is None else doc.score)
            ids.append("" if doc.document_id is None else doc.document_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

    def _to_zep_filters(self, filters: MetadataFilters) -> Dict[str, Any]:
        """Convert filters to Zep filters. Filters are ANDed together.

        Each filter becomes a JSONPath equality test against the value's string
        form. The value is emitted as an escaped, double-quoted literal so that
        quotes or backslashes in it cannot break or alter the expression.
        """
        import json

        filter_conditions: List[Dict[str, Any]] = []

        for f in filters.legacy_filters():
            # json.dumps yields the same `"value"` text as plain interpolation
            # for ordinary values and escapes `"` and `\` where present.
            # NOTE(review): assumes Zep accepts JSON-style escapes in JSONPath
            # string literals - confirm against the Zep server.
            quoted_value = json.dumps(str(f.value), ensure_ascii=False)
            filter_conditions.append(
                {"jsonpath": f"$[*] ? (@.{f.key} == {quoted_value})"}
            )

        return {"where": {"and": filter_conditions}}

    def _build_search_kwargs(self, query: VectorStoreQuery) -> Dict[str, Any]:
        """Build the keyword arguments passed to the collection's search call.

        The ``query`` object is not modified.

        Raises:
            ValueError: If both ``query_str`` and ``query_embedding`` are None.
        """
        if query.query_embedding is None and query.query_str is None:
            raise ValueError("query must have one of query_str or query_embedding")

        # If we have an embedding, we shouldn't use the query string:
        # Zep does not allow both to be set.
        text = None if query.query_embedding else query.query_str

        metadata_filters = None
        if query.filters is not None:
            metadata_filters = self._to_zep_filters(query.filters)

        return {
            "text": text,
            "embedding": query.query_embedding,
            "metadata": metadata_filters,
            "limit": query.similarity_top_k,
        }

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query the index for the top k most similar nodes to the given query.

        Args:
            query (VectorStoreQuery): Query object containing either a query string
                or a query embedding. If both are set, the embedding is used.

        Returns:
            VectorStoreQueryResult: Result of the query, containing the most similar
                nodes, their similarities, and their IDs.

        Raises:
            ValueError: If the collection is not initialized, or the query has
                neither a query string nor a query embedding.
        """
        collection = self._require_collection()
        results = collection.search(**self._build_search_kwargs(query))
        return self._parse_query_result(results)

    async def aquery(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Asynchronously query the index for the top k most similar nodes.

        Args:
            query (VectorStoreQuery): Query object containing either a query string or
                a query embedding. If both are set, the embedding is used.

        Returns:
            VectorStoreQueryResult: Result of the query, containing the most similar
                nodes, their similarities, and their IDs.

        Raises:
            ValueError: If the collection is not initialized, or the query has
                neither a query string nor a query embedding.
        """
        collection = self._require_collection()
        results = await collection.asearch(**self._build_search_kwargs(query))
        return self._parse_query_result(results)
341
llama-index

Использование cookies