llama-index

azurecosmosmongo.py
249 строк · 8.7 Кб
Перенос по словам
1
"""Azure CosmosDB MongoDB vCore Vector store index.
2

3
An index that is built on top of an existing vector store.
4

5
"""
6

7
import logging
8
import os
9
from typing import Any, Dict, List, Optional, cast
10

11
from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode
12
from llama_index.legacy.vector_stores.types import (
13
    VectorStore,
14
    VectorStoreQuery,
15
    VectorStoreQueryResult,
16
)
17
from llama_index.legacy.vector_stores.utils import (
18
    legacy_metadata_dict_to_node,
19
    metadata_dict_to_node,
20
    node_to_metadata_dict,
21
)
22

23
# Module-level logger, named after this module, used for debug tracing of
# inserts and queries below.
logger = logging.getLogger(__name__)
24

25

26
class AzureCosmosDBMongoDBVectorSearch(VectorStore):
    """Azure CosmosDB MongoDB vCore Vector Store.

    To use, you should have both:
    - the ``pymongo`` python package installed
    - a connection string associated with an Azure CosmosDB MongoDB vCore Cluster
    """

    stores_text: bool = True
    flat_metadata: bool = True

    def __init__(
        self,
        mongodb_client: Optional[Any] = None,
        db_name: str = "default_db",
        collection_name: str = "default_collection",
        index_name: str = "default_vector_search_index",
        id_key: str = "id",
        embedding_key: str = "content_vector",
        text_key: str = "text",
        metadata_key: str = "metadata",
        cosmos_search_kwargs: Optional[Dict] = None,
        insert_kwargs: Optional[Dict] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the vector store and create its vector search index.

        Args:
            mongodb_client: An Azure CosmosDB MongoDB client (type: MongoClient,
                annotated as Any so that pymongo can be imported lazily). When
                omitted, a client is built from the ``AZURE_COSMOSDB_MONGODB_URI``
                environment variable.
            db_name: An Azure CosmosDB MongoDB database name.
            collection_name: An Azure CosmosDB collection name.
            index_name: An Azure CosmosDB MongoDB vCore Vector Search index name.
            id_key: The data field to use as the id.
            embedding_key: An Azure CosmosDB MongoDB field that will contain
                the embedding for each document.
            text_key: An Azure CosmosDB MongoDB field that will contain the
                text for each document.
            metadata_key: An Azure CosmosDB MongoDB field that will contain
                the metadata for each document.
            cosmos_search_kwargs: Search index options, such as kind, numLists,
                similarity, and dimensions.
            insert_kwargs: The kwargs used during `insert`.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            ImportError: If ``pymongo`` is not installed.
            ValueError: If no client is passed and the
                ``AZURE_COSMOSDB_MONGODB_URI`` environment variable is not set.
        """
        import_err_msg = "`pymongo` package not found, please run `pip install pymongo`"
        try:
            import pymongo
        except ImportError as err:
            raise ImportError(import_err_msg) from err

        if mongodb_client is not None:
            self._mongodb_client = cast(pymongo.MongoClient, mongodb_client)
        else:
            if "AZURE_COSMOSDB_MONGODB_URI" not in os.environ:
                raise ValueError(
                    "Must specify Azure cosmodb 'AZURE_COSMOSDB_MONGODB_URI' via env variable "
                    "if not directly passing in client."
                )
            self._mongodb_client = pymongo.MongoClient(
                os.environ["AZURE_COSMOSDB_MONGODB_URI"]
            )

        self._db_name = db_name
        self._collection_name = collection_name
        self._collection = self._mongodb_client[db_name][collection_name]
        self._index_name = index_name
        self._id_key = id_key
        self._embedding_key = embedding_key
        self._text_key = text_key
        self._metadata_key = metadata_key
        self._insert_kwargs = insert_kwargs or {}
        self._cosmos_search_kwargs = cosmos_search_kwargs or {}
        # The index is created eagerly so the store is queryable right after
        # construction.
        self._create_vector_search_index()

    def _create_vector_search_index(self) -> None:
        """Issue a ``createIndexes`` command for the vector search index.

        Index options come from ``cosmos_search_kwargs``; any option that was
        not supplied falls back to the default shown below.
        """
        options = self._cosmos_search_kwargs
        search_options = {
            "kind": options.get("kind", "vector-ivf"),
            "numLists": options.get("numLists", 1),
            "similarity": options.get("similarity", "COS"),
            "dimensions": options.get("dimensions", 1536),
        }
        index_spec = {
            "name": self._index_name,
            "key": {self._embedding_key: "cosmosSearch"},
            "cosmosSearchOptions": search_options,
        }
        db = self._mongodb_client[self._db_name]
        db.command(
            {
                "createIndexes": self._collection_name,
                "indexes": [index_spec],
            }
        )

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings
            **add_kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A List of ids for successfully added nodes. Empty when ``nodes``
            is empty.

        """
        # pymongo's insert_many rejects an empty document list, so an empty
        # batch is treated as a no-op instead of an error.
        if not nodes:
            return []

        ids = []
        data_to_insert = []
        for node in nodes:
            metadata = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )
            data_to_insert.append(
                {
                    self._id_key: node.node_id,
                    self._embedding_key: node.get_embedding(),
                    self._text_key: node.get_content(metadata_mode=MetadataMode.NONE)
                    or "",
                    self._metadata_key: metadata,
                }
            )
            ids.append(node.node_id)

        logger.debug("Inserting data into MongoDB: %s", data_to_insert)
        insert_result = self._collection.insert_many(
            data_to_insert, **self._insert_kwargs
        )
        logger.debug("Result of insert: %s", insert_result)
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete all nodes associated with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
            **delete_kwargs: Extra keyword arguments forwarded to the
                collection's delete call.

        """
        # A source document may have been split into several nodes, each
        # stored as its own record carrying the same ref_doc_id in its
        # metadata, so every match is removed rather than only the first.
        self._collection.delete_many(
            filter={self._metadata_key + ".ref_doc_id": ref_doc_id}, **delete_kwargs
        )

    @property
    def client(self) -> Any:
        """Return MongoDB client."""
        return self._mongodb_client

    def _query(self, query: VectorStoreQuery) -> VectorStoreQueryResult:
        """Run a vector search aggregation and convert the hits into nodes.

        Args:
            query: The query; its ``query_embedding`` and ``similarity_top_k``
                are sent to the search stage.

        Returns:
            A VectorStoreQueryResult with the nodes, scores, and ids in cursor order.

        Raises:
            ValueError: If ``query.filters`` is set (filters are unsupported).
        """
        # Reject unsupported input before building anything.
        if query.filters is not None:
            raise ValueError(
                "Metadata filters not implemented for azure cosmosdb mongodb yet."
            )

        params: Dict[str, Any] = {
            "vector": query.query_embedding,
            "path": self._embedding_key,
            "k": query.similarity_top_k,
        }
        query_field = {"$search": {"cosmosSearch": params, "returnStoredSource": True}}

        pipeline = [
            query_field,
            {
                "$project": {
                    "similarityScore": {"$meta": "searchScore"},
                    "document": "$$ROOT",
                }
            },
        ]

        logger.debug("Running query pipeline: %s", pipeline)
        cursor = self._collection.aggregate(pipeline)  # type: ignore

        top_k_nodes = []
        top_k_ids = []
        top_k_scores = []
        for res in cursor:
            document = res["document"]
            text = document.pop(self._text_key)
            score = res.pop("similarityScore")
            node_id = document.pop(self._id_key)
            metadata_dict = document.pop(self._metadata_key)

            try:
                node = metadata_dict_to_node(metadata_dict)
                node.set_content(text)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata_dict
                )

                node = TextNode(
                    text=text,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            top_k_ids.append(node_id)
            top_k_nodes.append(node)
            top_k_scores.append(score)

        result = VectorStoreQueryResult(
            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
        )
        logger.debug("Result of query: %s", result)
        return result

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query: a VectorStoreQuery object.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A VectorStoreQueryResult containing the results of the query.
        """
        return self._query(query)
250
llama-index

Использование cookies