llama-index

Форк
0
505 строк · 15.2 Кб
1
""" Jaguar Vector Store.
2

3
. A distributed vector database
4
. The ZeroMove feature enables instant horizontal scalability
5
. Multimodal: embeddings, text, images, videos, PDFs, audio, time series, and geospatial
6
. All-masters: allows both parallel reads and writes
7
. Anomaly detection capabilities: anomaly and anomalous
8
. RAG support: combines LLMs with proprietary and real-time data
9
. Shared metadata: sharing of metadata across multiple vector indexes
10
. Distance metrics: Euclidean, Cosine, InnerProduct, Manhattan, Chebyshev, Hamming, Jaccard, Minkowski
11

12
"""
13

14
import datetime
15
import json
16
import logging
17
from typing import Any, List, Optional, Tuple, Union, cast
18

19
from llama_index.legacy.schema import BaseNode, Document, TextNode
20
from llama_index.legacy.vector_stores.types import (
21
    VectorStore,
22
    VectorStoreQuery,
23
    VectorStoreQueryResult,
24
)
25

26
logger = logging.getLogger(__name__)
27

28

29
class JaguarVectorStore(VectorStore):
    """Jaguar vector store.

    See http://www.jaguardb.com
    See http://github.com/fserv/jaguar-sdk

    All backend access goes through :meth:`run`, which refuses to send
    anything until :meth:`login` has stored a non-empty token.

    Example:
       .. code-block:: python

           vectorstore = JaguarVectorStore(
               pod = 'vdb',
               store = 'mystore',
               vector_index = 'v',
               vector_type = 'cosine_fraction_float',
               vector_dimension = 1536,
               url='http://192.168.8.88:8080/fwww/',
           )
    """

    stores_text: bool = True

    def __init__(
        self,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        url: str,
    ):
        """Constructor of JaguarVectorStore.

        Args:
            pod: str:  name of the pod (database)
            store: str:  name of vector store in the pod
            vector_index: str:  name of vector index of the store
            vector_type: str:  type of the vector index
            vector_dimension: int:  dimension of the vector index
            url: str:  URL end point of jaguar http server

        Raises:
            ValueError: if the jaguardb-http-client package cannot be imported.
        """
        self._pod = pod
        self._store = store
        self._vector_index = vector_index
        self._vector_type = vector_type
        self._vector_dimension = vector_dimension

        try:
            from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
        except ImportError as err:
            logger.error("E0001 error import JaguarHttpClient")
            # Keep ValueError for callers that already catch it, but chain the
            # original ImportError so the real cause stays in the traceback.
            raise ValueError(
                "Could not import jaguardb-http-client python package. "
                "Please install it with `pip install -U jaguardb-http-client`"
            ) from err

        self._jag = JaguarHttpClient(url)
        # Empty token means "not logged in"; run() checks for it.
        self._token = ""

    def __del__(self) -> None:
        """Do nothing on garbage collection; call logout() explicitly."""

    @classmethod
    def class_name(cls) -> str:
        """Return the name of this vector store class."""
        return "JaguarVectorStore"

    @property
    def client(self) -> Any:
        """Get client."""
        return self._jag

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings
            add_kwargs: forwarded unchanged to add_text(); when
                'use_node_metadata' is exactly True, each node's metadata
                is stored as well.

        Returns:
            The ids returned by add_text(), one per node, in input order.
        """
        use_node_metadata = add_kwargs.get("use_node_metadata", False)
        ids = []
        for node in nodes:
            text = node.get_text()
            embedding = node.get_embedding()
            metadata = node.metadata if use_node_metadata is True else None
            ids.append(self.add_text(text, embedding, metadata, **add_kwargs))

        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete the records whose zid equals ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.
        """
        # NOTE(review): ref_doc_id is interpolated into the statement as-is;
        # confirm callers never pass values containing a single quote.
        podstore = self._pod + "." + self._store
        self.run("delete from " + podstore + " where zid='" + ref_doc_id + "'")

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query: VectorStoreQuery object
            kwargs:  may contain 'where', 'metadata_fields', 'args', 'fetch_k'
        """
        found = self.similarity_search_with_score(
            query.query_embedding, k=query.similarity_top_k, form="node", **kwargs
        )
        # form="node" always yields the three-list tuple.
        nodes, ids, simscores = cast(
            Tuple[List[TextNode], List[str], List[float]], found
        )
        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=simscores)

    def load_documents(
        self, embedding: List[float], k: int, **kwargs: Any
    ) -> List[Document]:
        """Query index to load top k most similar documents.

        Args:
            embedding: a list of floats
            k: topK number
            kwargs:  may contain 'where', 'metadata_fields', 'args', 'fetch_k'
        """
        return cast(
            List[Document],
            self.similarity_search_with_score(embedding, k=k, form="doc", **kwargs),
        )

    def create(
        self,
        metadata_fields: str,
        text_size: int,
    ) -> None:
        """
        Create the vector store on the backend database.

        Args:
            metadata_fields (str):  extra metadata columns and types, appended
                verbatim after the text column definition
            text_size (int):  size used in the char(...) type of the text column

        Returns:
            None. The backend reply from run() is discarded.
        """
        podstore = self._pod + "." + self._store

        # v:text column is required.
        q = "create store "
        q += podstore
        q += f" ({self._vector_index} vector({self._vector_dimension},"
        q += f" '{self._vector_type}'),"
        q += f"  v:text char({text_size}),"
        q += metadata_fields + ")"
        self.run(q)

    def add_text(
        self,
        text: str,
        embedding: List[float],
        metadata: Optional[dict] = None,
        **kwargs: Any,
    ) -> str:
        """
        Insert one text with its embedding (and optional metadata).

        Args:
          text: text string to add to the jaguar vector store.
          embedding: embedding vector of the text, list of floats
          metadata: {'file_path': '../data/paul_graham/paul_graham_essay.txt',
                          'file_name': 'paul_graham_essay.txt',
                          'file_type': 'text/plain',
                          'file_size': 75042,
                          'creation_date': '2023-12-24',
                          'last_modified_date': '2023-12-24',
                          'last_accessed_date': '2023-12-28'}
          kwargs: file_column=name_of_file_column
                  text_tag=prefix put in front of the text

        Returns:
            id from adding the text into the vectorstore, or "" when the
            text column lookup, the file upload, or the insert does not
            produce a usable reply.
        """
        text = text.replace("'", "\\'")
        vcol = self._vector_index
        filecol = kwargs.get("file_column", "")
        text_tag = kwargs.get("text_tag", "")

        if text_tag != "":
            text = text_tag + " " + text

        podstore = self._pod + "." + self._store

        # Ask the backend for the name of the text column of this index.
        # run() yields {} on failure, so check for the key rather than "".
        js = self.run("textcol " + podstore + "." + vcol)
        textcol = js.get("data") if isinstance(js, dict) else None
        if not textcol:
            return ""

        columns: List[str] = []
        values: List[str] = []
        with_file = False
        if metadata is not None:
            nvec, vvec, filepath = self._parseMeta(metadata, filecol)
            if filecol != "":
                if not self._jag.postFile(self._token, filepath, 1):
                    return ""
                with_file = True
            columns.extend(str(name) for name in nvec)
            # Metadata values may be non-strings (e.g. an int file size).
            # NOTE(review): values are not quote-escaped, unlike the text.
            values.extend(str(value) for value in vvec)

        # Layout: col1,col2,...,vector_col,text_col. Building both lists and
        # joining once keeps an empty metadata dict from emitting a leading
        # comma in the column list.
        columns += [vcol, textcol]
        values += [",".join(str(x) for x in embedding), text]

        q = "insert into " + podstore + " ("
        q += ",".join(columns) + ") values ('"
        q += "','".join(values) + "')"

        js = self.run(q, with_file)
        if not isinstance(js, dict):
            return ""
        return js.get("zid", "")

    def similarity_search_with_score(
        self,
        embedding: Optional[List[float]],
        k: int = 3,
        form: str = "node",
        **kwargs: Any,
    ) -> Union[Tuple[List[TextNode], List[str], List[float]], List[Document]]:
        """Return nodes most similar to query embedding, along with ids and scores.

        Args:
            embedding: embedding of text to look up.
            k: Number of nodes to return. Defaults to 3.
            form: if "node", return Tuple[List[TextNode], List[str], List[float]]
                  otherwise, return List[Document]
            kwargs: may have where, metadata_fields, args, fetch_k
        Returns:
            Tuple(list of nodes, list of ids, list of similarity scores) for
            form "node", else a list of documents. The result is empty (in the
            matching shape) when embedding is None or the reply is not a list.
        """
        where = kwargs.get("where", None)
        metadata_fields = kwargs.get("metadata_fields", None)
        args = kwargs.get("args", None)
        fetch_k = kwargs.get("fetch_k", -1)

        as_nodes = form == "node"
        nodes: List[TextNode] = []
        ids: List[str] = []
        simscores: List[float] = []
        docs: List[Document] = []

        if embedding is None:
            return (nodes, ids, simscores) if as_nodes else docs

        options = [
            "topk=" + str(k),
            "fetch_k=" + str(fetch_k),
            "type=" + self._vector_type,
            "with_score=yes",
            "with_text=yes",
        ]
        if args is not None:
            options.append(args)
        if metadata_fields is not None:
            options.append("metadata=" + "&".join(metadata_fields))

        qv_comma = ",".join(str(f) for f in embedding)
        q = "select similarity(" + self._vector_index + ",'" + qv_comma + "','"
        q += ",".join(options)
        q += "') from " + self._pod + "." + self._store
        if where is not None:
            q += " where " + where

        jarr = self.run(q)
        # run() yields {} when not logged in or on an unparsable reply; only a
        # list can be walked as result rows.
        if not isinstance(jarr, list):
            return (nodes, ids, simscores) if as_nodes else docs

        for row in jarr:
            score = row["score"]
            text = row["text"]
            zid = row["zid"]

            md = {"zid": zid}
            if metadata_fields is not None:
                for field in metadata_fields:
                    md[field] = row[field]

            if as_nodes:
                nodes.append(TextNode(id_=zid, text=text, metadata=md))
                ids.append(zid)
                simscores.append(float(score))
            else:
                docs.append(Document(id_=zid, text=text, metadata=md))

        return (nodes, ids, simscores) if as_nodes else docs

    def is_anomalous(
        self,
        node: BaseNode,
        **kwargs: Any,
    ) -> bool:
        """Detect if the given node is anomalous with respect to the dataset.

        Args:
            node: node whose embedding is checked
        Returns:
            True if the backend answers "YES"; False otherwise, including
            when the reply is empty or not a list.
        """
        vcol = self._vector_index
        vtype = self._vector_type
        qv_comma = ",".join(str(f) for f in node.get_embedding())
        podstore = self._pod + "." + self._store
        q = "select anomalous(" + vcol + ", '" + qv_comma + "', 'type=" + vtype + "')"
        q += " from " + podstore

        js = self.run(q)
        # run() yields {} on failure, which cannot be indexed with [0].
        if not isinstance(js, list) or not js:
            return False
        jd = json.loads(js[0])
        return jd["anomalous"] == "YES"

    def run(self, query: str, withFile: bool = False) -> Any:
        """Run any query statement in jaguardb.

        Args:
            query (str): query statement to jaguardb
            withFile (bool): passed through to the client's post() call
        Returns:
            {} when there is no login token or the reply is not valid
            JSON; otherwise the decoded JSON reply.
        """
        if self._token == "":
            logger.error(f"E0005 error run({query})")
            return {}

        resp = self._jag.post(query, self._token, withFile)
        try:
            return json.loads(resp.text)
        except (TypeError, ValueError):
            # Best effort by design: callers treat {} as "no result".
            logger.error("error run(): response is not valid JSON")
            return {}

    def count(self) -> int:
        """Count records of a store in jaguardb.

        Args: no args
        Returns: (int) number of records in pod store; 0 when the reply is
            empty or not a list
        """
        podstore = self._pod + "." + self._store
        js = self.run("select count() from " + podstore)
        # run() yields {} on failure, which cannot be indexed with [0].
        if not isinstance(js, list) or not js:
            return 0
        jd = json.loads(js[0])
        return int(jd["data"])

    def clear(self) -> None:
        """Delete all records in jaguardb.

        Args: No args
        Returns: None
        """
        podstore = self._pod + "." + self._store
        self.run("truncate store " + podstore)

    def drop(self) -> None:
        """Drop or remove a store in jaguardb.

        Args: no args
        Returns: None
        """
        podstore = self._pod + "." + self._store
        self.run("drop store " + podstore)

    def prt(self, msg: str) -> None:
        """Append a timestamped debug message to /tmp/debugjaguar.log."""
        nows = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open("/tmp/debugjaguar.log", "a") as file:
            print(f"{nows} msg={msg}", file=file, flush=True)

    def login(
        self,
        jaguar_api_key: Optional[str] = "",
    ) -> bool:
        """Login to jaguar server with a jaguar_api_key or let self._jag find a key.

        Args:
            optional jaguar_api_key (str): API key of user to jaguardb server;
                when empty or None the key is taken from self._jag.getApiKey()
        Returns:
            True if successful; False if not successful
        """
        # The annotation allows None, so treat it like the "" default.
        if not jaguar_api_key:
            jaguar_api_key = self._jag.getApiKey()
        self._jaguar_api_key = jaguar_api_key
        self._token = self._jag.login(jaguar_api_key)
        if self._token == "":
            logger.error("E0001 error init(): invalid jaguar_api_key")
            return False
        return True

    def logout(self) -> None:
        """Logout to cleanup resources.

        Args: no args
        Returns: None
        """
        self._jag.logout(self._token)

    def _parseMeta(self, nvmap: dict, filecol: str) -> Tuple[List[str], List[Any], str]:
        """Split a metadata dict into parallel name and value lists.

        Args:
            nvmap: metadata name -> value mapping
            filecol: name of the file column, or "" when there is none

        Returns:
            (names, values, filepath). With a non-empty filecol that is present
            in nvmap, that entry comes first and its value is also returned as
            filepath; otherwise filepath is "".
        """
        if filecol == "":
            return list(nvmap.keys()), list(nvmap.values()), ""

        nvec: List[str] = []
        vvec: List[Any] = []
        filepath = ""
        if filecol in nvmap:
            filepath = nvmap[filecol]
            nvec.append(filecol)
            vvec.append(filepath)

        for name, value in nvmap.items():
            if name != filecol:
                nvec.append(name)
                vvec.append(value)

        return nvec, vvec, filepath
506

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.