llama-index

Форк
0
256 строк · 7.2 Кб
1
"""Jaguar Reader."""
2

3
import datetime
4
import json
5
from typing import Any, List, Optional
6

7
from llama_index.legacy.readers.base import BaseReader
8
from llama_index.legacy.schema import Document
9

10

11
class JaguarReader(BaseReader):
    """Jaguar reader.

    Retrieve documents from an existing persisted Jaguar store, either by
    vector similarity search (when an embedding is supplied) or by a plain,
    optionally filtered, select on the store.
    """

    def __init__(
        self,
        pod: str,
        store: str,
        vector_index: str,
        vector_type: str,
        vector_dimension: int,
        url: str,
    ):
        """Constructor of JaguarReader.

        Args:
            pod: name of the pod (database)
            store: name of vector store in the pod
            vector_index: name of vector index of the store
            vector_type: type of the vector index
            vector_dimension: dimension of the vector index
            url: end point URL of jaguar http server

        Raises:
            ValueError: if the jaguardb-http-client package cannot be imported.
        """
        self._pod = pod
        self._store = store
        self._vector_index = vector_index
        self._vector_type = vector_type
        self._vector_dimension = vector_dimension

        try:
            from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
        except ImportError as err:
            # ValueError (not ImportError) is kept so existing callers that
            # catch it keep working; the original cause is chained.
            raise ValueError(
                "Could not import jaguardb-http-client python package. "
                "Please install it with `pip install -U jaguardb-http-client`"
            ) from err

        self._jag = JaguarHttpClient(url)
        # Both are filled in by login(); an empty token means "not logged in".
        self._jaguar_api_key: Optional[str] = ""
        self._token = ""

    def login(
        self,
        jaguar_api_key: Optional[str] = "",
    ) -> bool:
        """Login to jaguar server with a jaguar_api_key or let self._jag find a key.

        Args:
            optional jaguar_api_key (str): API key of user to jaguardb server.
            If not provided (empty string or None), jaguar api key is read
            from environment variable JAGUAR_API_KEY or from file $HOME/.jagrc
        Returns:
            True if successful; False if not successful
        """
        # Treat None like "" so an explicit None also falls back to the
        # client's own key lookup instead of being sent to the server.
        if not jaguar_api_key:
            jaguar_api_key = self._jag.getApiKey()
        self._jaguar_api_key = jaguar_api_key
        self._token = self._jag.login(jaguar_api_key)
        return self._token != ""

    def logout(self) -> None:
        """Logout from jaguar server to cleanup resources.

        Args: no args
        Returns: None
        """
        self._jag.logout(self._token)

    def load_data(
        self,
        embedding: Optional[List[float]] = None,
        k: int = 10,
        metadata_fields: Optional[List[str]] = None,
        where: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Load data from the jaguar vector store.

        Args:
            embedding: list of float number for vector. If this
                       is given, it returns topk similar documents.
            k: Number of results to return.
            where: "a = '100' or ( b > 100 and c < 200 )"
                   If embedding is not given, it finds values
                   of columns in metadata_fields, and the text value.
            metadata_fields: Optional[List[str]] a list of metadata fields to load
                       in addition to the text document
            **kwargs: forwarded to the underlying loader; the similarity
                       search reads the optional keys "args" and "fetch_k".

        Returns:
            List of documents
        """
        if embedding is None:
            return self._load_store_data(
                k=k, metadata_fields=metadata_fields, where=where, **kwargs
            )
        return self._load_similar_data(
            embedding=embedding,
            k=k,
            metadata_fields=metadata_fields,
            where=where,
            **kwargs,
        )

    def _load_similar_data(
        self,
        embedding: List[float],
        k: int = 10,
        metadata_fields: Optional[List[str]] = None,
        where: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Load data by similarity search from the jaguar store.

        Args:
            embedding: query vector; each component is rendered with str().
            k: value of the ``topk`` search option.
            metadata_fields: extra columns requested through the ``metadata``
                search option and copied into each document's metadata.
            where: optional condition appended after ``where``.
            **kwargs: ``args`` (str) is appended verbatim to the search
                options, e.g. for time decay; ``fetch_k`` defaults to -1.

        Returns:
            One Document per returned row, with ``zid`` and ``score`` (plus
            the requested metadata fields) in its metadata. Empty list when
            the query yields nothing.
        """
        # args is additional search conditions, such as time decay
        args = kwargs.get("args")
        fetch_k = kwargs.get("fetch_k", -1)

        vector_literal = ",".join(str(value) for value in embedding)
        options = [
            f"topk={k}",
            f"fetch_k={fetch_k}",
            f"type={self._vector_type}",
            "with_score",
            "with_text",
        ]
        if args is not None:
            options.append(args)
        # An empty list is treated like None so no dangling "metadata=" option
        # is emitted.
        if metadata_fields:
            options.append("metadata=" + "&".join(metadata_fields))

        # NOTE(review): `where`, `args` and the field names are concatenated
        # into the statement unescaped; callers must not pass untrusted input.
        query = (
            f"select similarity({self._vector_index},'{vector_literal}',"
            f"'{','.join(options)}') from {self._pod}.{self._store}"
        )
        if where is not None:
            query += " where " + where

        rows = self.run(query)
        # run() returns {} (never None) on failure, so test for emptiness.
        if not rows:
            return []

        return [
            self._build_document(row, "text", metadata_fields, with_score=True)
            for row in rows
        ]

    def _load_store_data(
        self,
        k: int = 10,
        metadata_fields: Optional[List[str]] = None,
        where: Optional[str] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Load a number of document from the jaguar store.

        Args:
            k: value of the ``limit`` clause.
            metadata_fields: extra columns to select and copy into each
                document's metadata.
            where: optional condition appended after ``where``.
            **kwargs: accepted for signature parity with the similarity
                loader; not used here.

        Returns:
            One Document per returned row, with ``zid`` (plus the requested
            metadata fields) in its metadata. Empty list when the query
            yields nothing.
        """
        text_column = self._vector_index + ":text"

        columns = ["zid", text_column]
        # An empty list is treated like None; otherwise the select list would
        # end with a trailing comma and the statement would be malformed.
        if metadata_fields:
            columns.extend(metadata_fields)

        # NOTE(review): `where` and the field names are concatenated into the
        # statement unescaped; callers must not pass untrusted input.
        query = f"select {','.join(columns)} from {self._pod}.{self._store}"
        if where is not None:
            query += " where " + where
        query += f" limit {k}"

        rows = self.run(query)
        # run() returns {} (never None) on failure, so test for emptiness.
        if not rows:
            return []

        docs = []
        for row in rows:
            # Rows are decoded from JSON text here; a row that is already a
            # decoded object is used as-is instead of failing in json.loads.
            if isinstance(row, (str, bytes, bytearray)):
                row = json.loads(row)
            docs.append(self._build_document(row, text_column, metadata_fields))
        return docs

    @staticmethod
    def _build_document(
        record: Any,
        text_key: str,
        metadata_fields: Optional[List[str]],
        with_score: bool = False,
    ) -> Document:
        """Convert one decoded result row into a Document.

        Args:
            record: mapping for a single row; must contain ``zid``,
                ``text_key`` and every name in ``metadata_fields``.
            text_key: key under which the document text is stored.
            metadata_fields: names copied from the row into the metadata.
            with_score: also copy the row's ``score`` into the metadata.

        Returns:
            Document whose id is the row's ``zid``.

        Raises:
            KeyError: if the row lacks one of the expected keys.
        """
        zid = record["zid"]
        metadata = {"zid": zid}
        if with_score:
            metadata["score"] = record["score"]
        for name in metadata_fields or []:
            metadata[name] = record[name]

        return Document(
            id_=zid,
            text=record[text_key],
            metadata=metadata,
        )

    def run(self, query: str) -> Any:
        """Run any query statement in jaguardb.

        Args:
            query (str): query statement to jaguardb
        Returns:
            The JSON-decoded response body, or an empty dict when there is
            no login token or the response body cannot be decoded as JSON.
        """
        if self._token == "":
            return {}

        resp = self._jag.post(query, self._token, False)
        try:
            return json.loads(resp.text)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-text body. Best effort: report "no result" to the caller.
            return {}

    def prt(self, msg: str) -> None:
        """Append a timestamped debug message to /tmp/debugjaguarrdr.log.

        Args:
            msg: text to record after the current local time.
        """
        nows = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Explicit encoding so non-ASCII messages do not depend on the locale.
        with open("/tmp/debugjaguarrdr.log", "a", encoding="utf-8") as file:
            print(f"{nows} msg={msg}", file=file, flush=True)
257

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.