"""Tair Vector store index.

An index that is built on top of Alibaba Cloud's Tair database.
"""

import logging
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from llama_index.legacy.schema import (
    BaseNode,
    MetadataMode,
    NodeRelationship,
    RelatedNodeInfo,
    TextNode,
)
from llama_index.legacy.vector_stores.types import (
    MetadataFilters,
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index.legacy.vector_stores.utils import node_to_metadata_dict

_logger = logging.getLogger(__name__)


if TYPE_CHECKING:
    from tair import Tair


def _to_filter_expr(filters: MetadataFilters) -> str:
    conditions = []
    for f in filters.legacy_filters():
        value = str(f.value)
        if isinstance(f.value, str):
            value = '"' + value + '"'
        conditions.append(f"{f.key}=={value}")
    return "&&".join(conditions)


class TairVectorStore(VectorStore):
    stores_text = True
    stores_node = True
    flat_metadata = False

    def __init__(
        self,
        tair_url: str,
        index_name: str,
        index_type: str = "HNSW",
        index_args: Optional[Dict[str, Any]] = None,
        overwrite: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize TairVectorStore.

        Two index types are available: FLAT & HNSW.

        index args for HNSW:
            - ef_construct
            - M
            - ef_search

        Detailed info for these arguments can be found here:
        https://www.alibabacloud.com/help/en/tair/latest/tairvector#section-c76-ull-5mk

        Args:
            index_name (str): Name of the index.
            index_type (str): Type of the index. Defaults to 'HNSW'.
            index_args (Dict[str, Any]): Arguments for the index. Defaults to None.
            tair_url (str): URL for the Tair instance.
            overwrite (bool): Whether to overwrite the index if it already exists.
                Defaults to False.
            kwargs (Any): Additional arguments to pass to the Tair client.

        Raises:
            ValueError: If tair-py is not installed
            ValueError: If failed to connect to Tair instance

        Examples:
            >>> from llama_index.legacy.vector_stores.tair import TairVectorStore
            >>> # Create a TairVectorStore
            >>> vector_store = TairVectorStore(
            >>>     tair_url="redis://{username}:{password}@r-bp****************.redis.rds.aliyuncs.com:{port}",
            >>>     index_name="my_index",
            >>>     index_type="HNSW",
            >>>     index_args={"M": 16, "ef_construct": 200},
            >>>     overwrite=True)

        """
        try:
            from tair import Tair, tairvector  # noqa
        except ImportError:
            raise ValueError(
                "Could not import tair-py python package. "
                "Please install it with `pip install tair`."
            )
        try:
            self._tair_client = Tair.from_url(tair_url, **kwargs)
        except ValueError as e:
            raise ValueError(f"Tair failed to connect: {e}")

        # index identifiers
        self._index_name = index_name
        self._index_type = index_type
        self._metric_type = "L2"
        self._overwrite = overwrite
        self._index_args = {}
        self._query_args = {}
        if index_type == "HNSW":
            if index_args is not None:
                ef_construct = index_args.get("ef_construct", 500)
                M = index_args.get("M", 24)
                ef_search = index_args.get("ef_search", 400)
            else:
                ef_construct = 500
                M = 24
                ef_search = 400

            self._index_args = {"ef_construct": ef_construct, "M": M}
            self._query_args = {"ef_search": ef_search}

    @property
    def client(self) -> "Tair":
        """Return the Tair client instance."""
        return self._tair_client

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to the index.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings

        Returns:
            List[str]: List of ids of the documents added to the index.
        """
        # check to see if empty document list was passed
        if len(nodes) == 0:
            return []

        # set vector dim for creation if index doesn't exist
        self.dim = len(nodes[0].get_embedding())

        if self._index_exists():
            if self._overwrite:
                self.delete_index()
                self._create_index()
            else:
                _logger.info(f"Adding document to existing index {self._index_name}")
        else:
            self._create_index()

        ids = []
        for node in nodes:
            attributes = {
                "id": node.node_id,
                "doc_id": node.ref_doc_id,
                "text": node.get_content(metadata_mode=MetadataMode.NONE),
            }
            metadata_dict = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )
            attributes.update(metadata_dict)

            ids.append(node.node_id)
            self._tair_client.tvs_hset(
                self._index_name,
                f"{node.ref_doc_id}#{node.node_id}",
                vector=node.get_embedding(),
                is_binary=False,
                **attributes,
            )

        _logger.info(f"Added {len(ids)} documents to index {self._index_name}")
        return ids
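
    # Usage sketch (illustrative; assumes the nodes' embeddings have already been
    # computed and that `vector_store` is a configured TairVectorStore):
    #
    #     from llama_index.legacy.schema import TextNode
    #
    #     nodes = [
    #         TextNode(text="hello tair", embedding=[0.1, 0.2, 0.3]),
    #         TextNode(text="vector stores", embedding=[0.3, 0.2, 0.1]),
    #     ]
    #     ids = vector_store.add(nodes)
    #
    # Each node is written under the Tair key "{ref_doc_id}#{node_id}", so all
    # nodes belonging to one document can later be scanned and deleted together.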

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete a document.

        Args:
            ref_doc_id (str): The doc_id of the document whose nodes are deleted.

        """
        iter = self._tair_client.tvs_scan(self._index_name, "%s#*" % ref_doc_id)
        for k in iter:
            self._tair_client.tvs_del(self._index_name, k)

    def delete_index(self) -> None:
        """Delete the index and all documents."""
        _logger.info(f"Deleting index {self._index_name}")
        self._tair_client.tvs_del_index(self._index_name)
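
    # Deletion sketch (illustrative): nodes are stored under keys of the form
    # "{ref_doc_id}#{node_id}", so delete() scans for "{ref_doc_id}#*" and removes
    # each matching key, while delete_index() drops the whole index at once.
    #
    #     vector_store.delete("my-ref-doc-id")   # remove one document's nodes
    #     vector_store.delete_index()            # remove the index entirely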

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query the index.

        Args:
            query (VectorStoreQuery): query object

        Returns:
            VectorStoreQueryResult: query result

        Raises:
            ValueError: If query.query_embedding is None.
        """
        filter_expr = None
        if query.filters is not None:
            filter_expr = _to_filter_expr(query.filters)

        if not query.query_embedding:
            raise ValueError("Query embedding is required for querying.")

        _logger.info(f"Querying index {self._index_name}")

        query_args = self._query_args
        if self._index_type == "HNSW" and "ef_search" in kwargs:
            query_args["ef_search"] = kwargs["ef_search"]

        results = self._tair_client.tvs_knnsearch(
            self._index_name,
            query.similarity_top_k,
            query.query_embedding,
            False,
            filter_str=filter_expr,
            **query_args,
        )
        results = [(k.decode(), float(s)) for k, s in results]

        ids = []
        nodes = []
        scores = []
        pipe = self._tair_client.pipeline(transaction=False)
        for key, score in results:
            scores.append(score)
            pipe.tvs_hmget(self._index_name, key, "id", "doc_id", "text")
        metadatas = pipe.execute()
        for i, m in enumerate(metadatas):
            # TODO: properly get the _node_content
            doc_id = m[0].decode()
            node = TextNode(
                text=m[2].decode(),
                id_=doc_id,
                embedding=None,
                relationships={
                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=m[1].decode())
                },
            )
            ids.append(doc_id)
            nodes.append(node)
        _logger.info(f"Found {len(nodes)} results for query with id {ids}")

        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)
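
    # Query sketch (illustrative; the 3-dimensional embedding is a stand-in for a
    # real query vector with the same dimension as the stored nodes):
    #
    #     from llama_index.legacy.vector_stores.types import (
    #         ExactMatchFilter,
    #         MetadataFilters,
    #         VectorStoreQuery,
    #     )
    #
    #     result = vector_store.query(
    #         VectorStoreQuery(
    #             query_embedding=[0.1, 0.2, 0.3],
    #             similarity_top_k=2,
    #             filters=MetadataFilters(
    #                 filters=[ExactMatchFilter(key="author", value="John")]
    #             ),
    #         ),
    #         ef_search=128,  # optional per-call override when the index is HNSW
    #     )
    #     for node, distance in zip(result.nodes, result.similarities):
    #         print(node.node_id, distance)
    #
    # With the default L2 metric, the returned "similarities" are distances, so
    # smaller values mean closer matches.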

    def _create_index(self) -> None:
        try:
            from tair import tairvector
        except ImportError:
            raise ValueError(
                "Could not import tair-py python package. "
                "Please install it with `pip install tair`."
            )
        _logger.info(f"Creating index {self._index_name}")
        self._tair_client.tvs_create_index(
            self._index_name,
            self.dim,
            distance_type=self._metric_type,
            index_type=self._index_type,
            data_type=tairvector.DataType.Float32,
            **self._index_args,
        )

    def _index_exists(self) -> bool:
        index = self._tair_client.tvs_get_index(self._index_name)
        return index is not None