llama-index
274 строки · 8.7 Кб
"""Tair Vector store index.

An index that is built on top of Alibaba Cloud's Tair database.
"""
5
6import logging7from typing import TYPE_CHECKING, Any, Dict, List, Optional8
9from llama_index.legacy.schema import (10BaseNode,11MetadataMode,12NodeRelationship,13RelatedNodeInfo,14TextNode,15)
16from llama_index.legacy.vector_stores.types import (17MetadataFilters,18VectorStore,19VectorStoreQuery,20VectorStoreQueryResult,21)
22from llama_index.legacy.vector_stores.utils import node_to_metadata_dict23
24_logger = logging.getLogger(__name__)25
26
27if TYPE_CHECKING:28from tair import Tair29
30
31def _to_filter_expr(filters: MetadataFilters) -> str:32conditions = []33for f in filters.legacy_filters():34value = str(f.value)35if isinstance(f.value, str):36value = '"' + value + '"'37conditions.append(f"{f.key}=={value}")38return "&&".join(conditions)39
40
class TairVectorStore(VectorStore):
    """Vector store backed by a Tair vector index.

    Each node is written under the key ``"{ref_doc_id}#{node_id}"`` together
    with its embedding and the attributes ``id``, ``doc_id``, ``text`` plus
    the node metadata. The index itself is created lazily on the first
    :meth:`add` call, using the dimension of the first node's embedding.
    """

    stores_text = True
    stores_node = True
    flat_metadata = False

    def __init__(
        self,
        tair_url: str,
        index_name: str,
        index_type: str = "HNSW",
        index_args: Optional[Dict[str, Any]] = None,
        overwrite: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize TairVectorStore.

        Two index types are available: FLAT & HNSW.

        index args for HNSW:
            - ef_construct
            - M
            - ef_search

        Detailed info for these arguments can be found here:
        https://www.alibabacloud.com/help/en/tair/latest/tairvector#section-c76-ull-5mk

        Args:
            index_name (str): Name of the index.
            index_type (str): Type of the index. Defaults to 'HNSW'.
            index_args (Dict[str, Any]): Arguments for the index. Only read
                when ``index_type`` is 'HNSW'. Defaults to None, in which
                case ``ef_construct=500``, ``M=24`` and ``ef_search=400``
                are used.
            tair_url (str): URL for the Tair instance.
            overwrite (bool): Whether to overwrite the index if it already exists.
                Defaults to False.
            kwargs (Any): Additional arguments to pass to the Tair client.

        Raises:
            ValueError: If tair-py is not installed
            ValueError: If failed to connect to Tair instance

        Examples:
            >>> from llama_index.legacy.vector_stores.tair import TairVectorStore
            >>> # Create a TairVectorStore
            >>> vector_store = TairVectorStore(
            ...     tair_url=(
            ...         "redis://{username}:{password}@r-bp****************."
            ...         "redis.rds.aliyuncs.com:{port}"
            ...     ),
            ...     index_name="my_index",
            ...     index_type="HNSW",
            ...     index_args={"M": 16, "ef_construct": 200},
            ...     overwrite=True)

        """
        try:
            from tair import Tair, tairvector  # noqa
        except ImportError as exc:
            raise ValueError(
                "Could not import tair-py python package. "
                "Please install it with `pip install tair`."
            ) from exc
        try:
            self._tair_client = Tair.from_url(tair_url, **kwargs)
        except ValueError as e:
            raise ValueError(f"Tair failed to connect: {e}") from e

        # index identifiers
        self._index_name = index_name
        self._index_type = index_type
        self._metric_type = "L2"
        self._overwrite = overwrite
        self._index_args: Dict[str, Any] = {}
        self._query_args: Dict[str, Any] = {}
        if index_type == "HNSW":
            hnsw_args = index_args or {}
            # Build-time parameters go to index creation, ef_search to queries.
            self._index_args = {
                "ef_construct": hnsw_args.get("ef_construct", 500),
                "M": hnsw_args.get("M", 24),
            }
            self._query_args = {"ef_search": hnsw_args.get("ef_search", 400)}

    @property
    def client(self) -> "Tair":
        """Return the Tair client instance."""
        return self._tair_client

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return ``value`` as ``str``, decoding it when the client returned bytes."""
        if isinstance(value, bytes):
            return value.decode()
        return str(value)

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to the index.

        The index is created if it does not exist. If it exists and the store
        was built with ``overwrite=True``, it is dropped and recreated on
        every call before the nodes are written.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings

        Returns:
            List[str]: List of ids of the documents added to the index.
        """
        # check to see if empty document list was passed
        if not nodes:
            return []

        # set vector dim for creation if index doesn't exist
        self.dim = len(nodes[0].get_embedding())

        if not self._index_exists():
            self._create_index()
        elif self._overwrite:
            self.delete_index()
            self._create_index()
        else:
            _logger.info("Adding document to existing index %s", self._index_name)

        ids = []
        for node in nodes:
            attributes = {
                "id": node.node_id,
                "doc_id": node.ref_doc_id,
                "text": node.get_content(metadata_mode=MetadataMode.NONE),
            }
            attributes.update(
                node_to_metadata_dict(
                    node, remove_text=True, flat_metadata=self.flat_metadata
                )
            )

            self._tair_client.tvs_hset(
                self._index_name,
                f"{node.ref_doc_id}#{node.node_id}",
                vector=node.get_embedding(),
                is_binary=False,
                **attributes,
            )
            ids.append(node.node_id)

        _logger.info("Added %d documents to index %s", len(ids), self._index_name)
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete every stored node that belongs to a document.

        Args:
            ref_doc_id (str): Id of the source document; all keys matching
                ``"{ref_doc_id}#*"`` are removed from the index.

        """
        matching_keys = self._tair_client.tvs_scan(
            self._index_name, f"{ref_doc_id}#*"
        )
        for key in matching_keys:
            self._tair_client.tvs_del(self._index_name, key)

    def delete_index(self) -> None:
        """Delete the index and all documents."""
        _logger.info("Deleting index %s", self._index_name)
        self._tair_client.tvs_del_index(self._index_name)

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query the index.

        Args:
            query (VectorStoreQuery): query object
            kwargs (Any): For HNSW indexes, ``ef_search`` overrides the
                configured value for this call only.

        Returns:
            VectorStoreQueryResult: query result

        Raises:
            ValueError: If query.query_embedding is None.
        """
        if not query.query_embedding:
            raise ValueError("Query embedding is required for querying.")

        filter_expr = None
        if query.filters is not None:
            filter_expr = _to_filter_expr(query.filters)

        _logger.info("Querying index %s", self._index_name)

        # Work on a copy: a per-call override must not change the defaults
        # used by later queries.
        query_args = dict(self._query_args)
        if self._index_type == "HNSW" and "ef_search" in kwargs:
            query_args["ef_search"] = kwargs["ef_search"]

        raw_hits = self._tair_client.tvs_knnsearch(
            self._index_name,
            query.similarity_top_k,
            query.query_embedding,
            False,
            filter_str=filter_expr,
            **query_args,
        )
        hits = [(self._as_text(key), float(score)) for key, score in raw_hits]

        # Fetch the stored attributes of all hits in one round trip.
        pipe = self._tair_client.pipeline(transaction=False)
        for key, _ in hits:
            pipe.tvs_hmget(self._index_name, key, "id", "doc_id", "text")
        rows = pipe.execute()

        ids = []
        nodes = []
        scores = []
        for (key, score), row in zip(hits, rows):
            # An entry can vanish between the search and the fetch; skip it
            # (keeping ids/nodes/scores aligned) instead of failing on None.
            if not row or any(value is None for value in row[:3]):
                _logger.warning(
                    "Skipping key %s in index %s: attributes are missing",
                    key,
                    self._index_name,
                )
                continue

            # TODO: properly get the _node_content
            node_id = self._as_text(row[0])
            source_doc_id = self._as_text(row[1])
            node = TextNode(
                text=self._as_text(row[2]),
                id_=node_id,
                embedding=None,
                relationships={
                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=source_doc_id)
                },
            )
            ids.append(node_id)
            nodes.append(node)
            scores.append(score)
        _logger.info("Found %d results for query with id %s", len(nodes), ids)

        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)

    def _create_index(self) -> None:
        """Create the Tair index using the stored name, type, metric and dimension.

        ``self.dim`` must have been set (``add`` does this) before calling.

        Raises:
            ValueError: If tair-py is not installed
        """
        try:
            from tair import tairvector
        except ImportError as exc:
            raise ValueError(
                "Could not import tair-py python package. "
                "Please install it with `pip install tair`."
            ) from exc
        _logger.info("Creating index %s", self._index_name)
        self._tair_client.tvs_create_index(
            self._index_name,
            self.dim,
            distance_type=self._metric_type,
            index_type=self._index_type,
            data_type=tairvector.DataType.Float32,
            **self._index_args,
        )

    def _index_exists(self) -> bool:
        """Return True when the client reports an index under the stored name."""
        index = self._tair_client.tvs_get_index(self._index_name)
        return index is not None