# llama-index (legacy) — DashVector vector store (211 lines, 6.7 KB)
"""DashVector Vector Store."""
2
3import logging4from typing import Any, List, Optional, cast5
6from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode7from llama_index.legacy.vector_stores.types import (8MetadataFilters,9VectorStore,10VectorStoreQuery,11VectorStoreQueryMode,12VectorStoreQueryResult,13)
14from llama_index.legacy.vector_stores.utils import (15DEFAULT_DOC_ID_KEY,16DEFAULT_TEXT_KEY,17legacy_metadata_dict_to_node,18metadata_dict_to_node,19node_to_metadata_dict,20)
21
# Number of documents sent to DashVector in a single upsert request.
DEFAULT_BATCH_SIZE = 100

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
25
26def _to_dashvector_filter(27standard_filters: Optional[MetadataFilters] = None,28) -> Optional[str]:29"""Convert from standard filter to dashvector filter dict."""30if standard_filters is None:31return None32
33filters = []34for filter in standard_filters.legacy_filters():35if isinstance(filter.value, str):36value = f"'{filter.value}'"37else:38value = f"{filter.value}"39filters.append(f"{filter.key} = {value}")40return " and ".join(filters)41
42
class DashVectorStore(VectorStore):
    """DashVector Vector Store.

    In this vector store, embeddings and docs are stored within a
    DashVector collection.

    During query time, the index uses DashVector to query for the top
    k most similar nodes.

    Args:
        collection (Optional[dashvector.Collection]): DashVector collection instance
        support_sparse_vector (bool): whether support sparse vector for collection.
        encoder (Optional[dashtext.SparseVectorEncoder]): encoder for generating
            sparse vector from document
    """

    stores_text: bool = True
    flat_metadata: bool = True

    def __init__(
        self,
        collection: Optional[Any] = None,
        support_sparse_vector: bool = False,
        encoder: Optional[Any] = None,
    ) -> None:
        """Initialize params.

        Raises:
            ImportError: if `dashvector` (or `dashtext`, when sparse vectors
                are requested) is not installed.
        """
        try:
            import dashvector
        except ImportError:
            raise ImportError(
                "`dashvector` package not found, please run `pip install dashvector`"
            )

        if support_sparse_vector:
            try:
                import dashtext
            except ImportError:
                raise ImportError(
                    "`dashtext` package not found, please run `pip install dashtext`"
                )

            if encoder is None:
                encoder = dashtext.SparseVectorEncoder.default()
            encoder = cast(dashtext.SparseVectorEncoder, encoder)

        # Set these unconditionally so `add`/`query` can always read them,
        # even when sparse-vector support is disabled (`_encoder` stays None).
        self._support_sparse_vector = support_sparse_vector
        self._encoder = encoder

        if collection is not None:
            self._collection = cast(dashvector.Collection, collection)

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to vector store.

        Args:
            nodes (List[BaseNode]): list of nodes with embeddings

        Returns:
            List[str]: node ids of all added nodes.

        Raises:
            Exception: if a batch upsert is rejected by DashVector.
        """
        from dashvector import Doc

        # Upsert in fixed-size batches to bound per-request payload size.
        for i in range(0, len(nodes), DEFAULT_BATCH_SIZE):
            end = min(i + DEFAULT_BATCH_SIZE, len(nodes))
            docs = [
                Doc(
                    id=node.node_id,
                    vector=node.embedding,
                    # Sparse vector is derived from the node's embed-mode
                    # content only when sparse support was enabled.
                    sparse_vector=(
                        self._encoder.encode_documents(
                            node.get_content(metadata_mode=MetadataMode.EMBED)
                        )
                        if self._support_sparse_vector
                        else None
                    ),
                    fields=node_to_metadata_dict(
                        node, remove_text=False, flat_metadata=self.flat_metadata
                    ),
                )
                for node in nodes[i:end]
            ]

            resp = self._collection.upsert(docs)
            if not resp:
                raise Exception(f"Failed to upsert docs, error: {resp}")

        return [node.node_id for node in nodes]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        Raises:
            Exception: if the lookup query fails.
        """
        # Find every chunk whose doc-id field matches, then delete by id.
        doc_filter = f"{DEFAULT_DOC_ID_KEY}='{ref_doc_id}'"
        resp = self._collection.query(filter=doc_filter)
        if not resp:
            raise Exception(f"Failed to query doc by {doc_filter}")

        self._collection.delete(ids=[doc.id for doc in resp])

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query vector store.

        Args:
            query (VectorStoreQuery): dense/sparse/hybrid query spec.

        Returns:
            VectorStoreQueryResult: matched nodes with similarities and ids.

        Raises:
            Exception: if the DashVector query fails.
        """
        query_embedding = (
            [float(e) for e in query.query_embedding] if query.query_embedding else []
        )

        sparse_vector = None
        topk = query.similarity_top_k
        if (
            query.mode in (VectorStoreQueryMode.SPARSE, VectorStoreQueryMode.HYBRID)
            and self._support_sparse_vector
        ):
            sparse_vector = self._encoder.encode_queries(query.query_str)
            topk = query.hybrid_top_k or query.similarity_top_k

            # Only combine when alpha is given; kept inside the sparse branch
            # so combine_dense_and_sparse never receives a None sparse vector.
            if query.alpha is not None:
                from dashtext import combine_dense_and_sparse

                query_embedding, sparse_vector = combine_dense_and_sparse(
                    query_embedding, sparse_vector, query.alpha
                )

        filter_expr = _to_dashvector_filter(query.filters)
        rsp = self._collection.query(
            vector=query_embedding,
            sparse_vector=sparse_vector,
            topk=topk,
            filter=filter_expr,
            include_vector=True,
        )
        if not rsp:
            raise Exception(f"Failed to query docs, error: {rsp}")

        top_k_ids = []
        top_k_nodes = []
        top_k_scores = []
        for doc in rsp:
            try:
                node = metadata_dict_to_node(doc.fields)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                logger.debug("Failed to parse Node metadata, fallback to legacy logic.")
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    doc.fields
                )

                text = doc.fields[DEFAULT_TEXT_KEY]
                node = TextNode(
                    id_=doc.id,
                    text=text,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )
            top_k_ids.append(doc.id)
            top_k_nodes.append(node)
            top_k_scores.append(doc.score)

        return VectorStoreQueryResult(
            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
        )