# llama-index
# 750 lines · 28.1 KB
1"""Azure AI Search vector store."""
2
3import enum4import json5import logging6from enum import auto7from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast8
9from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode10from llama_index.legacy.vector_stores.types import (11ExactMatchFilter,12MetadataFilters,13VectorStore,14VectorStoreQuery,15VectorStoreQueryMode,16VectorStoreQueryResult,17)
18from llama_index.legacy.vector_stores.utils import (19legacy_metadata_dict_to_node,20metadata_dict_to_node,21node_to_metadata_dict,22)
23
# Module-level logger, named after this module, shared by every class below.
logger = logging.getLogger(__name__)
26
class MetadataIndexFieldType(int, enum.Enum):
    """
    Supported index field types for metadata values.

    Each member names one of the scalar types a value in a flat metadata
    dictionary may be stored as in an Azure AI Search index. The matching
    EDM type name is noted next to each member.
    """

    STRING = enum.auto()  # "Edm.String"
    BOOLEAN = enum.auto()  # "Edm.Boolean"
    INT32 = enum.auto()  # "Edm.Int32"
    INT64 = enum.auto()  # "Edm.Int64"
    DOUBLE = enum.auto()  # "Edm.Double"
40
class IndexManagement(int, enum.Enum):
    """Index management operations the vector store can perform on start-up."""

    # Use the index as-is, without checking that it exists.
    NO_VALIDATION = enum.auto()
    # Fail if the named index does not exist.
    VALIDATE_INDEX = enum.auto()
    # Create the named index when it is missing.
    CREATE_IF_NOT_EXISTS = enum.auto()
48
class AzureAISearchVectorStore(VectorStore):
    """
    Vector store backed by an Azure AI Search index.

    Each node is stored as one index document holding the node id, its text
    chunk, its embedding, a JSON dump of its metadata and its reference
    doc id. Selected metadata values can additionally be copied into
    dedicated filterable index fields so that they can be used in filters.
    """

    stores_text: bool = True
    flat_metadata: bool = True

    def _normalise_metadata_to_index_fields(
        self,
        filterable_metadata_field_keys: Union[
            List[str],
            Dict[str, str],
            Dict[str, Tuple[str, MetadataIndexFieldType]],
            None,
        ] = None,
    ) -> Dict[str, Tuple[str, MetadataIndexFieldType]]:
        """
        Normalise the filterable metadata spec into one canonical mapping.

        Args:
            filterable_metadata_field_keys: One of
                - a list of metadata field names (index field name is the
                  same, type defaults to STRING),
                - a dict of metadata field name -> index field name
                  (type defaults to STRING),
                - a dict of metadata field name -> (index field name, type),
                - None (no filterable metadata fields).

        Returns:
            Mapping of metadata field name -> (index field name, field type).
            Empty when the argument is None or of an unrecognised type.
        """
        # NOTE: the default is None rather than a shared mutable list.
        index_field_spec: Dict[str, Tuple[str, MetadataIndexFieldType]] = {}

        if isinstance(filterable_metadata_field_keys, list):
            for field in filterable_metadata_field_keys:
                # Index field name and the metadata field name are the same.
                # Use String as the default index field type.
                index_field_spec[field] = (field, MetadataIndexFieldType.STRING)

        elif isinstance(filterable_metadata_field_keys, dict):
            for k, v in filterable_metadata_field_keys.items():
                if isinstance(v, tuple):
                    # Index field name and metadata field name may differ.
                    # The index field type used is as supplied.
                    index_field_spec[k] = v
                else:
                    # Index field name and metadata field name may differ.
                    # Use String as the default index field type.
                    index_field_spec[k] = (v, MetadataIndexFieldType.STRING)

        return index_field_spec

    def _create_index_if_not_exists(self, index_name: str) -> None:
        """Create ``index_name`` unless the index client already lists it."""
        if index_name not in self._index_client.list_index_names():
            logger.info(
                f"Index {index_name} does not exist in Azure AI Search, creating index"
            )
            self._create_index(index_name)

    def _create_metadata_index_fields(self) -> List[Any]:
        """
        Create a list of index fields for storing metadata values.

        Returns:
            One filterable ``SimpleField`` per entry of the metadata to index
            field map.

        Raises:
            ValueError: If an entry uses a field type that is not a
                ``MetadataIndexFieldType`` value.
        """
        from azure.search.documents.indexes.models import SimpleField

        edm_type_by_field_type = {
            MetadataIndexFieldType.STRING: "Edm.String",
            MetadataIndexFieldType.INT32: "Edm.Int32",
            MetadataIndexFieldType.INT64: "Edm.Int64",
            MetadataIndexFieldType.DOUBLE: "Edm.Double",
            MetadataIndexFieldType.BOOLEAN: "Edm.Boolean",
        }

        index_fields = []

        # create search fields
        for field_name, field_type in self._metadata_to_index_field_map.values():
            try:
                # Going through the enum constructor also accepts plain ints
                # equal to a member value, as the former `==` checks did.
                index_field_type = edm_type_by_field_type[
                    MetadataIndexFieldType(field_type)
                ]
            except ValueError as err:
                # Previously an unknown type left the EDM type unbound, or
                # silently reused the type of the preceding field.
                raise ValueError(
                    f"Unsupported metadata index field type {field_type!r} "
                    f"for index field '{field_name}'"
                ) from err

            field = SimpleField(name=field_name, type=index_field_type, filterable=True)
            index_fields.append(field)

        return index_fields

    def _create_index(self, index_name: Optional[str]) -> None:
        """
        Creates a default index based on the supplied index name, key field names and
        metadata filtering keys.

        The index contains the id, chunk, embedding, metadata and doc_id
        fields named by the field mapping, one filterable field per
        configured metadata key, an HNSW and an exhaustive KNN vector search
        configuration (both cosine) and a semantic configuration named
        "mySemanticConfig" prioritising the chunk field.
        """
        from azure.search.documents.indexes.models import (
            ExhaustiveKnnAlgorithmConfiguration,
            ExhaustiveKnnParameters,
            HnswAlgorithmConfiguration,
            HnswParameters,
            SearchableField,
            SearchField,
            SearchFieldDataType,
            SearchIndex,
            SemanticConfiguration,
            SemanticField,
            SemanticPrioritizedFields,
            SemanticSearch,
            SimpleField,
            VectorSearch,
            VectorSearchAlgorithmKind,
            VectorSearchAlgorithmMetric,
            VectorSearchProfile,
        )

        logger.info(f"Configuring {index_name} fields for Azure AI Search")
        fields = [
            SimpleField(name=self._field_mapping["id"], type="Edm.String", key=True),
            SearchableField(
                name=self._field_mapping["chunk"],
                type="Edm.String",
                analyzer_name="en.microsoft",
            ),
            SearchField(
                name=self._field_mapping["embedding"],
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=self.embedding_dimensionality,
                # Must name a profile declared in `vector_search` below; the
                # former value "default" matched none of them.
                vector_search_profile_name="myHnswProfile",
            ),
            SimpleField(name=self._field_mapping["metadata"], type="Edm.String"),
            SimpleField(
                name=self._field_mapping["doc_id"], type="Edm.String", filterable=True
            ),
        ]
        logger.info(f"Configuring {index_name} metadata fields")
        metadata_index_fields = self._create_metadata_index_fields()
        fields.extend(metadata_index_fields)
        logger.info(f"Configuring {index_name} vector search")
        # Configure the vector search algorithms and profiles
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    kind=VectorSearchAlgorithmKind.HNSW,
                    # For more information on HNSW parameters, visit
                    # https://learn.microsoft.com/azure/search/vector-search-ranking#creating-the-hnsw-graph
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="myExhaustiveKnn",
                    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw",
                ),
                # Add more profiles if needed
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="myExhaustiveKnn",
                ),
                # Add more profiles if needed
            ],
        )
        logger.info(f"Configuring {index_name} semantic search")
        semantic_config = SemanticConfiguration(
            name="mySemanticConfig",
            prioritized_fields=SemanticPrioritizedFields(
                content_fields=[SemanticField(field_name=self._field_mapping["chunk"])],
            ),
        )

        semantic_search = SemanticSearch(configurations=[semantic_config])

        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search,
        )
        logger.debug(f"Creating {index_name} search index")
        self._index_client.create_index(index)

    def _validate_index(self, index_name: Optional[str]) -> None:
        """
        Check that ``index_name`` exists.

        The check is only performed when both an index client and an index
        name are available; otherwise this is a no-op.

        Raises:
            ValueError: If the index client does not list ``index_name``.
        """
        if self._index_client and index_name:
            if index_name not in self._index_client.list_index_names():
                raise ValueError(
                    f"Validation failed, index {index_name} does not exist."
                )

    def __init__(
        self,
        search_or_index_client: Any,
        id_field_key: str,
        chunk_field_key: str,
        embedding_field_key: str,
        metadata_string_field_key: str,
        doc_id_field_key: str,
        filterable_metadata_field_keys: Optional[
            Union[
                List[str],
                Dict[str, str],
                Dict[str, Tuple[str, MetadataIndexFieldType]],
            ]
        ] = None,
        index_name: Optional[str] = None,
        index_mapping: Optional[
            Callable[[Dict[str, str], Dict[str, Any]], Dict[str, str]]
        ] = None,
        index_management: IndexManagement = IndexManagement.NO_VALIDATION,
        embedding_dimensionality: int = 1536,
        **kwargs: Any,
    ) -> None:
        # ruff: noqa: E501
        """
        Embeddings and documents are stored in an Azure AI Search index,
        a merge or upload approach is used when adding embeddings.
        When adding multiple embeddings the index is updated by this vector store
        in batches of 10 documents, very large nodes may result in failure due to
        the batch byte size being exceeded.

        Args:
            search_or_index_client (azure.search.documents.SearchClient or
                azure.search.documents.indexes.SearchIndexClient):
                Client for the index to be populated / queried.
            id_field_key (str): Index field storing the id
            chunk_field_key (str): Index field storing the node text
            embedding_field_key (str): Index field storing the embedding vector
            metadata_string_field_key (str):
                Index field storing node metadata as a json string.
                Schema is arbitrary, to filter on metadata values they must be stored
                as separate fields in the index, use filterable_metadata_field_keys
                to specify the metadata values that should be stored in these filterable fields
            doc_id_field_key (str): Index field storing doc_id
            filterable_metadata_field_keys:
                Metadata fields to store in dedicated filterable index fields.
                Either a list of metadata field names, a dict of metadata
                field name to index field name, or a dict of metadata field
                name to (index field name, MetadataIndexFieldType).
            index_name (Optional[str]):
                Name of the index. Required with a SearchIndexClient and not
                allowed with a SearchClient.
            index_mapping:
                Optional function with definition
                (enriched_doc: Dict[str, str], metadata: Dict[str, Any]): Dict[str,str]
                used to map document fields to the AI search index fields
                (return value of function).
                If none is specified a default mapping is provided which uses
                the field keys. The keys in the enriched_doc are
                ["id", "chunk", "embedding", "metadata", "doc_id"]
                The default mapping is:
                    - "id" to id_field_key
                    - "chunk" to chunk_field_key
                    - "embedding" to embedding_field_key
                    - "metadata" to metadata_string_field_key
                    - "doc_id" to doc_id_field_key
            index_management (IndexManagement):
                Whether to leave the index unchecked, validate that it
                exists, or create it when missing.
            embedding_dimensionality (int):
                Dimension count used for the embedding field when the index
                is created by this store.
            **kwargs (Any): Additional keyword arguments.

        Raises:
            ImportError: Unable to import `azure-search-documents`
            ValueError: If `search_or_index_client` is not provided
            ValueError: If `search_or_index_client` is neither a SearchClient
                nor a SearchIndexClient
            ValueError: If `index_name` is not provided and `search_or_index_client`
                is of type azure.search.documents.SearchIndexClient
            ValueError: If `index_name` is provided and `search_or_index_client`
                is of type azure.search.documents.SearchClient
            ValueError: If `index_management` is
                IndexManagement.CREATE_IF_NOT_EXISTS and
                `search_or_index_client` is of type azure.search.documents.SearchClient
        """
        import_err_msg = (
            "`azure-search-documents` package not found, please run "
            "`pip install azure-search-documents==11.4.0`"
        )

        try:
            import azure.search.documents  # noqa
            from azure.search.documents import SearchClient
            from azure.search.documents.indexes import SearchIndexClient
        except ImportError as err:
            # Chain the cause so the underlying import failure stays visible.
            raise ImportError(import_err_msg) from err

        self._index_client: SearchIndexClient = cast(SearchIndexClient, None)
        self._search_client: SearchClient = cast(SearchClient, None)
        self.embedding_dimensionality = embedding_dimensionality

        # Validate search_or_index_client
        if search_or_index_client is None:
            raise ValueError("search_or_index_client not specified")

        if isinstance(search_or_index_client, SearchIndexClient):
            # If SearchIndexClient is supplied so must index_name
            self._index_client = cast(SearchIndexClient, search_or_index_client)

            if not index_name:
                raise ValueError(
                    "index_name must be supplied if search_or_index_client is of "
                    "type azure.search.documents.SearchIndexClient"
                )

            self._search_client = self._index_client.get_search_client(
                index_name=index_name
            )

        elif isinstance(search_or_index_client, SearchClient):
            self._search_client = cast(SearchClient, search_or_index_client)

            # Validate index_name
            if index_name:
                raise ValueError(
                    "index_name cannot be supplied if search_or_index_client "
                    "is of type azure.search.documents.SearchClient"
                )

        if not self._index_client and not self._search_client:
            raise ValueError(
                "search_or_index_client must be of type "
                "azure.search.documents.SearchClient or "
                "azure.search.documents.SearchIndexClient"
            )

        if (
            index_management == IndexManagement.CREATE_IF_NOT_EXISTS
            and not self._index_client
        ):
            raise ValueError(
                "index_management has value of IndexManagement.CREATE_IF_NOT_EXISTS "
                "but search_or_index_client is not of type "
                "azure.search.documents.SearchIndexClient"
            )

        self._index_management = index_management

        # Default field mapping
        field_mapping = {
            "id": id_field_key,
            "chunk": chunk_field_key,
            "embedding": embedding_field_key,
            "metadata": metadata_string_field_key,
            "doc_id": doc_id_field_key,
        }

        self._field_mapping = field_mapping

        self._index_mapping = (
            self._default_index_mapping if index_mapping is None else index_mapping
        )

        self._metadata_to_index_field_map = self._normalise_metadata_to_index_fields(
            filterable_metadata_field_keys
        )

        if self._index_management == IndexManagement.CREATE_IF_NOT_EXISTS:
            if index_name:
                self._create_index_if_not_exists(index_name)

        if self._index_management == IndexManagement.VALIDATE_INDEX:
            self._validate_index(index_name)

    @property
    def client(self) -> Any:
        """Get client."""
        return self._search_client

    def _default_index_mapping(
        self, enriched_doc: Dict[str, str], metadata: Dict[str, Any]
    ) -> Dict[str, str]:
        """
        Map an enriched document onto the configured index field names.

        Args:
            enriched_doc: Document keyed by the logical field names used in
                the field mapping ("id", "chunk", "embedding", "metadata",
                "doc_id").
            metadata: Node metadata; values for configured filterable keys
                are copied into their dedicated index fields.

        Returns:
            Document keyed by index field names.
        """
        index_doc: Dict[str, str] = {}

        for field in self._field_mapping:
            index_doc[self._field_mapping[field]] = enriched_doc[field]

        for metadata_field_name, (
            index_field_name,
            _,
        ) in self._metadata_to_index_field_map.items():
            metadata_value = metadata.get(metadata_field_name)
            # Only skip missing values: falsy ones such as 0, False or ""
            # are legitimate and must remain filterable.
            if metadata_value is not None:
                index_doc[index_field_name] = metadata_value

        return index_doc

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Add nodes to index associated with the configured search client.

        Documents are sent with merge-or-upload semantics in batches of 10.

        Args:
            nodes: List[BaseNode]: nodes with embeddings

        Returns:
            The node ids, in input order.

        Raises:
            ValueError: If the search client is not initialized.
        """
        if not self._search_client:
            raise ValueError("Search client not initialized")

        documents = []
        ids = []

        for node in nodes:
            logger.debug(f"Processing embedding: {node.node_id}")
            ids.append(node.node_id)

            index_document = self._create_index_document(node)

            documents.append(index_document)

            if len(documents) >= 10:
                logger.info(
                    f"Uploading batch of size {len(documents)}, "
                    f"current progress {len(ids)} of {len(nodes)}"
                )
                self._search_client.merge_or_upload_documents(documents)
                documents = []

        # Upload remaining batch of less than 10 documents
        if len(documents) > 0:
            logger.info(
                f"Uploading remaining batch of size {len(documents)}, "
                f"current progress {len(ids)} of {len(nodes)}"
            )
            self._search_client.merge_or_upload_documents(documents)
            documents = []

        return ids

    def _create_index_document(self, node: BaseNode) -> Dict[str, Any]:
        """Create AI Search index document from embedding result."""
        doc: Dict[str, Any] = {}
        doc["id"] = node.node_id
        doc["chunk"] = node.get_content(metadata_mode=MetadataMode.NONE) or ""
        doc["embedding"] = node.get_embedding()
        doc["doc_id"] = node.ref_doc_id

        node_metadata = node_to_metadata_dict(
            node,
            remove_text=True,
            flat_metadata=self.flat_metadata,
        )

        doc["metadata"] = json.dumps(node_metadata)

        return self._index_mapping(doc, node_metadata)

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete documents from the AI Search Index
        with doc_id_field_key field equal to ref_doc_id.
        """
        # Single quotes are doubled, as in `_create_odata_filter`, so an id
        # containing a quote cannot break or alter the filter expression.
        escaped_ref_doc_id = str(ref_doc_id).replace("'", "''")

        # Locate documents to delete
        odata_filter = f'{self._field_mapping["doc_id"]} eq \'{escaped_ref_doc_id}\''
        results = self._search_client.search(search_text="*", filter=odata_filter)

        logger.debug(f"Searching with filter {odata_filter}")

        docs_to_delete = []
        for result in results:
            doc = {}
            doc["id"] = result[self._field_mapping["id"]]
            logger.debug(f"Found document to delete: {doc}")
            docs_to_delete.append(doc)

        if len(docs_to_delete) > 0:
            logger.debug(f"Deleting {len(docs_to_delete)} documents")
            self._search_client.delete_documents(docs_to_delete)

    def _create_odata_filter(self, metadata_filters: MetadataFilters) -> str:
        """
        Generate an OData filter string using supplied metadata filters.

        All filters are combined with ``and``.

        Raises:
            NotImplementedError: If a filter is not an ``ExactMatchFilter``.
            ValueError: If a filter key has no mapping to an index field.
        """
        clauses: List[str] = []
        for f in metadata_filters.legacy_filters():
            if not isinstance(f, ExactMatchFilter):
                raise NotImplementedError(
                    "Only `ExactMatchFilter` filters are supported"
                )

            # Raise error if filtering on a metadata field that lacks a mapping to
            # an index field
            metadata_mapping = self._metadata_to_index_field_map.get(f.key)

            if not metadata_mapping:
                raise ValueError(
                    f"Metadata field '{f.key}' is missing a mapping to an index field, "
                    "provide entry in 'filterable_metadata_field_keys' for this "
                    "vector store"
                )

            index_field = metadata_mapping[0]

            if isinstance(f.value, str):
                escaped_value = f.value.replace("'", "''")
                clauses.append(f"{index_field} eq '{escaped_value}'")
            elif isinstance(f.value, bool):
                # OData boolean literals are lowercase; Python renders
                # True/False.
                clauses.append(f"{index_field} eq {str(f.value).lower()}")
            else:
                clauses.append(f"{index_field} eq {f.value}")

        odata_expr = " and ".join(clauses)

        logger.info(f"Odata filter: {odata_expr}")

        return odata_expr

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """
        Run ``query`` against the index.

        SPARSE, HYBRID and SEMANTIC_HYBRID modes each use their own search
        strategy; every other mode uses the default vector search.
        """
        odata_filter = None
        if query.filters is not None:
            odata_filter = self._create_odata_filter(query.filters)

        # `==` comparisons are kept (rather than a dict lookup) so that a
        # mode equal to, but not identical with, an enum member still matches.
        search_cls: Any = AzureQueryResultSearchDefault
        if query.mode == VectorStoreQueryMode.SPARSE:
            search_cls = AzureQueryResultSearchSparse
        elif query.mode == VectorStoreQueryMode.HYBRID:
            search_cls = AzureQueryResultSearchHybrid
        elif query.mode == VectorStoreQueryMode.SEMANTIC_HYBRID:
            search_cls = AzureQueryResultSearchSemanticHybrid

        azure_query_result_search = search_cls(
            query, self._field_mapping, odata_filter, self._search_client
        )
        return azure_query_result_search.search()
550
class AzureQueryResultSearchBase:
    """
    Base search strategy: runs one search and converts the hits to nodes.

    Subclasses override `_create_search_query` and / or
    `_create_query_vector` to choose the search text and vector queries.
    """

    def __init__(
        self,
        query: VectorStoreQuery,
        field_mapping: Dict[str, str],
        odata_filter: Optional[str],
        search_client: Any,
    ) -> None:
        """Store the query, field mapping, filter and client for `search`."""
        self._search_client = search_client
        self._odata_filter = odata_filter
        self._field_mapping = field_mapping
        self._query = query

    @property
    def _select_fields(self) -> List[str]:
        """Index field names requested for every hit."""
        return [
            self._field_mapping[key] for key in ("id", "chunk", "metadata", "doc_id")
        ]

    def _create_search_query(self) -> str:
        """Search text to send; the base strategy matches everything."""
        return "*"

    def _create_query_vector(self) -> Optional[List[Any]]:
        """Vector queries to send; the base strategy sends none."""
        return None

    def _node_from_hit(
        self, node_id: Any, metadata: Dict[str, Any], chunk: Any
    ) -> Any:
        """Rebuild a node from stored metadata, falling back to the legacy format."""
        try:
            node = metadata_dict_to_node(metadata)
            node.set_content(chunk)
        except Exception:
            # NOTE: deprecated legacy logic for backward compatibility
            metadata, node_info, relationships = legacy_metadata_dict_to_node(
                metadata
            )

            node = TextNode(
                text=chunk,
                id_=node_id,
                metadata=metadata,
                start_char_idx=node_info.get("start", None),
                end_char_idx=node_info.get("end", None),
                relationships=relationships,
            )
        return node

    def _create_query_result(
        self, search_query: str, vectors: Optional[List[Any]]
    ) -> VectorStoreQueryResult:
        """Execute the search and collect nodes, scores and ids in hit order."""
        hits = self._search_client.search(
            search_text=search_query,
            vector_queries=vectors,
            top=self._query.similarity_top_k,
            select=self._select_fields,
            filter=self._odata_filter,
        )

        found_ids: List[Any] = []
        found_nodes: List[Any] = []
        found_scores: List[Any] = []
        for hit in hits:
            node_id = hit[self._field_mapping["id"]]
            metadata = json.loads(hit[self._field_mapping["metadata"]])
            score = hit["@search.score"]
            chunk = hit[self._field_mapping["chunk"]]

            node = self._node_from_hit(node_id, metadata, chunk)

            logger.debug(f"Retrieved node id {node_id} with node data of {node}")

            found_ids.append(node_id)
            found_nodes.append(node)
            found_scores.append(score)

        logger.debug(
            f"Search query '{search_query}' returned {len(found_ids)} results."
        )

        return VectorStoreQueryResult(
            nodes=found_nodes, similarities=found_scores, ids=found_ids
        )

    def search(self) -> VectorStoreQueryResult:
        """Build the search text and vector queries, then run the search."""
        text = self._create_search_query()
        vector_queries = self._create_query_vector()
        return self._create_query_result(text, vector_queries)
636
class AzureQueryResultSearchDefault(AzureQueryResultSearchBase):
    """Pure vector search using the query embedding."""

    def _create_query_vector(self) -> Optional[List[Any]]:
        """
        Build the vector query from the query embedding.

        Raises:
            ValueError: If the query carries no embedding.
        """
        from azure.search.documents.models import VectorizedQuery

        embedding = self._query.query_embedding
        if not embedding:
            raise ValueError("Query missing embedding")

        result = [
            VectorizedQuery(
                vector=embedding,
                k_nearest_neighbors=self._query.similarity_top_k,
                fields=self._field_mapping["embedding"],
            )
        ]
        logger.info("Vector search with supplied embedding")
        return result
654
class AzureQueryResultSearchSparse(AzureQueryResultSearchBase):
    """Text-only search using the query string."""

    def _create_search_query(self) -> str:
        """
        Return the query string as the search text.

        Raises:
            ValueError: If the query string is None.
        """
        text = self._query.query_str
        if text is None:
            raise ValueError("Query missing query string")

        logger.info(f"Hybrid search with search text: {text}")
        return text
665
class AzureQueryResultSearchHybrid(
    AzureQueryResultSearchDefault, AzureQueryResultSearchSparse
):
    """Hybrid search: query string as search text plus the embedding as vector query."""

    def _create_search_query(self) -> str:
        """Delegate explicitly to the sparse strategy for the search text."""
        return AzureQueryResultSearchSparse._create_search_query(self)

    def _create_query_vector(self) -> Optional[List[Any]]:
        """Delegate explicitly to the default strategy for the vector query."""
        return AzureQueryResultSearchDefault._create_query_vector(self)
675
class AzureQueryResultSearchSemanticHybrid(AzureQueryResultSearchHybrid):
    """Hybrid search with semantic ranking; similarities are the reranker scores."""

    def _create_query_vector(self) -> Optional[List[Any]]:
        """
        Build the vector query from the query embedding.

        Raises:
            ValueError: If the query carries no embedding.
        """
        from azure.search.documents.models import VectorizedQuery

        embedding = self._query.query_embedding
        if not embedding:
            raise ValueError("Query missing embedding")

        # k is set to 50 to align with the number of accept document in azure semantic reranking model.
        # https://learn.microsoft.com/azure/search/semantic-search-overview
        result = [
            VectorizedQuery(
                vector=embedding,
                k_nearest_neighbors=50,
                fields=self._field_mapping["embedding"],
            )
        ]
        logger.info("Vector search with supplied embedding")
        return result

    def _create_query_result(
        self, search_query: str, vector_queries: Optional[List[Any]]
    ) -> VectorStoreQueryResult:
        """Run a semantic search and collect nodes, reranker scores and ids."""
        hits = self._search_client.search(
            search_text=search_query,
            vector_queries=vector_queries,
            top=self._query.similarity_top_k,
            select=self._select_fields,
            filter=self._odata_filter,
            query_type="semantic",
            semantic_configuration_name="mySemanticConfig",
        )

        found_ids: List[Any] = []
        found_nodes: List[Any] = []
        found_scores: List[Any] = []
        for hit in hits:
            node_id = hit[self._field_mapping["id"]]
            metadata = json.loads(hit[self._field_mapping["metadata"]])
            # use reranker_score instead of score
            score = hit["@search.reranker_score"]
            chunk = hit[self._field_mapping["chunk"]]

            try:
                node = metadata_dict_to_node(metadata)
                node.set_content(chunk)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata
                )

                node = TextNode(
                    text=chunk,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            logger.debug(f"Retrieved node id {node_id} with node data of {node}")

            found_ids.append(node_id)
            found_nodes.append(node)
            found_scores.append(score)

        logger.debug(
            f"Search query '{search_query}' returned {len(found_ids)} results."
        )

        return VectorStoreQueryResult(
            nodes=found_nodes, similarities=found_scores, ids=found_ids
        )
749
# Alias exposing the vector store under the name CognitiveSearchVectorStore.
CognitiveSearchVectorStore = AzureAISearchVectorStore