llama-index

Форк
0
750 строк · 28.1 Кб
1
"""Azure AI Search vector store."""
2

3
import enum
4
import json
5
import logging
6
from enum import auto
7
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
8

9
from llama_index.legacy.schema import BaseNode, MetadataMode, TextNode
10
from llama_index.legacy.vector_stores.types import (
11
    ExactMatchFilter,
12
    MetadataFilters,
13
    VectorStore,
14
    VectorStoreQuery,
15
    VectorStoreQueryMode,
16
    VectorStoreQueryResult,
17
)
18
from llama_index.legacy.vector_stores.utils import (
19
    legacy_metadata_dict_to_node,
20
    metadata_dict_to_node,
21
    node_to_metadata_dict,
22
)
23

24
logger = logging.getLogger(__name__)
25

26

27
class MetadataIndexFieldType(int, enum.Enum):
    """Supported types for filterable metadata fields in an Azure AI Search
    index.

    Members correspond one-to-one with the EDM scalar types that a flat
    metadata dictionary value can be stored as.
    """

    STRING = 1  # "Edm.String"
    BOOLEAN = 2  # "Edm.Boolean"
    INT32 = 3  # "Edm.Int32"
    INT64 = 4  # "Edm.Int64"
    DOUBLE = 5  # "Edm.Double"
39

40

41
class IndexManagement(int, enum.Enum):
    """Index management strategies applied when the vector store is built."""

    NO_VALIDATION = 1
    VALIDATE_INDEX = 2
    CREATE_IF_NOT_EXISTS = 3
47

48

49
class AzureAISearchVectorStore(VectorStore):
    """Vector store backed by an Azure AI Search index.

    Node text, embeddings, a JSON metadata blob, and the source doc id are
    written to configurable index fields; optional filterable metadata fields
    may be declared via ``filterable_metadata_field_keys``.
    """

    # Node text is persisted in the index alongside the embedding.
    stores_text: bool = True
    # Metadata is serialized via node_to_metadata_dict with flat_metadata=True,
    # so nested metadata dictionaries are not supported.
    flat_metadata: bool = True
52

53
    def _normalise_metadata_to_index_fields(
        self,
        filterable_metadata_field_keys: Union[
            List[str],
            Dict[str, str],
            Dict[str, Tuple[str, MetadataIndexFieldType]],
            None,
        ] = None,
    ) -> Dict[str, Tuple[str, MetadataIndexFieldType]]:
        """Normalise the user-supplied filterable-field spec to a single shape.

        Args:
            filterable_metadata_field_keys: Either a list of metadata keys
                (index field name mirrors the metadata key, type defaults to
                STRING), a dict mapping metadata key -> index field name
                (type defaults to STRING), a dict mapping metadata key ->
                (index field name, MetadataIndexFieldType), or None.

        Returns:
            Mapping of metadata key -> (index field name, index field type).
        """
        # NOTE: the default was previously a mutable `[]`; use None to avoid
        # the shared-mutable-default pitfall (behavior is unchanged — both
        # produce an empty mapping).
        index_field_spec: Dict[str, Tuple[str, MetadataIndexFieldType]] = {}

        if isinstance(filterable_metadata_field_keys, list):
            for field in filterable_metadata_field_keys:
                # Index field name and the metadata field name are the same
                # Use String as the default index field type
                index_field_spec[field] = (field, MetadataIndexFieldType.STRING)

        elif isinstance(filterable_metadata_field_keys, dict):
            for k, v in filterable_metadata_field_keys.items():
                if isinstance(v, tuple):
                    # Index field name and metadata field name may differ
                    # The index field type used is as supplied
                    index_field_spec[k] = v
                else:
                    # Index field name and metadata field name may differ
                    # Use String as the default index field type
                    index_field_spec[k] = (v, MetadataIndexFieldType.STRING)

        return index_field_spec
82

83
    def _create_index_if_not_exists(self, index_name: str) -> None:
        """Create the index only when the service does not already have it."""
        existing_names = self._index_client.list_index_names()
        if index_name in existing_names:
            return
        logger.info(
            f"Index {index_name} does not exist in Azure AI Search, creating index"
        )
        self._create_index(index_name)
89

90
    def _create_metadata_index_fields(self) -> List[Any]:
        """Create a list of index fields for storing metadata values."""
        from azure.search.documents.indexes.models import SimpleField

        # Map each supported metadata type onto its EDM type name.
        edm_type_by_field_type = {
            MetadataIndexFieldType.STRING: "Edm.String",
            MetadataIndexFieldType.BOOLEAN: "Edm.Boolean",
            MetadataIndexFieldType.INT32: "Edm.Int32",
            MetadataIndexFieldType.INT64: "Edm.Int64",
            MetadataIndexFieldType.DOUBLE: "Edm.Double",
        }

        # One filterable SimpleField per configured metadata mapping.
        return [
            SimpleField(
                name=field_name,
                type=edm_type_by_field_type[field_type],
                filterable=True,
            )
            for field_name, field_type in self._metadata_to_index_field_map.values()
        ]
115

116
    def _create_index(self, index_name: Optional[str]) -> None:
        """
        Creates a default index based on the supplied index name, key field names and
        metadata filtering keys.

        The index gets an HNSW and an exhaustive-KNN vector algorithm, a vector
        profile for each, and a semantic configuration named "mySemanticConfig"
        (referenced by the semantic-hybrid query path).
        """
        from azure.search.documents.indexes.models import (
            ExhaustiveKnnAlgorithmConfiguration,
            ExhaustiveKnnParameters,
            HnswAlgorithmConfiguration,
            HnswParameters,
            SearchableField,
            SearchField,
            SearchFieldDataType,
            SearchIndex,
            SemanticConfiguration,
            SemanticField,
            SemanticPrioritizedFields,
            SemanticSearch,
            SimpleField,
            VectorSearch,
            VectorSearchAlgorithmKind,
            VectorSearchAlgorithmMetric,
            VectorSearchProfile,
        )

        logger.info(f"Configuring {index_name} fields for Azure AI Search")
        fields = [
            SimpleField(name=self._field_mapping["id"], type="Edm.String", key=True),
            SearchableField(
                name=self._field_mapping["chunk"],
                type="Edm.String",
                analyzer_name="en.microsoft",
            ),
            SearchField(
                name=self._field_mapping["embedding"],
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=self.embedding_dimensionality,
                # BUG FIX: this previously referenced a profile named "default",
                # but only "myHnswProfile" and "myExhaustiveKnnProfile" are
                # defined below, so index creation was rejected by the service.
                vector_search_profile_name="myHnswProfile",
            ),
            SimpleField(name=self._field_mapping["metadata"], type="Edm.String"),
            SimpleField(
                name=self._field_mapping["doc_id"], type="Edm.String", filterable=True
            ),
        ]
        logger.info(f"Configuring {index_name} metadata fields")
        metadata_index_fields = self._create_metadata_index_fields()
        fields.extend(metadata_index_fields)
        logger.info(f"Configuring {index_name} vector search")
        # Configure the vector search algorithms and profiles
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    kind=VectorSearchAlgorithmKind.HNSW,
                    # For more information on HNSw parameters, visit https://learn.microsoft.com//azure/search/vector-search-ranking#creating-the-hnsw-graph
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="myExhaustiveKnn",
                    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw",
                ),
                # Add more profiles if needed
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="myExhaustiveKnn",
                ),
                # Add more profiles if needed
            ],
        )
        logger.info(f"Configuring {index_name} semantic search")
        semantic_config = SemanticConfiguration(
            name="mySemanticConfig",
            prioritized_fields=SemanticPrioritizedFields(
                content_fields=[SemanticField(field_name=self._field_mapping["chunk"])],
            ),
        )

        semantic_search = SemanticSearch(configurations=[semantic_config])

        index = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search,
        )
        logger.debug(f"Creating {index_name} search index")
        self._index_client.create_index(index)
218

219
    def _validate_index(self, index_name: Optional[str]) -> None:
        """Raise ValueError when the named index is absent from the service.

        A missing index client or index name is silently accepted, matching
        the permissive behaviour expected by the constructor.
        """
        if not (self._index_client and index_name):
            return
        known_indexes = self._index_client.list_index_names()
        if index_name not in known_indexes:
            raise ValueError(f"Validation failed, index {index_name} does not exist.")
225

226
    def __init__(
        self,
        search_or_index_client: Any,
        id_field_key: str,
        chunk_field_key: str,
        embedding_field_key: str,
        metadata_string_field_key: str,
        doc_id_field_key: str,
        filterable_metadata_field_keys: Optional[
            Union[
                List[str],
                Dict[str, str],
                Dict[str, Tuple[str, MetadataIndexFieldType]],
            ]
        ] = None,
        index_name: Optional[str] = None,
        index_mapping: Optional[
            Callable[[Dict[str, str], Dict[str, Any]], Dict[str, str]]
        ] = None,
        index_management: IndexManagement = IndexManagement.NO_VALIDATION,
        embedding_dimensionality: int = 1536,
        **kwargs: Any,
    ) -> None:
        # ruff: noqa: E501
        """
        Embeddings and documents are stored in an Azure AI Search index,
        a merge or upload approach is used when adding embeddings.
        When adding multiple embeddings the index is updated by this vector store
        in batches of 10 documents, very large nodes may result in failure due to
        the batch byte size being exceeded.

        Args:
            search_or_index_client (azure.search.documents.SearchClient | azure.search.documents.indexes.SearchIndexClient):
                Client for the index to be populated / queried. Supplying a
                SearchIndexClient requires index_name and enables index
                creation; supplying a SearchClient forbids index_name.
            id_field_key (str): Index field storing the id
            chunk_field_key (str): Index field storing the node text
            embedding_field_key (str): Index field storing the embedding vector
            metadata_string_field_key (str):
                Index field storing node metadata as a json string.
                Schema is arbitrary, to filter on metadata values they must be stored
                as separate fields in the index, use filterable_metadata_field_keys
                to specify the metadata values that should be stored in these filterable fields
            doc_id_field_key (str): Index field storing doc_id
            filterable_metadata_field_keys:
                Optional spec of metadata keys to surface as filterable index
                fields; see _normalise_metadata_to_index_fields for accepted shapes.
            index_name (str): Name of the index; required with a
                SearchIndexClient, forbidden with a SearchClient.
            index_mapping:
                Optional function with definition
                (enriched_doc: Dict[str, str], metadata: Dict[str, Any]): Dict[str,str]
                used to map document fields to the AI search index fields
                (return value of function).
                If none is specified a default mapping is provided which uses
                the field keys. The keys in the enriched_doc are
                ["id", "chunk", "embedding", "metadata"]
                The default mapping is:
                    - "id" to id_field_key
                    - "chunk" to chunk_field_key
                    - "embedding" to embedding_field_key
                    - "metadata" to metadata_string_field_key
            index_management (IndexManagement): Whether to skip validation,
                validate the index exists, or create it if missing.
            embedding_dimensionality (int): Dimensionality used when creating
                the vector field of a new index (defaults to 1536).
            **kwargs (Any): Additional keyword arguments (currently unused).

        Raises:
            ImportError: Unable to import `azure-search-documents`
            ValueError: If `search_or_index_client` is not provided
            ValueError: If `index_name` is not provided and `search_or_index_client`
                is of type azure.search.documents.SearchIndexClient
            ValueError: If `index_name` is provided and `search_or_index_client`
                is of type azure.search.documents.SearchClient
            ValueError: If `index_management` is IndexManagement.CREATE_IF_NOT_EXISTS
                and `search_or_index_client` is of type azure.search.documents.SearchClient
        """
        import_err_msg = (
            "`azure-search-documents` package not found, please run "
            "`pip install azure-search-documents==11.4.0`"
        )

        try:
            import azure.search.documents  # noqa
            from azure.search.documents import SearchClient
            from azure.search.documents.indexes import SearchIndexClient
        except ImportError:
            raise ImportError(import_err_msg)

        # Placeholders typed for the checkers; exactly one of these is replaced
        # with a real client below (a SearchIndexClient yields both).
        self._index_client: SearchIndexClient = cast(SearchIndexClient, None)
        self._search_client: SearchClient = cast(SearchClient, None)
        self.embedding_dimensionality = embedding_dimensionality

        # Validate search_or_index_client
        if search_or_index_client is not None:
            if isinstance(search_or_index_client, SearchIndexClient):
                # If SearchIndexClient is supplied so must index_name
                self._index_client = cast(SearchIndexClient, search_or_index_client)

                if not index_name:
                    raise ValueError(
                        "index_name must be supplied if search_or_index_client is of "
                        "type azure.search.documents.SearchIndexClient"
                    )

                # Derive the per-index SearchClient from the index client.
                self._search_client = self._index_client.get_search_client(
                    index_name=index_name
                )

            elif isinstance(search_or_index_client, SearchClient):
                self._search_client = cast(SearchClient, search_or_index_client)

                # A SearchClient is already bound to an index, so a separate
                # index_name is contradictory.
                if index_name:
                    raise ValueError(
                        "index_name cannot be supplied if search_or_index_client "
                        "is of type azure.search.documents.SearchClient"
                    )

            # Neither isinstance branch matched: unsupported client type.
            if not self._index_client and not self._search_client:
                raise ValueError(
                    "search_or_index_client must be of type "
                    "azure.search.documents.SearchClient or "
                    "azure.search.documents.SearchIndexClient"
                )
        else:
            raise ValueError("search_or_index_client not specified")

        # Index creation requires a SearchIndexClient.
        if (
            index_management == IndexManagement.CREATE_IF_NOT_EXISTS
            and not self._index_client
        ):
            raise ValueError(
                "index_management has value of IndexManagement.CREATE_IF_NOT_EXISTS "
                "but search_or_index_client is not of type "
                "azure.search.documents.SearchIndexClient"
            )

        self._index_management = index_management

        # Default field mapping
        field_mapping = {
            "id": id_field_key,
            "chunk": chunk_field_key,
            "embedding": embedding_field_key,
            "metadata": metadata_string_field_key,
            "doc_id": doc_id_field_key,
        }

        self._field_mapping = field_mapping

        self._index_mapping = (
            self._default_index_mapping if index_mapping is None else index_mapping
        )

        # Normalise the filterable-metadata spec into
        # {metadata key: (index field name, type)} form.
        self._metadata_to_index_field_map = self._normalise_metadata_to_index_fields(
            filterable_metadata_field_keys
        )

        if self._index_management == IndexManagement.CREATE_IF_NOT_EXISTS:
            if index_name:
                self._create_index_if_not_exists(index_name)

        if self._index_management == IndexManagement.VALIDATE_INDEX:
            self._validate_index(index_name)
383

384
    @property
    def client(self) -> Any:
        """The underlying azure.search.documents.SearchClient bound to the index."""
        return self._search_client
388

389
    def _default_index_mapping(
        self, enriched_doc: Dict[str, str], metadata: Dict[str, Any]
    ) -> Dict[str, str]:
        """Map an enriched document onto the configured index field names.

        Core fields are copied under their index names; filterable metadata
        values are copied into their dedicated index fields when truthy.
        """
        index_doc: Dict[str, str] = {
            index_name: enriched_doc[doc_key]
            for doc_key, index_name in self._field_mapping.items()
        }

        for metadata_key, (
            index_field_name,
            _,
        ) in self._metadata_to_index_field_map.items():
            value = metadata.get(metadata_key)
            # Falsy values (None, "", 0, False) are deliberately skipped.
            if value:
                index_doc[index_field_name] = value

        return index_doc
406

407
    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index associated with the configured search client.

        Documents are merged-or-uploaded in batches of 10.

        Args:
            nodes: List[BaseNode]: nodes with embeddings

        """
        if not self._search_client:
            raise ValueError("Search client not initialized")

        ids = []
        batch = []

        for node in nodes:
            logger.debug(f"Processing embedding: {node.node_id}")
            ids.append(node.node_id)
            batch.append(self._create_index_document(node))

            # Flush once a full batch of 10 documents has accumulated.
            if len(batch) >= 10:
                logger.info(
                    f"Uploading batch of size {len(batch)}, "
                    f"current progress {len(ids)} of {len(nodes)}"
                )
                self._search_client.merge_or_upload_documents(batch)
                batch = []

        # Upload remaining batch of less than 10 documents
        if batch:
            logger.info(
                f"Uploading remaining batch of size {len(batch)}, "
                f"current progress {len(ids)} of {len(nodes)}"
            )
            self._search_client.merge_or_upload_documents(batch)

        return ids
450

451
    def _create_index_document(self, node: BaseNode) -> Dict[str, Any]:
        """Create AI Search index document from embedding result."""
        # Assemble the enriched document with its canonical keys; the metadata
        # blob is serialized to JSON and attached afterwards so the mapping
        # function receives both forms.
        doc: Dict[str, Any] = {
            "id": node.node_id,
            "chunk": node.get_content(metadata_mode=MetadataMode.NONE) or "",
            "embedding": node.get_embedding(),
            "doc_id": node.ref_doc_id,
        }

        node_metadata = node_to_metadata_dict(
            node,
            remove_text=True,
            flat_metadata=self.flat_metadata,
        )
        doc["metadata"] = json.dumps(node_metadata)

        return self._index_mapping(doc, node_metadata)
468

469
    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete documents from the AI Search Index
        with doc_id_field_key field equal to ref_doc_id.

        Args:
            ref_doc_id (str): Source document id; every index document whose
                doc_id field equals this value is deleted.
        """
        # Locate documents to delete. Renamed from `filter`, which shadowed
        # the builtin; the debug line now logs before the search executes.
        # NOTE(review): ref_doc_id is interpolated without OData escaping;
        # an id containing a single quote would break the filter — confirm
        # ids never contain quotes.
        odata_filter = f'{self._field_mapping["doc_id"]} eq \'{ref_doc_id}\''
        logger.debug(f"Searching with filter {odata_filter}")

        results = self._search_client.search(search_text="*", filter=odata_filter)

        docs_to_delete = []
        for result in results:
            doc = {}
            doc["id"] = result[self._field_mapping["id"]]
            logger.debug(f"Found document to delete: {doc}")
            docs_to_delete.append(doc)

        if len(docs_to_delete) > 0:
            logger.debug(f"Deleting {len(docs_to_delete)} documents")
            self._search_client.delete_documents(docs_to_delete)
490

491
    def _create_odata_filter(self, metadata_filters: MetadataFilters) -> str:
        """Generate an OData filter string using supplied metadata filters."""
        expressions: List[str] = []
        for f in metadata_filters.legacy_filters():
            if not isinstance(f, ExactMatchFilter):
                raise NotImplementedError(
                    "Only `ExactMatchFilter` filters are supported"
                )

            # Filtering requires a mapping from the metadata key to a
            # filterable index field; fail loudly when one is missing.
            metadata_mapping = self._metadata_to_index_field_map.get(f.key)
            if not metadata_mapping:
                raise ValueError(
                    f"Metadata field '{f.key}' is missing a mapping to an index field, "
                    "provide entry in 'filterable_metadata_field_keys' for this "
                    "vector store"
                )

            index_field = metadata_mapping[0]

            if isinstance(f.value, str):
                # OData string literals escape single quotes by doubling them.
                escaped_value = f.value.replace("'", "''")
                expressions.append(f"{index_field} eq '{escaped_value}'")
            else:
                expressions.append(f"{index_field} eq {f.value}")

        odata_expr = " and ".join(expressions)
        logger.info(f"Odata filter: {odata_expr}")
        return odata_expr
526

527
    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Run the query, dispatching to the search strategy for its mode."""
        odata_filter = None
        if query.filters is not None:
            odata_filter = self._create_odata_filter(query.filters)

        # Any mode not listed here falls back to pure vector search.
        strategy_by_mode = {
            VectorStoreQueryMode.SPARSE: AzureQueryResultSearchSparse,
            VectorStoreQueryMode.HYBRID: AzureQueryResultSearchHybrid,
            VectorStoreQueryMode.SEMANTIC_HYBRID: AzureQueryResultSearchSemanticHybrid,
        }
        strategy_cls = strategy_by_mode.get(query.mode, AzureQueryResultSearchDefault)

        azure_query_result_search: AzureQueryResultSearchBase = strategy_cls(
            query, self._field_mapping, odata_filter, self._search_client
        )
        return azure_query_result_search.search()
549

550

551
class AzureQueryResultSearchBase:
    """Executes a query against Azure AI Search and converts the raw results
    into a VectorStoreQueryResult.

    Subclasses customize the search text and/or the vector queries.
    """

    def __init__(
        self,
        query: VectorStoreQuery,
        field_mapping: Dict[str, str],
        odata_filter: Optional[str],
        search_client: Any,
    ) -> None:
        self._query = query
        self._field_mapping = field_mapping
        self._odata_filter = odata_filter
        self._search_client = search_client

    @property
    def _select_fields(self) -> List[str]:
        """Index fields retrieved for each hit."""
        mapping = self._field_mapping
        return [
            mapping["id"],
            mapping["chunk"],
            mapping["metadata"],
            mapping["doc_id"],
        ]

    def _create_search_query(self) -> str:
        # Base strategy performs no text search.
        return "*"

    def _create_query_vector(self) -> Optional[List[Any]]:
        # Base strategy performs no vector search.
        return None

    def _create_query_result(
        self, search_query: str, vectors: Optional[List[Any]]
    ) -> VectorStoreQueryResult:
        """Run the search and convert each hit back into a node."""
        hits = self._search_client.search(
            search_text=search_query,
            vector_queries=vectors,
            top=self._query.similarity_top_k,
            select=self._select_fields,
            filter=self._odata_filter,
        )

        ids = []
        nodes = []
        scores = []
        for result in hits:
            node_id = result[self._field_mapping["id"]]
            metadata = json.loads(result[self._field_mapping["metadata"]])
            score = result["@search.score"]
            chunk = result[self._field_mapping["chunk"]]

            try:
                node = metadata_dict_to_node(metadata)
                node.set_content(chunk)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata
                )
                node = TextNode(
                    text=chunk,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            logger.debug(f"Retrieved node id {node_id} with node data of {node}")

            ids.append(node_id)
            nodes.append(node)
            scores.append(score)

        logger.debug(
            f"Search query '{search_query}' returned {len(ids)} results."
        )

        return VectorStoreQueryResult(nodes=nodes, similarities=scores, ids=ids)

    def search(self) -> VectorStoreQueryResult:
        """Build the query pieces and execute them."""
        return self._create_query_result(
            self._create_search_query(), self._create_query_vector()
        )
635

636

637
class AzureQueryResultSearchDefault(AzureQueryResultSearchBase):
    """Pure vector search using the embedding supplied on the query."""

    def _create_query_vector(self) -> Optional[List[Any]]:
        """Build the vectorized query from the supplied embedding."""
        from azure.search.documents.models import VectorizedQuery

        if not self._query.query_embedding:
            raise ValueError("Query missing embedding")

        queries = [
            VectorizedQuery(
                vector=self._query.query_embedding,
                k_nearest_neighbors=self._query.similarity_top_k,
                fields=self._field_mapping["embedding"],
            )
        ]
        logger.info("Vector search with supplied embedding")
        return queries
653

654

655
class AzureQueryResultSearchSparse(AzureQueryResultSearchBase):
    """Keyword (sparse) search driven by the query string only."""

    def _create_search_query(self) -> str:
        query_text = self._query.query_str
        if query_text is None:
            raise ValueError("Query missing query string")

        logger.info(f"Hybrid search with search text: {query_text}")
        return query_text
664

665

666
class AzureQueryResultSearchHybrid(
    AzureQueryResultSearchDefault, AzureQueryResultSearchSparse
):
    """Hybrid search: vector query from Default plus search text from Sparse.

    MRO is Hybrid -> Default -> Sparse -> Base, so super() resolves
    _create_query_vector on Default and _create_search_query on Sparse.
    """

    def _create_query_vector(self) -> Optional[List[Any]]:
        return super()._create_query_vector()

    def _create_search_query(self) -> str:
        return super()._create_search_query()
674

675

676
class AzureQueryResultSearchSemanticHybrid(AzureQueryResultSearchHybrid):
    """Hybrid search re-ranked by Azure semantic search, scored with the
    reranker score."""

    def _create_query_vector(self) -> Optional[List[Any]]:
        """Build the vectorized query for the semantic-hybrid path."""
        from azure.search.documents.models import VectorizedQuery

        if not self._query.query_embedding:
            raise ValueError("Query missing embedding")

        # k is set to 50 to align with the number of accepted documents in the
        # Azure semantic reranking model.
        # https://learn.microsoft.com/azure/search/semantic-search-overview
        queries = [
            VectorizedQuery(
                vector=self._query.query_embedding,
                k_nearest_neighbors=50,
                fields=self._field_mapping["embedding"],
            )
        ]
        logger.info("Vector search with supplied embedding")
        return queries

    def _create_query_result(
        self, search_query: str, vector_queries: Optional[List[Any]]
    ) -> VectorStoreQueryResult:
        """Run the semantic-hybrid search and convert hits back into nodes."""
        hits = self._search_client.search(
            search_text=search_query,
            vector_queries=vector_queries,
            top=self._query.similarity_top_k,
            select=self._select_fields,
            filter=self._odata_filter,
            query_type="semantic",
            semantic_configuration_name="mySemanticConfig",
        )

        ids = []
        nodes = []
        scores = []
        for result in hits:
            node_id = result[self._field_mapping["id"]]
            metadata = json.loads(result[self._field_mapping["metadata"]])
            # use reranker_score instead of score
            score = result["@search.reranker_score"]
            chunk = result[self._field_mapping["chunk"]]

            try:
                node = metadata_dict_to_node(metadata)
                node.set_content(chunk)
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    metadata
                )
                node = TextNode(
                    text=chunk,
                    id_=node_id,
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )

            logger.debug(f"Retrieved node id {node_id} with node data of {node}")

            ids.append(node_id)
            nodes.append(node)
            scores.append(score)

        logger.debug(
            f"Search query '{search_query}' returned {len(ids)} results."
        )

        return VectorStoreQueryResult(nodes=nodes, similarities=scores, ids=ids)
748

749

750
# Backwards-compatible alias kept from the service's previous product name
# ("Azure Cognitive Search").
CognitiveSearchVectorStore = AzureAISearchVectorStore
751

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.