llama-index

Форк
0
86 строк · 2.8 Кб
1
"""Elasticsearch (or Opensearch) reader over REST api.
2

3
This only uses the basic search API, so it works with both Elasticsearch and OpenSearch.
4

5
"""
6

7
from typing import Any, List, Optional
8

9
from llama_index.legacy.bridge.pydantic import PrivateAttr
10
from llama_index.legacy.readers.base import BasePydanticReader
11
from llama_index.legacy.schema import Document
12

13

14
class ElasticsearchReader(BasePydanticReader):
    """
    Read documents from an Elasticsearch/Opensearch index.

    These documents can then be used in a downstream Llama Index data structure.
    Requests are sent with an `httpx.Client` whose base URL is `endpoint`.

    Args:
        endpoint (str): URL (http/https) of cluster
        index (str): Name of the index (required)
        httpx_client_args (dict): Optional additional args to pass to the `httpx.Client`
    """

    is_remote: bool = True
    endpoint: str
    index: str
    httpx_client_args: Optional[dict] = None

    # HTTP client bound to `endpoint`; created in __init__.
    _client: Any = PrivateAttr()

    def __init__(
        self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None
    ):
        """Initialize with parameters.

        Args:
            endpoint (str): URL (http/https) of cluster
            index (str): Name of the index
            httpx_client_args (Optional[dict]): Extra keyword arguments
                forwarded to `httpx.Client`.

        Raises:
            ImportError: If the `httpx` package is not installed.
        """
        import_err_msg = """
            `httpx` package not found. Install via `pip install httpx`
        """
        # Import lazily so that httpx is only required when the reader is used.
        try:
            import httpx
        except ImportError as err:
            raise ImportError(import_err_msg) from err

        # Validate the public fields first: if validation fails no client has
        # been opened yet, and the private attribute is only ever assigned on
        # a fully initialised model.
        super().__init__(
            endpoint=endpoint, index=index, httpx_client_args=httpx_client_args
        )
        self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {}))

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "ElasticsearchReader"

    def load_data(
        self,
        field: str,
        query: Optional[dict] = None,
        embedding_field: Optional[str] = None,
    ) -> List[Document]:
        """Read data from the Elasticsearch index.

        Args:
            field (str): Field in the document to retrieve text from
            query (Optional[dict]): Elasticsearch JSON query DSL object.
                For example:
                {"query": {"match": {"message": {"query": "this is a test"}}}}
            embedding_field (Optional[str]): If there are embeddings stored in
                this index, this field can be used
                to set the embedding field on the returned Document list.

        Returns:
            List[Document]: A list of documents.

        Raises:
            httpx.HTTPStatusError: If the search request returns a 4xx/5xx
                status code.
            KeyError: If a hit's `_source` does not contain `field`.

        """
        response = self._client.post(f"{self.index}/_search", json=query)
        # Fail with the real HTTP error instead of an opaque KeyError on
        # "hits" when the cluster answers with an error payload.
        response.raise_for_status()
        hits = response.json()["hits"]["hits"]

        documents = []
        for hit in hits:
            source = hit["_source"]
            # Only look up an embedding when a field name was actually given;
            # otherwise the document carries no embedding.
            embedding = source.get(embedding_field) if embedding_field else None
            documents.append(
                Document(
                    id_=hit["_id"],
                    text=source[field],
                    metadata=source,
                    embedding=embedding,
                )
            )
        return documents
87

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.