llama-index

Форк
0
77 строк · 2.4 Кб
1
"""txtai reader."""
2

3
from typing import Any, Dict, List
4

5
import numpy as np
6

7
from llama_index.legacy.readers.base import BaseReader
8
from llama_index.legacy.schema import Document
9

10

11
class TxtaiReader(BaseReader):
    """txtai reader.

    Retrieves documents through an existing in-memory txtai index.
    These documents can then be used in a downstream LlamaIndex data structure.
    If you wish to use txtai itself as an index to organize documents,
    insert documents, and perform queries on them, please use VectorStoreIndex
    with TxtaiVectorStore.

    Args:
        index (txtai.ann.ANN): A txtai Index object (required)

    """

    def __init__(self, index: Any):
        """Initialize with parameters.

        Args:
            index (txtai.ann.ANN): A txtai Index object. It is stored as-is;
                no validation is performed here.

        Raises:
            ImportError: If the `txtai` package cannot be imported.

        """
        import_err_msg = """
            `txtai` package not found. For instructions on
            how to install `txtai` please visit
            https://neuml.github.io/txtai/install/
        """
        try:
            # Imported only to verify that the dependency is installed.
            import txtai  # noqa
        except ImportError as err:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError(import_err_msg) from err

        self._index = index

    def load_data(
        self,
        query: np.ndarray,
        id_to_text_map: Dict[str, str],
        k: int = 4,
        separate_documents: bool = True,
    ) -> List[Document]:
        """Load data from txtai index.

        Args:
            query (np.ndarray): A 2D numpy array of query vectors.
            id_to_text_map (Dict[str, str]): A map from IDs to text.
            k (int): Number of nearest neighbors to retrieve. Defaults to 4.
            separate_documents (bool): Whether to return separate
                documents. When False, all retrieved texts are joined with
                a blank line into a single document. Defaults to True.

        Returns:
            List[Document]: A list of documents.

        Raises:
            ValueError: If a retrieved ID is missing from ``id_to_text_map``.

        """
        search_result = self._index.search(query, k)
        documents = []
        for query_result in search_result:
            # Each hit unpacks as a pair; the second element (presumably the
            # similarity score -- confirm against txtai) is not used.
            for doc_id, _ in query_result:
                # The map is keyed by strings, so normalize the ID first.
                doc_id = str(doc_id)
                if doc_id not in id_to_text_map:
                    raise ValueError(
                        f"Document ID {doc_id} not found in id_to_text_map."
                    )
                text = id_to_text_map[doc_id]
                documents.append(Document(text=text))

        if not separate_documents:
            # join all documents into one
            text_list = [doc.get_content() for doc in documents]
            text = "\n\n".join(text_list)
            documents = [Document(text=text)]

        return documents
78

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.