from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, TypeVar, Union

from llama_index.legacy.readers.base import BaseReader
from llama_index.legacy.readers.schema.base import Document

# define types
ID = str
IDs = List[ID]

Vector = Union[Sequence[float], Sequence[int]]
Embedding = Vector
Embeddings = List[Embedding]

Metadata = Mapping[str, Union[str, int, float]]
Metadatas = List[Metadata]

# Metadata Query Grammar
LiteralValue = Union[str, int, float]
LogicalOperator = Literal["$and", "$or"]
WhereOperator = Literal["$gt", "$gte", "$lt", "$lte", "$ne", "$eq"]
OperatorExpression = Dict[Union[WhereOperator, LogicalOperator], LiteralValue]

Where = Dict[
    Union[str, LogicalOperator], Union[LiteralValue, OperatorExpression, List["Where"]]
]

WhereDocumentOperator = Union[Literal["$contains"], LogicalOperator]
WhereDocument = Dict[WhereDocumentOperator, Union[str, List["WhereDocument"]]]
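
# Illustrative examples of valid filters for the grammar above (field names
# and values are placeholders, not part of the reader itself):
#   where = {"author": "smith", "year": {"$gte": 2020}}
#   where = {"$or": [{"topic": "ml"}, {"topic": "nlp"}]}
#   where_document = {"$contains": "transformer"}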

ClusterMetadata = Dict[Any, Any]

Doc = str
Documents = List[Doc]

Parameter = TypeVar("Parameter", Embedding, Doc, Metadata, ID)
T = TypeVar("T")
OneOrMany = Union[T, List[T]]

# This should just be List[Literal["documents", "embeddings", "metadatas", "distances"]]
# However, this provokes an incompatibility with the Overrides library and Python 3.7
Include = List[Literal["documents", "embeddings", "metadatas", "distances"]]

LiteralValue = LiteralValue
LogicalOperator = LogicalOperator
WhereOperator = WhereOperator
OperatorExpression = OperatorExpression
Where = Where
WhereDocumentOperator = WhereDocumentOperator


class BagelReader(BaseReader):
    """Reader that loads documents from a Bagel cluster."""

    def __init__(self, collection_name: str) -> None:
        """Initialize BagelReader.

        Args:
            collection_name: Name of the collection (cluster) to load from.

        Returns:
            None
        """
        try:
            import bagel
        except ImportError:
            raise ImportError(
                "`bagel` package not found, please run `pip install bagel`"
            )
        from bagel.config import Settings

        if not collection_name:
            raise ValueError("collection_name cannot be empty")

        self.collection_name = collection_name

        server_settings = Settings(
            bagel_api_impl="rest", bagel_server_host="api.bageldb.ai"
        )

        self.client = bagel.Client(server_settings)

        self._collection = self.client.get_cluster(collection_name)

    def create_documents(self, results: Any) -> Any:
        """Create documents from the results.

        Args:
            results: Results from the query.

        Returns:
            List of documents.
        """
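        # Assumed shape of `results`, inferred from how it is unpacked below
        # (the outer list holds one entry per query; only the first query's
        # results are used):
        #   {"ids": [[...]], "documents": [[...]],
        #    "embeddings": [[...]], "metadatas": [[...]]}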
        documents = []
        # create a list of results
        all_results = list(
            zip(
                results["ids"][0],
                results["documents"][0],
                results["embeddings"][0],
                results["metadatas"][0],
            )
        )
        # iterate through the results
        for result in all_results:
            # create a Llama Document
            document = Document(
                doc_id=result[0],
                text=result[1],
                embedding=result[2],
                metadata=result[3],
            )
            documents.append(document)

        return documents

    def load_data(
        self,
        query_vector: Optional[OneOrMany[Embedding]] = None,
        query_texts: Optional[OneOrMany[Doc]] = None,
        limit: int = 10,
        where: Optional[Where] = None,
        where_document: Optional[WhereDocument] = None,
        include: Include = ["metadatas", "documents", "embeddings", "distances"],
    ) -> Any:
        """Get the top `limit` documents for the provided query_vector or query_texts.

        Args:
            query_vector: The embedding(s) to get the closest neighbors of. Optional.
            query_texts: The document text(s) to get the closest neighbors of. Optional.
            limit: The number of neighbors to return for each query. Defaults to 10.
            where: A Where type dict used to filter results by metadata. Optional.
            where_document: A WhereDocument type dict used to filter by document content. Optional.
            include: A list of what to include in the results. Optional.

        Returns:
            Llama Index Document(s) with the closest embeddings to the
            query_vector or query_texts.
        """
        # If neither query_vector nor query_texts is provided,
        # or both are provided, raise an error.
        if (query_vector is None and query_texts is None) or (
            query_vector is not None and query_texts is not None
        ):
            raise ValueError(
                "You must provide either embeddings or texts to find, but not both"
            )

        if where is None:
            where = {}

        if where_document is None:
            where_document = {}

        # get the results from the collection
        results = self._collection.find(
            query_embeddings=query_vector,
            query_texts=query_texts,
            n_results=limit,
            where=where,
            where_document=where_document,
            include=include,
        )

        # check if there are results
        if not results:
            raise ValueError("No results found")

        # check if there are embeddings or documents
        if not results["embeddings"] and not results["documents"]:
            raise ValueError("No embeddings or documents found")

        # create documents from the results
        return self.create_documents(results)
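

# Illustrative usage sketch (assumes a Bagel cluster named "my_cluster" already
# exists on the configured server; the names and values below are placeholders):
#
#     reader = BagelReader(collection_name="my_cluster")
#     docs = reader.load_data(query_texts="What is a bagel?", limit=5)
#
#     # or query with a precomputed embedding and a metadata filter:
#     docs = reader.load_data(
#         query_vector=[0.1, 0.2, 0.3],
#         where={"year": {"$gte": 2020}},
#     )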
