llama-index

Форк
0
267 строк · 7.9 Кб
1
"""Data structures.
2

3
Nodes are decoupled from the indices.
4

5
"""
6

7
import uuid
8
from abc import abstractmethod
9
from dataclasses import dataclass, field
10
from typing import Dict, List, Optional, Sequence, Set
11

12
from dataclasses_json import DataClassJsonMixin
13

14
from llama_index.legacy.data_structs.struct_type import IndexStructType
15
from llama_index.legacy.schema import BaseNode, TextNode
16

17
# TODO: legacy backport of old Node class
18
# `Node` is a plain alias of TextNode, kept so the old class name still resolves.
Node = TextNode
19

20

21
@dataclass
class IndexStruct(DataClassJsonMixin):
    """Base data struct shared by all LlamaIndex index structures.

    Subclasses add their own fields and implement ``get_type``.
    """

    # Identifier of the index; defaults to a freshly generated UUID4 string.
    index_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Optional text summary of the index; stays None until assigned.
    summary: Optional[str] = None

    def get_summary(self) -> str:
        """Return the text summary.

        Raises:
            ValueError: If ``summary`` is still None.
        """
        current = self.summary
        if current is not None:
            return current
        raise ValueError("summary field of the index_struct not set.")

    @classmethod
    @abstractmethod
    def get_type(cls) -> IndexStructType:
        """Return the ``IndexStructType`` identifying this struct class."""
38

39

40
@dataclass
class IndexGraph(IndexStruct):
    """A graph representing the tree-structured index.

    Only node ids are stored here; the nodes themselves live elsewhere.
    """

    # mapping from index in tree to Node doc id.
    all_nodes: Dict[int, str] = field(default_factory=dict)
    # index -> node id for nodes added via ``insert_under_parent`` without a
    # parent. Note that ``insert`` does not touch this mapping.
    root_nodes: Dict[int, str] = field(default_factory=dict)
    # mapping from a node id to the ids of its children.
    node_id_to_children_ids: Dict[str, List[str]] = field(default_factory=dict)

    @property
    def node_id_to_index(self) -> Dict[str, int]:
        """Map from node id to index.

        The mapping is rebuilt from ``all_nodes`` on every access, so callers
        that need several lookups should bind it to a local first.
        """
        return {node_id: index for index, node_id in self.all_nodes.items()}

    @property
    def size(self) -> int:
        """Get the size of the graph (number of entries in ``all_nodes``)."""
        return len(self.all_nodes)

    def get_index(self, node: BaseNode) -> int:
        """Get index of node.

        Raises:
            KeyError: If the node's id is not present in ``all_nodes``.
        """
        return self.node_id_to_index[node.node_id]

    def insert(
        self,
        node: BaseNode,
        index: Optional[int] = None,
        children_nodes: Optional[Sequence[BaseNode]] = None,
    ) -> None:
        """Insert node.

        Args:
            node: Node whose id is stored.
            index: Position to store the node at; defaults to the current
                graph size when None.
            children_nodes: Nodes recorded as children of ``node``; treated
                as empty when None.
        """
        # Compare against None explicitly: ``index or self.size`` would
        # silently discard a caller-supplied index of 0.
        if index is None:
            index = self.size
        node_id = node.node_id

        self.all_nodes[index] = node_id

        if children_nodes is None:
            children_nodes = []
        self.node_id_to_children_ids[node_id] = [
            child.node_id for child in children_nodes
        ]

    def get_children(self, parent_node: Optional[BaseNode]) -> Dict[int, str]:
        """Get children nodes.

        Args:
            parent_node: Node whose children are requested, or None to get
                the root nodes.

        Returns:
            Mapping from index to node id. For ``parent_node=None`` this is
            the ``root_nodes`` dict itself, not a copy.

        Raises:
            KeyError: If the parent has no entry in
                ``node_id_to_children_ids`` or a child id is missing from
                ``all_nodes``.
        """
        if parent_node is None:
            return self.root_nodes

        children_ids = self.node_id_to_children_ids[parent_node.node_id]
        # Build the reverse mapping once rather than once per child.
        id_to_index = self.node_id_to_index
        return {id_to_index[child_id]: child_id for child_id in children_ids}

    def insert_under_parent(
        self,
        node: BaseNode,
        parent_node: Optional[BaseNode],
        new_index: Optional[int] = None,
    ) -> None:
        """Insert under parent node.

        Args:
            node: Node to insert.
            parent_node: Parent to attach ``node`` to; when None the node is
                registered as a root node with an empty children list.
            new_index: Position to store the node at; defaults to the current
                graph size when None.
        """
        # Explicit None check so that an index of 0 is honored.
        if new_index is None:
            new_index = self.size
        node_id = node.node_id

        if parent_node is None:
            self.root_nodes[new_index] = node_id
            self.node_id_to_children_ids[node_id] = []
        else:
            # Create the parent's children list on first use, then append.
            self.node_id_to_children_ids.setdefault(parent_node.node_id, []).append(
                node_id
            )

        self.all_nodes[new_index] = node_id

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Get type."""
        return IndexStructType.TREE
113

114

115
@dataclass
class KeywordTable(IndexStruct):
    """A table of keywords mapping keywords to text chunks."""

    # mapping from keyword to the ids of the nodes registered under it.
    table: Dict[str, Set[str]] = field(default_factory=dict)

    def add_node(self, keywords: List[str], node: BaseNode) -> None:
        """Register the node's id under every given keyword.

        Args:
            keywords: Keywords to index the node under.
            node: Node whose ``node_id`` is added to each keyword's set.
        """
        node_id = node.node_id
        for keyword in keywords:
            # Create the keyword's id set on first use.
            self.table.setdefault(keyword, set()).add(node_id)

    @property
    def node_ids(self) -> Set[str]:
        """Get all node ids.

        Returns an empty set when the table has no keywords.
        """
        # ``set.union(*values)`` raises TypeError when there are no values;
        # starting from an empty set handles the empty table.
        return set().union(*self.table.values())

    @property
    def keywords(self) -> Set[str]:
        """Get all keywords in the table."""
        return set(self.table.keys())

    @property
    def size(self) -> int:
        """Get the size of the table (number of keywords)."""
        return len(self.table)

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Get type."""
        return IndexStructType.KEYWORD_TABLE
147

148

149
@dataclass
class IndexList(IndexStruct):
    """Index struct holding a list of node ids."""

    # Node ids in the order they were added.
    nodes: List[str] = field(default_factory=list)

    def add_node(self, node: BaseNode) -> None:
        """Append the node's id to ``nodes``; nothing is returned."""
        # Child indices are not tracked; ordering is simply insertion order.
        node_id = node.node_id
        self.nodes.append(node_id)

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Return the struct type tag for list indices."""
        return IndexStructType.LIST
164

165

166
@dataclass
class IndexDict(IndexStruct):
    """A simple dictionary of documents."""

    # TODO: slightly deprecated, should likely be a list or set now
    # mapping from vector store id to node doc_id
    nodes_dict: Dict[str, str] = field(default_factory=dict)

    # TODO: deprecated, not used
    # mapping from node doc_id to vector store id
    doc_id_dict: Dict[str, List[str]] = field(default_factory=dict)

    # TODO: deprecated, not used
    # this should be empty for all other indices
    embeddings_dict: Dict[str, List[float]] = field(default_factory=dict)

    def add_node(
        self,
        node: BaseNode,
        text_id: Optional[str] = None,
    ) -> str:
        """Record a node in ``nodes_dict`` and return the key used.

        Args:
            node: Node whose ``node_id`` is stored as the value.
            text_id: Key to store the node under; when None the node's own
                id is used as the key.

        Returns:
            The key under which the node id was stored.
        """
        if text_id is None:
            vector_id = node.node_id
        else:
            vector_id = text_id

        self.nodes_dict[vector_id] = node.node_id
        return vector_id

    def delete(self, doc_id: str) -> None:
        """Remove the entry stored under ``doc_id`` from ``nodes_dict``.

        Raises:
            KeyError: If ``doc_id`` is not a key of ``nodes_dict``.
        """
        del self.nodes_dict[doc_id]

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Return the struct type tag for vector store indices."""
        return IndexStructType.VECTOR_STORE
203

204

205
@dataclass
class MultiModelIndexDict(IndexDict):
    """An ``IndexDict`` variant tagged as a multimodal vector store.

    Adds no fields or behavior beyond reporting a different struct type.
    """

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Return the struct type tag for multimodal vector store indices."""
        return IndexStructType.MULTIMODAL_VECTOR_STORE
213

214

215
@dataclass
class KG(IndexStruct):
    """Index struct of type ``IndexStructType.KG``.

    Holds a keyword -> node-id table plus an embedding store keyed by
    triplet string.
    """

    # Unidirectional

    # table of keywords to node ids
    table: Dict[str, Set[str]] = field(default_factory=dict)

    # TODO: legacy attribute, remove in future releases
    rel_map: Dict[str, List[List[str]]] = field(default_factory=dict)

    # TBD, should support vector store, now we just persist the embedding memory
    # maybe chainable abstractions for *_stores could be designed
    embedding_dict: Dict[str, List[float]] = field(default_factory=dict)

    @property
    def node_ids(self) -> Set[str]:
        """Get all node ids.

        Returns an empty set when the table has no keywords.
        """
        # ``set.union(*values)`` raises TypeError when there are no values;
        # starting from an empty set handles the empty table.
        return set().union(*self.table.values())

    def add_to_embedding_dict(self, triplet_str: str, embedding: List[float]) -> None:
        """Store ``embedding`` under ``triplet_str``, replacing any prior value."""
        self.embedding_dict[triplet_str] = embedding

    def add_node(self, keywords: List[str], node: BaseNode) -> None:
        """Register the node's id under every given keyword.

        Args:
            keywords: Keywords to index the node under.
            node: Node whose ``node_id`` is added to each keyword's set.
        """
        node_id = node.node_id
        for keyword in keywords:
            # Create the keyword's id set on first use.
            self.table.setdefault(keyword, set()).add(node_id)

    def search_node_by_keyword(self, keyword: str) -> List[str]:
        """Search for nodes by keyword.

        Returns:
            A new list of the node ids registered under ``keyword``; empty
            when the keyword is unknown.
        """
        if keyword not in self.table:
            return []
        return list(self.table[keyword])

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Get type."""
        return IndexStructType.KG
258

259

260
@dataclass
class EmptyIndexStruct(IndexStruct):
    """Index struct with no fields beyond those of ``IndexStruct``."""

    @classmethod
    def get_type(cls) -> IndexStructType:
        """Return the struct type tag for empty indices."""
        return IndexStructType.EMPTY
268

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.