llama-index

Форк
0
142 строки · 4.5 Кб
1
import json
2
from typing import Any, Dict, Optional, Tuple
3

4
from llama_index.legacy.schema import (
5
    BaseNode,
6
    ImageNode,
7
    IndexNode,
8
    NodeRelationship,
9
    RelatedNodeInfo,
10
    TextNode,
11
)
12

13
# Default name of the text entry; used in this module as the default value of
# the `text_field` and `text_key` parameters.
DEFAULT_TEXT_KEY = "text"
14
# Default name of the embedding entry. Not referenced in the visible part of
# this module -- presumably consumed by importers; verify against callers.
DEFAULT_EMBEDDING_KEY = "embedding"
15
# Default name of the document-id entry.
# NOTE(review): node_to_metadata_dict writes a literal "doc_id" key rather than
# using this constant -- confirm the two are meant to stay in sync.
DEFAULT_DOC_ID_KEY = "doc_id"
16

17

18
def _validate_is_flat_dict(metadata_dict: dict) -> None:
    """Ensure a metadata dict is flat.

    Every key has to be a ``str`` and every value a ``str``, ``int``,
    ``float`` or ``None``. Entries are checked in iteration order and the
    first offending entry aborts the check.

    Args:
        metadata_dict: Mapping to validate.

    Raises:
        ValueError: If a key is not a string, or a value is not one of the
            permitted scalar types.
    """
    scalar_types = (str, int, float, type(None))
    for name, value in metadata_dict.items():
        if isinstance(name, str):
            if isinstance(value, scalar_types):
                continue
            raise ValueError(
                f"Value for metadata {name} must be one of (str, int, float, None)"
            )
        # The key check takes precedence over the value check.
        raise ValueError("Metadata key must be str!")
30

31

32
def node_to_metadata_dict(
    node: BaseNode,
    remove_text: bool = False,
    text_field: str = DEFAULT_TEXT_KEY,
    flat_metadata: bool = False,
) -> Dict[str, Any]:
    """Build the metadata dict that is stored alongside a node.

    The node is dumped with ``node.dict()``; its ``"metadata"`` entry (or a
    fresh dict if there is none) becomes the result and is extended with the
    JSON-serialized node, the node's class name and the ref doc id.

    Args:
        node: Node to convert.
        remove_text: When true, the ``text_field`` entry of the dumped node is
            set to an empty string before serialization.
        text_field: Key that ``remove_text`` blanks out.
        flat_metadata: When true, the node metadata is validated with
            ``_validate_is_flat_dict`` before the extra keys are added.

    Returns:
        The metadata dict with ``_node_content``, ``_node_type``,
        ``document_id``, ``doc_id`` and ``ref_doc_id`` set.

    Raises:
        ValueError: If ``flat_metadata`` is set and the metadata is not flat.
    """
    dumped = node.dict()
    result: Dict[str, Any] = dumped.get("metadata", {})

    if flat_metadata:
        _validate_is_flat_dict(result)

    # The whole node goes into one JSON string (some minor text duplication),
    # optionally without its text and always without the embedding.
    if remove_text:
        dumped[text_field] = ""
    dumped["embedding"] = None

    # Serialize before any bookkeeping key is added to `result`, which is
    # normally the very dict nested inside `dumped`.
    serialized = json.dumps(dumped)
    type_name = node.class_name()
    ref_doc_id = node.ref_doc_id or "None"

    # The ref doc id is stored at top level to allow metadata filtering.
    # Three spellings are kept for backwards compatibility; to be consolidated.
    result.update(
        {
            "_node_content": serialized,
            "_node_type": type_name,
            "document_id": ref_doc_id,  # for Chroma
            "doc_id": ref_doc_id,  # for Pinecone, Qdrant, Redis
            "ref_doc_id": ref_doc_id,  # for Weaviate
        }
    )
    return result
63

64

65
def metadata_dict_to_node(metadata: dict, text: Optional[str] = None) -> BaseNode:
    """Rebuild a node from a metadata dict.

    The node is parsed from the JSON string stored under ``"_node_content"``.
    The class used for parsing is chosen by ``"_node_type"``: ``IndexNode`` or
    ``ImageNode`` when the stored name matches their ``class_name()``,
    otherwise ``TextNode``.

    Args:
        metadata: Dict holding ``"_node_content"`` and, optionally,
            ``"_node_type"``.
        text: If not ``None``, passed to ``set_content`` on the parsed node.

    Returns:
        The parsed node.

    Raises:
        ValueError: If ``"_node_content"`` is missing or ``None``.
    """
    serialized = metadata.get("_node_content", None)
    type_name = metadata.get("_node_type", None)
    if serialized is None:
        raise ValueError("Node content not found in metadata dict.")

    # Try the specialised classes in order; an unknown or missing type name
    # falls through to TextNode.
    for candidate in (IndexNode, ImageNode):
        if type_name == candidate.class_name():
            node_cls = candidate
            break
    else:
        node_cls = TextNode

    node: BaseNode = node_cls.parse_raw(serialized)
    if text is not None:
        node.set_content(text)
    return node
84

85

86
# TODO: Deprecated conversion functions
87
def legacy_metadata_dict_to_node(
    metadata: dict, text_key: str = DEFAULT_TEXT_KEY
) -> Tuple[dict, dict, dict]:
    """Split a legacy metadata dict into metadata, node info and relationships.

    Works on a shallow copy, so the caller's dict is not modified.

    Args:
        metadata: Stored metadata dict; ``None`` is treated as empty.
        text_key: Key of the text entry, which is dropped from the result.

    Returns:
        A ``(metadata, node_info, relationships)`` tuple: the remaining
        metadata entries, the decoded ``"node_info"`` JSON (``{}`` if absent
        or empty) and the decoded ``"relationships"`` JSON mapped to
        ``NodeRelationship`` -> ``RelatedNodeInfo`` (``{}`` if absent or empty).
    """
    remaining = {} if metadata is None else metadata.copy()

    # node_info and relationships are stored as JSON strings.
    raw_node_info = remaining.pop("node_info", "")
    node_info = {} if raw_node_info == "" else json.loads(raw_node_info)

    raw_relationships = remaining.pop("relationships", "")
    relationships: Dict[NodeRelationship, RelatedNodeInfo] = {}
    if not (raw_relationships == ""):
        for rel_name, target in json.loads(raw_relationships).items():
            relationships[NodeRelationship(rel_name)] = RelatedNodeInfo(
                node_id=str(target)
            )

    # Drop the other known fields.
    remaining.pop(text_key, None)
    id_fields = ("id", "document_id", "doc_id")
    popped_ids = {field: remaining.pop(field, None) for field in id_fields}
    ref_doc_id = remaining.pop("ref_doc_id", None)

    # A PARENT relationship, when present, overrides the stored ref doc id.
    parent_info = relationships.get(NodeRelationship.PARENT, None)
    if parent_info is not None:
        ref_doc_id = parent_info.node_id

    # Don't remove ids that llama-index doesn't know about: an id field is put
    # back unless it merely repeats the ref doc id.
    for field, value in popped_ids.items():
        if value is not None and value != ref_doc_id:
            remaining[field] = value

    # Whatever is left is returned as metadata. Types are no longer enforced
    # here (they were in the past) because the storage format has changed.
    return dict(remaining), node_info, relationships
143

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.