llama-index

json.py
124 строки · 4.6 Кб
Перенос по словам
1
"""JSON Reader."""
2

3
import json
4
import re
5
from typing import Any, Generator, List, Optional
6

7
from llama_index.legacy.readers.base import BaseReader
8
from llama_index.legacy.schema import Document
9

10

11
def _depth_first_yield(
12
    json_data: Any,
13
    levels_back: int,
14
    collapse_length: Optional[int],
15
    path: List[str],
16
    ensure_ascii: bool = False,
17
) -> Generator[str, None, None]:
18
    """Do depth first yield of all of the leaf nodes of a JSON.
19

20
    Combines keys in the JSON tree using spaces.
21

22
    If levels_back is set to 0, prints all levels.
23
    If collapse_length is not None and the json_data is <= that number
24
      of characters, then we collapse it into one line.
25

26
    """
27
    if isinstance(json_data, (dict, list)):
28
        # only try to collapse if we're not at a leaf node
29
        json_str = json.dumps(json_data, ensure_ascii=ensure_ascii)
30
        if collapse_length is not None and len(json_str) <= collapse_length:
31
            new_path = path[-levels_back:]
32
            new_path.append(json_str)
33
            yield " ".join(new_path)
34
            return
35
        elif isinstance(json_data, dict):
36
            for key, value in json_data.items():
37
                new_path = path[:]
38
                new_path.append(key)
39
                yield from _depth_first_yield(
40
                    value, levels_back, collapse_length, new_path
41
                )
42
        elif isinstance(json_data, list):
43
            for _, value in enumerate(json_data):
44
                yield from _depth_first_yield(value, levels_back, collapse_length, path)
45
    else:
46
        new_path = path[-levels_back:]
47
        new_path.append(str(json_data))
48
        yield " ".join(new_path)
49

50

51
class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options to help suss out relationships between nodes.

    Args:
        levels_back (Optional[int]): the number of levels to go back in the
          JSON tree, 0 if you want all levels. If levels_back is None, then we
          just format the JSON and make each line an embedding.

        collapse_length (Optional[int]): the maximum number of characters a
          JSON fragment would be collapsed in the output (levels_back needs to
          be not None)
          ex: if collapse_length = 10, and
          input is {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}}
          then a would be collapsed into one line, while b would not.
          Recommend starting around 100 and then adjusting from there.

        ensure_ascii (bool): passed to ``json.dumps`` whenever JSON is
          re-serialised for output. Defaults to False.

        is_jsonl (Optional[bool]): If True, indicates that the file is in
          JSONL format (one JSON value per line). Defaults to False.

    """

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl

    def load_data(self, input_file: str) -> List[Document]:
        """Load data from the input file.

        Args:
            input_file: Path of a UTF-8 encoded JSON file, or JSONL file when
                ``is_jsonl`` is set.

        Returns:
            One ``Document`` per parsed JSON value: a single document for a
            plain JSON file, one per non-blank line for JSONL.

        Raises:
            json.JSONDecodeError: If the file (or a non-blank JSONL line) is
                not valid JSON.

        """
        with open(input_file, encoding="utf-8") as f:
            if self.is_jsonl:
                records = []
                for line in f:
                    stripped = line.strip()
                    # Blank lines carry no record; json.loads("") would raise.
                    if stripped:
                        records.append(json.loads(stripped))
            else:
                records = [json.load(f)]

        # Lines made up solely of brackets, braces and commas (or empty) are
        # structural noise in the indent=0 dump; compile the filter once.
        structural_line = re.compile(r"^[{}\[\],]*$")

        documents = []
        for data in records:
            if self.levels_back is None:
                # If levels_back isn't set, we just format and make each
                # line an embedding
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                lines = [
                    line
                    for line in json_output.split("\n")
                    if not structural_line.match(line)
                ]
            else:
                # If levels_back is set, we make the embeddings contain the
                # labels from further up the JSON tree
                lines = list(
                    _depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                )
            documents.append(Document(text="\n".join(lines)))
        return documents
125
llama-index

Использование cookies