# llama-index
# 124 lines · 4.6 KB
"""JSON Reader."""

import json
import re
from typing import Any, Generator, List, Optional

from llama_index.legacy.readers.base import BaseReader
from llama_index.legacy.schema import Document


11def _depth_first_yield(12json_data: Any,13levels_back: int,14collapse_length: Optional[int],15path: List[str],16ensure_ascii: bool = False,17) -> Generator[str, None, None]:18"""Do depth first yield of all of the leaf nodes of a JSON.19
20Combines keys in the JSON tree using spaces.
21
22If levels_back is set to 0, prints all levels.
23If collapse_length is not None and the json_data is <= that number
24of characters, then we collapse it into one line.
25
26"""
27if isinstance(json_data, (dict, list)):28# only try to collapse if we're not at a leaf node29json_str = json.dumps(json_data, ensure_ascii=ensure_ascii)30if collapse_length is not None and len(json_str) <= collapse_length:31new_path = path[-levels_back:]32new_path.append(json_str)33yield " ".join(new_path)34return35elif isinstance(json_data, dict):36for key, value in json_data.items():37new_path = path[:]38new_path.append(key)39yield from _depth_first_yield(40value, levels_back, collapse_length, new_path41)42elif isinstance(json_data, list):43for _, value in enumerate(json_data):44yield from _depth_first_yield(value, levels_back, collapse_length, path)45else:46new_path = path[-levels_back:]47new_path.append(str(json_data))48yield " ".join(new_path)49
50
class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options to help suss out relationships between nodes.

    Args:
        levels_back (Optional[int]): the number of levels to go back in the
            JSON tree, 0 if you want all levels. If levels_back is None, then
            we just format the JSON and make each line an embedding

        collapse_length (Optional[int]): the maximum number of characters a
            JSON fragment would be collapsed in the output (levels_back needs
            to be not None)
            ex: if collapse_length = 10, and
            input is {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}}
            then a would be collapsed into one line, while b would not.
            Recommend starting around 100 and then adjusting from there.

        ensure_ascii (bool): passed to ``json.dumps`` whenever JSON is
            re-serialized for the output text. Defaults to False.

        is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
            Defaults to False.

    """

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl

    def load_data(self, input_file: str) -> List[Document]:
        """Load data from the input file.

        Args:
            input_file: Path of a UTF-8 encoded JSON file, or JSONL file when
                ``is_jsonl`` is set.

        Returns:
            One Document per parsed JSON value: a single Document for a plain
            JSON file, or one per non-blank line for a JSONL file.

        """
        records = []
        with open(input_file, encoding="utf-8") as f:
            if self.is_jsonl:
                for raw_line in f:
                    stripped = raw_line.strip()
                    # Blank lines (e.g. extra trailing newlines) carry no
                    # record; json.loads("") would raise on them.
                    if stripped:
                        records.append(json.loads(stripped))
            else:
                records.append(json.load(f))

        # Lines made only of brackets, braces and commas (or empty lines)
        # are structural noise in the indent=0 dump below.
        structural_line = re.compile(r"^[{}\[\],]*$")

        documents = []
        for data in records:
            if self.levels_back is None:
                # If levels_back isn't set, we just format and make each
                # line an embedding
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                text_lines = [
                    line
                    for line in json_output.split("\n")
                    if not structural_line.match(line)
                ]
            else:
                # If levels_back is set, we make the embeddings contain the
                # labels from further up the JSON tree
                text_lines = list(
                    _depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                )
            documents.append(Document(text="\n".join(text_lines)))
        return documents