llama-index
116 строк · 3.5 Кб
"""Tabular parser.

Contains parsers for tabular data files.

"""
6
7from pathlib import Path8from typing import Any, Dict, List, Optional9
10import pandas as pd11
12from llama_index.legacy.readers.base import BaseReader13from llama_index.legacy.schema import Document14
15
class CSVReader(BaseReader):
    """CSV parser.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

    """

    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): metadata attached to every returned
                Document. An empty dict is used when it is omitted.

        Returns:
            List[Document]: a single Document whose text is all rows joined
            by newlines when ``concat_rows`` is True, otherwise one Document
            per row. The cells of a row are joined with ", ".

        """
        # csv ships with the standard library, so no ImportError guard is needed.
        import csv

        # newline="" lets the csv module handle line endings itself, so quoted
        # fields that contain embedded newlines are parsed correctly.
        with open(file, newline="") as fp:
            text_list = [", ".join(row) for row in csv.reader(fp)]

        # Mirror PandasCSVReader: never hand None to Document as metadata.
        if self._concat_rows:
            return [Document(text="\n".join(text_list), metadata=extra_info or {})]
        return [Document(text=text, metadata=extra_info or {}) for text in text_list]
54
class PandasCSVReader(BaseReader):
    r"""Pandas-based CSV parser.

    Parses CSVs with the Pandas `read_csv` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        pandas_config (Optional[dict]): Options for the `pandas.read_csv`
            function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information.
            Defaults to None, which is treated as an empty dict, so
            `pandas.read_csv` runs with its own default settings.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # A `{}` default would be one dict object shared by every instance;
        # use None as the default and keep a private copy so that neither
        # other readers nor later caller-side mutations affect this one.
        self._pandas_config = dict(pandas_config) if pandas_config else {}

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file (Path): path of the CSV file, passed to `pandas.read_csv`.
            extra_info (Optional[Dict]): metadata attached to every returned
                Document. An empty dict is used when it is omitted.

        Returns:
            List[Document]: a single Document whose text is all rows joined
            with `row_joiner` when `concat_rows` is True, otherwise one
            Document per row.

        """
        df = pd.read_csv(file, **self._pandas_config)

        # Render every cell of a row as a string and join the cells with the
        # column separator; the result is one text line per DataFrame row.
        text_list = df.apply(
            lambda row: self._col_joiner.join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return [
                Document(
                    text=self._row_joiner.join(text_list), metadata=extra_info or {}
                )
            ]
        return [Document(text=text, metadata=extra_info or {}) for text in text_list]