llama-index

tabular_reader.py
116 строк · 3.5 Кб
Перенос по словам
1
"""Tabular parser.

Contains parsers for tabular data files.

"""
6

7
from pathlib import Path
8
from typing import Any, Dict, List, Optional
9

10
import pandas as pd
11

12
from llama_index.legacy.readers.base import BaseReader
13
from llama_index.legacy.schema import Document
14

15

16
class CSVReader(BaseReader):
    """CSV parser.

    Reads a CSV file with the standard-library ``csv`` module and renders
    every row as its fields joined by ", ".

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

    """

    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file (Path): path of the CSV file to read.
            extra_info (Optional[Dict]): metadata passed to every returned
                Document. An empty dict is used when it is omitted or empty.

        Returns:
            List[Document]: a single Document whose text is all rows joined
            by newlines when ``concat_rows`` is True, otherwise one Document
            per row.

        """
        # csv is part of the standard library, so no ImportError guard is needed.
        import csv

        # newline="" leaves line-ending handling to the csv module, so that
        # newlines embedded in quoted fields are read back unchanged.
        with open(file, newline="") as fp:
            text_list = [", ".join(row) for row in csv.reader(fp)]

        # A missing extra_info is normalised to {} rather than forwarded as None.
        if self._concat_rows:
            return [Document(text="\n".join(text_list), metadata=extra_info or {})]
        return [Document(text=text, metadata=extra_info or {}) for text in text_list]
53

54

55
class PandasCSVReader(BaseReader):
    r"""Pandas-based CSV parser.

    Parses CSVs with the Pandas `read_csv` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, a Document will be created for each row.
            True by default.

        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        pandas_config (Optional[dict]): Options for the `pandas.read_csv`
            function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information.
            None by default, which is treated as an empty dict; `read_csv`
            is then called with its own defaults.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Optional[Dict] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # Keep a private copy: a mutable `{}` default would be one dict shared
        # by every instance, and a caller-owned dict could change under us.
        self._pandas_config = {} if pandas_config is None else dict(pandas_config)

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file (Path): path of the CSV file, handed to `pandas.read_csv`.
            extra_info (Optional[Dict]): metadata passed to every returned
                Document. An empty dict is used when it is omitted or empty.

        Returns:
            List[Document]: a single Document whose text is all rows joined
            by `row_joiner` when `concat_rows` is True, otherwise one
            Document per row.

        """
        df = pd.read_csv(file, **self._pandas_config)

        # Render each row as text: cast its cells to str and join them with
        # the column separator.
        text_list = df.apply(
            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return [
                Document(
                    text=(self._row_joiner).join(text_list), metadata=extra_info or {}
                )
            ]
        return [Document(text=text, metadata=extra_info or {}) for text in text_list]
117
llama-index

Использование cookies