llama-index

from typing import Any, List, Optional, Type, cast

import pandas as pd

from llama_index.legacy.bridge.pydantic import BaseModel, Field
from llama_index.legacy.program.llm_prompt_program import BaseLLMFunctionProgram
from llama_index.legacy.program.openai_program import OpenAIPydanticProgram
from llama_index.legacy.types import BasePydanticProgram


class DataFrameRow(BaseModel):
    """Row in a DataFrame."""

    row_values: List[Any] = Field(
        ...,
        description="List of row values, where each value corresponds to a column of the data frame.",
    )


class DataFrameColumn(BaseModel):
    """Column in a DataFrame."""

    column_name: str = Field(..., description="Column name.")
    column_desc: Optional[str] = Field(..., description="Column description.")

class DataFrame(BaseModel):
    """Data-frame class.

    Consists of a `rows` field, which is a list of `DataFrameRow` objects,
    and a `columns` field, which is a list of `DataFrameColumn` objects.

    """

    description: Optional[str] = None

    columns: List[DataFrameColumn] = Field(..., description="List of columns.")
    rows: List[DataFrameRow] = Field(
        ...,
        description="""List of DataFrameRow objects. Each DataFrameRow contains \
        values in order of the data frame columns.""",
    )

    def to_df(self) -> pd.DataFrame:
        """To dataframe."""
        return pd.DataFrame(
            [row.row_values for row in self.rows],
            columns=[col.column_name for col in self.columns],
        )
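

# Usage sketch (illustrative only, not part of the original module). A `DataFrame`
# is normally produced by one of the LLM programs below, but it can also be built
# by hand, and `to_df()` converts it into a pandas DataFrame.
def _example_dataframe_to_df() -> pd.DataFrame:  # hypothetical helper
    df_obj = DataFrame(
        description="City populations",
        columns=[
            DataFrameColumn(column_name="city", column_desc="City name"),
            DataFrameColumn(column_name="population", column_desc="Population count"),
        ],
        rows=[
            DataFrameRow(row_values=["Toronto", 2_930_000]),
            DataFrameRow(row_values=["Berlin", 3_645_000]),
        ],
    )
    # Produces a pandas DataFrame with columns ["city", "population"] and two rows.
    return df_obj.to_df()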


class DataFrameRowsOnly(BaseModel):
    """Data-frame with rows only. Assumes column names are already known."""

    rows: List[DataFrameRow] = Field(..., description="""List of row objects.""")

    def to_df(self, existing_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """To dataframe."""
        if existing_df is None:
            return pd.DataFrame([row.row_values for row in self.rows])
        else:
            new_df = pd.DataFrame([row.row_values for row in self.rows])
            new_df.columns = existing_df.columns
            # assume row values are in order of column names
            return pd.concat([existing_df, new_df], ignore_index=True)
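

# Usage sketch (illustrative only, not part of the original module): appending
# extracted rows to an existing pandas DataFrame. Row values are assumed to be
# in the same order as the existing columns.
def _example_rows_only_to_df() -> pd.DataFrame:  # hypothetical helper
    existing_df = pd.DataFrame({"city": ["Toronto"], "population": [2_930_000]})
    rows_only = DataFrameRowsOnly(rows=[DataFrameRow(row_values=["Berlin", 3_645_000])])
    # Returns a new DataFrame with the extracted row appended to `existing_df`.
    return rows_only.to_df(existing_df=existing_df)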


class DataFrameValuesPerColumn(BaseModel):
    """Data-frame as a list of column objects.

    Each column object contains a list of values. Note that the columns can be
    of variable length, so they may not be convertible to a dataframe.

    """

    columns: List[DataFrameRow] = Field(..., description="""List of column objects.""")

DEFAULT_FULL_DF_PARSER_TMPL = """
Please extract the following query into structured data.
Query: {input_str}.
Please extract both the set of column names and the set of rows.
"""

DEFAULT_ROWS_DF_PARSER_TMPL = """
Please extract the following query into structured data.
Query: {input_str}.
The column schema is the following: {column_schema}.
"""


class DFFullProgram(BasePydanticProgram[DataFrame]):
    """Data-frame program.

    Extracts text into a schema + datapoints.

    """

    def __init__(
        self,
        pydantic_program_cls: Type[BaseLLMFunctionProgram],
        df_parser_template_str: str = DEFAULT_FULL_DF_PARSER_TMPL,
        input_key: str = "input_str",
        **program_kwargs: Any,
    ) -> None:
        """Init params."""
        pydantic_program = pydantic_program_cls.from_defaults(
            DataFrame, df_parser_template_str, **program_kwargs
        )
        self._validate_program(pydantic_program)
        self._pydantic_program = pydantic_program
        self._input_key = input_key

    @classmethod
    def from_defaults(
        cls,
        pydantic_program_cls: Optional[Type[BaseLLMFunctionProgram]] = None,
        df_parser_template_str: str = DEFAULT_FULL_DF_PARSER_TMPL,
        input_key: str = "input_str",
    ) -> "DFFullProgram":
        """Full DF output parser."""
        pydantic_program_cls = pydantic_program_cls or OpenAIPydanticProgram

        return cls(
            pydantic_program_cls,
            df_parser_template_str=df_parser_template_str,
            input_key=input_key,
        )

    def _validate_program(self, pydantic_program: BasePydanticProgram) -> None:
        if pydantic_program.output_cls != DataFrame:
            raise ValueError("Output class of pydantic program must be `DataFrame`.")

    @property
    def output_cls(self) -> Type[DataFrame]:
        """Output class."""
        return DataFrame

    def __call__(self, *args: Any, **kwds: Any) -> DataFrame:
        """Call."""
        if self._input_key not in kwds:
            raise ValueError(f"Input key {self._input_key} not found in kwds.")
        result = self._pydantic_program(**{self._input_key: kwds[self._input_key]})
        return cast(DataFrame, result)
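

# Usage sketch (illustrative only, not part of the original module). DFFullProgram
# wraps a pydantic program (OpenAIPydanticProgram by default) that extracts both the
# column schema and the rows from free-form text. Running it requires LLM credentials
# (e.g. an OpenAI API key), so this helper is only a sketch.
def _example_full_program(text: str) -> pd.DataFrame:  # hypothetical helper
    program = DFFullProgram.from_defaults()
    # The program is keyed on `input_str` by default (see `input_key` above).
    response = program(input_str=text)
    return response.to_df()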


class DFRowsProgram(BasePydanticProgram[DataFrameRowsOnly]):
    """DF Rows output parser.

    Given DF schema, extract text into a set of rows.

    """

    def __init__(
        self,
        pydantic_program_cls: Type[BaseLLMFunctionProgram],
        df_parser_template_str: str = DEFAULT_ROWS_DF_PARSER_TMPL,
        column_schema: Optional[str] = None,
        input_key: str = "input_str",
        **program_kwargs: Any,
    ) -> None:
        """Init params."""
        # partial format df parser template string with column schema
        prompt_template_str = df_parser_template_str.replace(
            "{column_schema}", column_schema or ""
        )

        pydantic_program = pydantic_program_cls.from_defaults(
            DataFrameRowsOnly, prompt_template_str, **program_kwargs
        )
        self._validate_program(pydantic_program)
        self._pydantic_program = pydantic_program
        self._input_key = input_key

    def _validate_program(self, pydantic_program: BasePydanticProgram) -> None:
        if pydantic_program.output_cls != DataFrameRowsOnly:
            raise ValueError(
                "Output class of pydantic program must be `DataFramRowsOnly`."
179
            )

    @classmethod
    def from_defaults(
        cls,
        pydantic_program_cls: Optional[Type[BaseLLMFunctionProgram]] = None,
        df_parser_template_str: str = DEFAULT_ROWS_DF_PARSER_TMPL,
        df: Optional[pd.DataFrame] = None,
        column_schema: Optional[str] = None,
        input_key: str = "input_str",
        **kwargs: Any,
    ) -> "DFRowsProgram":
        """Rows DF output parser."""
        pydantic_program_cls = pydantic_program_cls or OpenAIPydanticProgram

        # either one of df or column_schema needs to be specified
        if df is None and column_schema is None:
            raise ValueError(
                "Either `df` or `column_schema` must be specified for "
                "DFRowsOutputParser."
199
            )
        # first, inject the column schema into the template string
        if column_schema is None:
            assert df is not None
            # by default, derive the column schema from the column names
            column_schema = ", ".join(df.columns)
205

206
        return cls(
207
            pydantic_program_cls,
208
            df_parser_template_str=df_parser_template_str,
209
            column_schema=column_schema,
210
            input_key=input_key,
211
            **kwargs,
212
        )
213

214
    @property
215
    def output_cls(self) -> Type[DataFrameRowsOnly]:
216
        """Output class."""
217
        return DataFrameRowsOnly
218

219
    def __call__(self, *args: Any, **kwds: Any) -> DataFrameRowsOnly:
220
        """Call."""
221
        if self._input_key not in kwds:
222
            raise ValueError(f"Input key {self._input_key} not found in kwds.")
223
        result = self._pydantic_program(**{self._input_key: kwds[self._input_key]})
224
        return cast(DataFrameRowsOnly, result)
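

# Usage sketch (illustrative only, not part of the original module). DFRowsProgram
# keeps an existing column schema fixed and only extracts new rows, which can then be
# appended to the source DataFrame. Running it requires LLM credentials, so this
# helper is only a sketch.
def _example_rows_program(text: str) -> pd.DataFrame:  # hypothetical helper
    existing_df = pd.DataFrame(columns=["city", "population"])
    # The column schema is derived from `existing_df.columns` by default.
    program = DFRowsProgram.from_defaults(df=existing_df)
    response = program(input_str=text)
    # Append the extracted rows to the existing DataFrame.
    return response.to_df(existing_df=existing_df)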