FEDOT

data_types.py
501 строка · 23.8 Кб
Перенос по словам
1
from __future__ import annotations
2

3
from collections.abc import Sequence
4
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
5

6
import numpy as np
7
import pandas as pd
8
from golem.core.log import LoggerAdapter, default_log
9

10
from fedot.core.repository.tasks import Task, TaskTypesEnum
11

12
if TYPE_CHECKING:
13
    from fedot.core.data.data import InputData
14

15
# Python types a table cell may have; the position in the tuple is the numeric type id
_convertable_types = (bool, float, int, str, type(None))  # preserve lexicographical order
_type_ids = range(len(_convertable_types))

# Mapping "python type -> integer type id"
TYPE_TO_ID = {cell_type: type_id for cell_type, type_id in zip(_convertable_types, _type_ids)}

# Row labels of the per-column info frame built by define_column_types
_TYPES = 'types'
_FLOAT_NUMBER = 'float_number'
_INT_NUMBER = 'int_number'
_STR_NUMBER = 'str_number'
_NAN_NUMBER = 'nan_number'
_NAN_IDS = 'nan_ids'

FEDOT_STR_NAN = 'fedot_nan'

# A numerical feature column with fewer than 13 unique values is converted into a string
# (categorical) column, otherwise it stays numerical
CATEGORICAL_MAX_UNIQUE_TH = 13

# A string column is removed when the share of values that failed numeric conversion lies
# between the two bounds below: in that case the column holds approximately the same number
# of truly string values and ints/floats
ACCEPTABLE_CONVERSION_FAILED_RATE_BOTTOM, ACCEPTABLE_CONVERSION_FAILED_RATE_TOP = 0.4, 0.65
34

35

36
class TableTypesCorrector:
    """
    Class for checking types in input data. Also perform conversion for columns with types conflicts.

    The corrector is stateful: ``convert_data_for_fit`` collects the information about column types
    and the decisions made (converted / removed columns), ``convert_data_for_predict`` re-applies
    the stored decisions to new data.
    """

    def __init__(self):
        # Threshold to convert numerical into categorical column
        self.categorical_max_uniques_th = CATEGORICAL_MAX_UNIQUE_TH

        self.acceptable_failed_rate_bottom = ACCEPTABLE_CONVERSION_FAILED_RATE_BOTTOM
        self.acceptable_failed_rate_top = ACCEPTABLE_CONVERSION_FAILED_RATE_TOP

        self.features_columns_info = pd.DataFrame()
        self.target_columns_info = pd.DataFrame()

        # Dictionary with information about converted during fitting columns
        self.features_converted_columns = {}
        self.target_converted_columns = {}

        # Columns to delete due to types conflicts
        self.columns_to_del = []
        # Column ids for transformation due to number of unique values
        self.numerical_into_str = []
        self.categorical_into_float = []

        # Indices of columns with failed string into numerical transformation
        self.string_columns_transformation_failed = {}

        # Is target column contains non-numerical cells during conversion
        self.target_converting_has_errors = False

        # Lists with column types for converting calculated on source input data
        self.feature_type_ids = None
        self.target_type_ids = None
        self.log = default_log(self)

    def convert_data_for_fit(self, data: InputData):
        """ If column contain several data types - perform correction procedure

        :param data: input data, its features, target and supplementary column types are updated
        :return: the same ``data`` object
        """
        # Convert features to have an ability to insert str into float table or vice versa
        data.features = data.features.astype(object)

        # Determine types for each column in features and target if it is necessary
        self.features_columns_info = define_column_types(data.features)
        self.target_columns_info = define_column_types(data.target)

        # Correct types in features table
        data.features = self.feature_types_converting(features=data.features)
        # Remain only correct columns
        data.features = self.remove_incorrect_features(data.features, self.features_converted_columns)

        # And in target(s)
        data.target = self.target_types_converting(target=data.target, task=data.task)
        data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features,
                                                                              target=data.target,
                                                                              task=data.task)

        self._into_numeric_features_transformation_for_fit(data)
        # Launch conversion float and integer features into categorical
        self._into_categorical_features_transformation_for_fit(data)
        # Save info about features and target types
        self.feature_type_ids = data.supplementary_data.col_type_ids['features'].copy()
        self.target_type_ids = data.supplementary_data.col_type_ids.get(
            'target', np.empty((self.feature_type_ids.shape[0], 1), dtype=float)
        ).copy()

        self._retain_columns_info_without_types_conflicts(data)
        return data

    def convert_data_for_predict(self, data: InputData):
        """ Prepare data for predict stage. Include only column types transformation

        :param data: input data, its features, target and supplementary column types are updated
        :return: the same ``data`` object
        """
        # Ordering is important because after removing incorrect features - indices are obsolete
        data.features = data.features.astype(object)
        data.features = self.remove_incorrect_features(data.features, self.features_converted_columns)
        data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log)
        if data.target is not None:
            data.target = apply_type_transformation(data.target, self.target_type_ids, self.log)
        data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features,
                                                                              target=data.target,
                                                                              task=data.task)

        # Convert column types
        self._into_numeric_features_transformation_for_predict(data)
        self._into_categorical_features_transformation_for_predict(data)
        self._retain_columns_info_without_types_conflicts(data)
        return data

    def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict):
        """
        Remove from the table columns with conflicts with types were not resolved

        The indices of the removed columns are stored in ``self.columns_to_del``.

        :param table: tabular dataset based on which new dataset will be generated
        :param converted_columns: dictionary with actions with table, columns mapped to None are removed
        :return: new table without the removed columns
        """
        self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id is None]
        table = np.delete(table, self.columns_to_del, 1)
        return table

    def feature_types_converting(self, features: np.ndarray) -> np.ndarray:
        """ Convert all elements in the data in every feature column into one type

        :param features: tabular features array, converted in-place
        """
        mixed_types_columns = _find_mixed_types_columns(self.features_columns_info)
        cols_with_strings_or_floats = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER, _FLOAT_NUMBER])
        cols_with_strings_or_floats.apply(self._convert_feature_into_one_type, features=features)

        return features

    def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray:
        """ Convert all elements in every target column into one type

        :param target: tabular target array, converted in-place
        :param task: task to solve
        """
        mixed_types_columns = _find_mixed_types_columns(self.target_columns_info)
        cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER])
        cols_with_strings.apply(self._convert_target_into_one_type, target=target, task=task)

        return target

    def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = None,
                                  task: Task = None) -> dict:
        """ Prepare information about columns in a form of dictionary
        Dictionary has two keys: 'target' and 'features'. The 'target' key is omitted
        when there is no target or the task is time series forecasting.

        :param predictors: tabular features array
        :param target: tabular target array (optional)
        :param task: task to solve (optional), without it the task is not treated as forecasting
        """
        # ``task`` has None as default value, so it must not be dereferenced unconditionally
        is_ts_forecasting = task is not None and task.task_type is TaskTypesEnum.ts_forecasting

        if self.features_columns_info.empty:
            # Information about column types is empty - there is a need to launch algorithm to collect info
            self.features_columns_info = define_column_types(predictors)
            predictors = self.feature_types_converting(features=predictors)
        if self.target_columns_info.empty and not is_ts_forecasting:
            self.target_columns_info = define_column_types(target)
            target = self.target_types_converting(target=target, task=task)

        feature_type_ids = _generate_list_with_types(self.features_columns_info, self.features_converted_columns)
        self._check_columns_vs_types_number(predictors, feature_type_ids)

        if target is None or is_ts_forecasting:
            return {'features': feature_type_ids}
        else:
            target_type_ids = _generate_list_with_types(self.target_columns_info, self.target_converted_columns)
            self._check_columns_vs_types_number(target, target_type_ids)
            return {'features': feature_type_ids, 'target': target_type_ids}

    def _retain_columns_info_without_types_conflicts(self, data: InputData):
        """ Update information in supplementary info - retain info only about remained columns.
        Such columns have no conflicts with types converting.
        """
        if self.string_columns_transformation_failed:
            # Report plain column indices instead of the internal {index: None} dictionary
            failed_col_ids = list(self.string_columns_transformation_failed)
            self.log.warning(f'Columns with indices {failed_col_ids} were '
                             f'removed during mixed types column converting due to conflicts.')

            data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed)

            data.supplementary_data.col_type_ids['features'] = np.delete(
                data.supplementary_data.col_type_ids['features'],
                failed_col_ids
            )

    def _check_columns_vs_types_number(self, table: np.ndarray, col_type_ids: Sequence):
        """ Log a warning if the number of table columns differs from the number of type ids """
        # Check if columns number correct
        _, n_cols = table.shape
        if n_cols != len(col_type_ids):
            # There is an incorrect types calculation
            self.log.warning('Columns number and types numbers do not match.')

    @staticmethod
    def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index):
        """ Removes from truly str column all pseudo str values

        Every cell that can be parsed by ``float`` is replaced with ``np.nan`` in-place.

        :param data: input data with features table
        :param columns: indices of feature columns to clean
        """
        for col_id in columns:
            for row_id, item in enumerate(data.features[:, col_id]):
                try:
                    float(item)
                except (ValueError, TypeError):
                    # ValueError - truly string value, TypeError - non-numeric object such as None
                    continue
                else:
                    # item is numeric, remove its value
                    data.features[row_id, col_id] = np.nan

    def _convert_feature_into_one_type(self, column_info: pd.Series, features: np.ndarray):
        """ Determine new type for current feature column based on the string ratio. And then convert column into it.
        The decision is stored in ``self.features_converted_columns`` (None means the column must be dropped).

        :param column_info: information about types in the column, its name is the index of column in dataset
        :param features: tabular features array, the column is converted in-place
        """
        new_type_id = None
        if len(column_info[_TYPES]) == 2 and TYPE_TO_ID[type(None)] in column_info[_TYPES]:
            # Column contain only one data type and nans
            non_nan_type_lst = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
            new_type_id = non_nan_type_lst[0]
        else:
            string_objects_number = column_info[_STR_NUMBER]
            all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
            string_ratio = string_objects_number / all_elements_number

            if string_ratio > 0:
                suggested_type = str
            else:
                suggested_type = _obtain_new_column_type(column_info)

            try:
                converted = features[:, column_info.name].astype(suggested_type)
                # If there were nans in the column - paste nan
                if column_info[_NAN_NUMBER]:
                    converted = converted.astype(object)
                    converted[column_info[_NAN_IDS]] = np.nan
                    del column_info[_NAN_IDS]
                features[:, column_info.name] = converted
            except ValueError:
                # Cannot convert string objects into int or float (for example 'a' into int)
                prefix = (f'Feature column with index {column_info.name} contains '
                          f'the following data types: {column_info[_TYPES]}.')
                self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.')
            else:
                new_type_id = TYPE_TO_ID[suggested_type]
        self.features_converted_columns[column_info.name] = new_type_id

    def _convert_target_into_one_type(self, column_info: pd.Series, target: np.ndarray,
                                      task: Task) -> None:
        """ Convert target columns into one type based on column proportions of object and task

        :param column_info: information about types in the column, its name is the index of target column
        :param target: tabular target array, the column is converted in-place
        :param task: task to solve
        """
        if task.task_type is TaskTypesEnum.classification:
            # For classification labels are string if at least one element is a string
            suggested_type = str
        else:
            suggested_type = _obtain_new_column_type(column_info)
        self.target_converted_columns[column_info.name] = TYPE_TO_ID[suggested_type]

        mixed_column = target[:, column_info.name]
        try:
            target[:, column_info.name] = mixed_column.astype(suggested_type)
        except ValueError:
            # Cannot convert string objects into int or float (for example 'a' into int)
            converted_column = pd.to_numeric(mixed_column, errors='coerce')

            prefix = (f'Target column with index {column_info.name} contains '
                      f'the following data types: {column_info[_TYPES]}.')
            log_message = f'{prefix} String cannot be converted into {suggested_type}. Set unconverted values to NaN.'
            self.log.debug(log_message)
            self.target_converting_has_errors = True
            target[:, column_info.name] = converted_column

    def _into_categorical_features_transformation_for_fit(self, data: InputData):
        """
        Perform automated categorical features determination. If feature column
        contains int or float values with few unique values (more than 2 and less than
        ``self.categorical_max_uniques_th``) it is converted into string (categorical) column
        """
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])
        numeric_type_ids = np.flatnonzero(is_numeric_type)
        num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
        nuniques = num_df.nunique(dropna=True)

        # reduce dataframe to include only categorical features
        num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)]
        cat_col_ids = num_df.columns
        # Convert into string
        data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy()
        # Columns need to be transformed into categorical (string) ones
        self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str))
        # Update information about column types (in-place)
        feature_type_ids[cat_col_ids] = TYPE_TO_ID[str]

    def _into_categorical_features_transformation_for_predict(self, data: InputData):
        """ Apply conversion into categorical string column for every signed column """
        # Get numerical columns
        num_df = pd.DataFrame(data.features[:, self.numerical_into_str], columns=self.numerical_into_str)

        # Convert and apply categorical transformation
        data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy()

        # Update information about column types (in-place)
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        feature_type_ids[self.numerical_into_str] = TYPE_TO_ID[str]

    def _into_numeric_features_transformation_for_fit(self, data: InputData):
        """
        Automatically determine categorical features which should be converted into float.

        Depending on the share of values that cannot be parsed as numbers a string column is
        converted into float, cleaned from number-like values or marked for removal.
        """
        is_str_type = data.supplementary_data.col_type_ids['features'] == TYPE_TO_ID[str]
        str_col_ids = np.flatnonzero(is_str_type)
        str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids)
        orig_nans_cnt = str_cols_df.isna().sum(axis=0)

        converted_str_cols_df = str_cols_df.apply(pd.to_numeric, errors='coerce')
        conv_nans_cnt = converted_str_cols_df.isna().sum(axis=0)

        failed_objects_cnt = conv_nans_cnt - orig_nans_cnt
        non_nan_all_objects_cnt = len(data.features) - orig_nans_cnt
        failed_ratio = failed_objects_cnt / non_nan_all_objects_cnt

        # Check if the majority of objects can be converted into numerical
        is_numeric = failed_ratio < self.acceptable_failed_rate_bottom
        is_numeric_ids = is_numeric[is_numeric].index
        data.features[:, is_numeric_ids] = converted_str_cols_df[is_numeric_ids].to_numpy()
        self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float))

        # Update information about column types (in-place)
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        feature_type_ids[is_numeric_ids] = TYPE_TO_ID[float]

        # The columns consist mostly of truly str values and has a few ints/floats in it
        is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1)
        self._remove_pseudo_str_values_from_str_column(data, is_mixed[is_mixed].index)

        # If column contains a lot of '?' or 'x' as nans equivalents
        # add it to remove list
        is_of_mistakes = (
            (self.acceptable_failed_rate_bottom <= failed_ratio) &
            (failed_ratio < self.acceptable_failed_rate_top))
        self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index))

    def _into_numeric_features_transformation_for_predict(self, data: InputData):
        """ Apply conversion into float string column for every signed column """
        str_col_ids = np.setdiff1d(
            self.categorical_into_float,
            list(self.string_columns_transformation_failed)
        ).astype(int)
        str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids)
        data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy()

        # Update information about column types (in-place)
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        feature_type_ids[str_col_ids] = TYPE_TO_ID[float]
359

360

361
def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame:
    """ Collect per-column information about the types of the cells.

    The returned frame has one column per table column and the following rows:
    unique type ids, amount of floats, ints, strings and nans, indices of nan cells.

    :param table: tabular array to analyse
    """
    def _type_id_of(cell):
        return TYPE_TO_ID[type(cell)]

    # Replace every cell with the id of its type (nans are treated as None)
    source_copy = pd.DataFrame(table, copy=True)
    table_of_types = source_copy.replace({np.nan: None}).applymap(_type_id_of)

    # Row with unique type ids of each column
    unique_types_per_column = table_of_types.apply(pd.unique, result_type='reduce')
    uniques = unique_types_per_column.to_frame(_TYPES).T

    # Rows with amount of cells of each type, the order of items defines the order of rows
    counts_index_mapper = dict((
        (TYPE_TO_ID[float], _FLOAT_NUMBER),
        (TYPE_TO_ID[int], _INT_NUMBER),
        (TYPE_TO_ID[str], _STR_NUMBER),
        (TYPE_TO_ID[type(None)], _NAN_NUMBER),
    ))
    raw_counts = table_of_types.apply(pd.value_counts, dropna=False)
    # Sets all type ids, absent ones are filled with zeros
    all_ids_counts = raw_counts.reindex(counts_index_mapper.keys(), copy=False).replace(np.nan, 0)
    # Renames all type ids to strs
    types_counts = all_ids_counts.rename(index=counts_index_mapper, copy=False).astype(int)

    # Row with positions of nans in each column
    is_nan_cell = table_of_types == TYPE_TO_ID[type(None)]
    nans_ids = is_nan_cell.apply(np.flatnonzero, result_type='reduce').to_frame(_NAN_IDS).T

    # Combine all dataframes
    return pd.concat([uniques, types_counts, nans_ids])
396

397

398
def _find_mixed_types_columns(columns_info: pd.DataFrame) -> pd.DataFrame:
    """ Keep only the columns whose list of unique types holds more than one type """
    types_amount = columns_info.loc[_TYPES].apply(len)
    is_mixed = types_amount > 1
    return columns_info.loc[:, is_mixed]
402

403

404
def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> pd.DataFrame:
405
    cols_have_any = frame.loc[rows_to_select].any()
406
    return frame.loc[:, cols_have_any]
407

408

409
def apply_type_transformation(table: np.ndarray, col_type_ids: Sequence, log: LoggerAdapter):
    """
    Cast every column of the table into the type defined by its type id.
    Used on predict stage, when column types were already determined during fit

    :param table: tabular array to convert
    :param col_type_ids: type ids of the table columns
    :param log: logger for conversion warnings
    """
    # int and str columns keep their type, bool, nan and float ones are cast into float
    id_to_python_type = {TYPE_TO_ID[kept_type]: kept_type for kept_type in [int, str]}
    id_to_python_type.update({TYPE_TO_ID[float_like]: float for float_like in [bool, type(None), float]})
    types_sr = pd.Series(col_type_ids).map(id_to_python_type)

    table_df = pd.DataFrame(table, copy=False)
    converted_df = table_df.apply(_convert_predict_column_into_desired_type, types_sr=types_sr, log=log)
    return converted_df.to_numpy()
422

423

424
def convert_num_column_into_string_array(numerical_column: pd.Series) -> pd.Series:
    """ Convert non-nan values of numerical pandas column into strings

    :param numerical_column: column with numerical values and, possibly, nans
    :return: new column of object dtype with the same index: numbers are replaced
        with their string representation, nans are kept as is
    """
    # Work with a copy of object dtype: the source column stays untouched and
    # strings are never written into a column of numerical dtype
    converted_column = numerical_column.astype(object)
    # convert only non-nans values
    is_true_num = numerical_column.notna()
    converted_column.loc[is_true_num] = numerical_column.loc[is_true_num].astype(str)
    return converted_column
430

431

432
def _obtain_new_column_type(column_info: pd.Series):
    """ Suggest int or float type based on the presence of nan and float values """
    has_floats_or_nans = column_info[[_FLOAT_NUMBER, _NAN_NUMBER]].any()
    # A single float or nan is enough to convert all elements into float,
    # otherwise numerical values may be stored as integers
    return float if has_floats_or_nans else int
439

440

441
def _convert_predict_column_into_desired_type(current_column: pd.Series, types_sr: pd.Series, log: LoggerAdapter):
442
    current_type = types_sr.loc[current_column.name]
443
    try:
444
        converted_column = current_column.astype(current_type)
445
        if current_type is str:
446
            has_comma_and_dot = np.isin(['.', ','], current_column).all()
447
            if has_comma_and_dot:
448
                # Most likely case: '20,000' must be converted into '20.000'
449
                warning = f'Column {current_column.name} contains both "." and ",". Standardize it.'
450
                log.warning(warning)
451
    except ValueError:
452
        converted_column = current_column.apply(_process_predict_column_values_one_by_one, current_type=current_type)
453
    return converted_column
454

455

456
def _generate_list_with_types(columns_types_info: pd.DataFrame,
                              converted_columns: Dict[int, Optional[int]]) -> np.ndarray:
    """ Build array with one type id for every remained column

    :param columns_types_info: frame with initial column types
    :param converted_columns: dictionary with transformed column types,
        columns mapped to None are skipped
    """
    nan_type_id = TYPE_TO_ID[type(None)]
    remained_type_ids = []

    for column_id, initial_type_ids in columns_types_info.loc[_TYPES].items():
        types_amount = len(initial_type_ids)

        if types_amount == 1:
            # The only type of the column is kept
            remained_type_ids.append(initial_type_ids[0])
            continue

        if types_amount == 2 and nan_type_id in initial_type_ids:
            # One type and nans - take the type different from nan
            not_nan_type_ids = [type_id for type_id in initial_type_ids if type_id != nan_type_id]
            remained_type_ids.append(not_nan_type_ids[0])
            continue

        if TYPE_TO_ID[str] not in initial_type_ids:
            # Mixed-types with float and integer
            remained_type_ids.append(TYPE_TO_ID[float])
            continue

        # Mixed-types column with string - use the type chosen during conversion
        converted_type_id = converted_columns[column_id]
        if converted_type_id is not None:
            remained_type_ids.append(converted_type_id)

    return np.array(remained_type_ids)
484

485

486
def _process_predict_column_values_one_by_one(value, current_type: type):
487
    """ Process column values one by one and try to convert them into desirable type.
488
    If not successful replace with np.nan """
489
    new_value = np.nan
490
    try:
491
        new_value = current_type(value)
492
    except ValueError:
493
        if isinstance(value, str) and ('.' in value or ',' in value):
494
            value = value.replace(',', '.')
495
            try:
496
                # Since "10.6" can not be converted to 10 straightforward using int()
497
                if current_type is int:
498
                    new_value = int(float(value))
499
            except ValueError:
500
                pass
501
    return new_value
502
FEDOT

Использование cookies