FEDOT
501 строка · 23.8 Кб
1from __future__ import annotations
2
3from collections.abc import Sequence
4from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
5
6import numpy as np
7import pandas as pd
8from golem.core.log import LoggerAdapter, default_log
9
10from fedot.core.repository.tasks import Task, TaskTypesEnum
11
12if TYPE_CHECKING:
13from fedot.core.data.data import InputData
14
# Python types a table cell is allowed to have. The position of a type in this tuple is its type id.
_convertable_types = (bool, float, int, str, type(None))  # preserve lexicographical order
_type_ids = range(len(_convertable_types))

# Mapping "python type -> integer type id"
TYPE_TO_ID = dict(zip(_convertable_types, _type_ids))

# Row labels of the per-column information frame built by define_column_types
_TYPES = 'types'
_FLOAT_NUMBER = 'float_number'
_INT_NUMBER = 'int_number'
_STR_NUMBER = 'str_number'
_NAN_NUMBER = 'nan_number'
_NAN_IDS = 'nan_ids'

# NOTE(review): presumably a string placeholder for missing values; it is not used in this part of the file
FEDOT_STR_NAN = 'fedot_nan'
# A numerical feature column with more than 2 but fewer than 13 unique values is converted into a string
# (categorical) column; other numerical columns stay numerical
CATEGORICAL_MAX_UNIQUE_TH = 13
# A string column must be removed if its conversion failed rate lies between the two constants below
# (bottom <= rate < top), because then the column holds approximately the same number of truly string
# values and ints/floats
ACCEPTABLE_CONVERSION_FAILED_RATE_BOTTOM = 0.4
ACCEPTABLE_CONVERSION_FAILED_RATE_TOP = 0.65
34
35
class TableTypesCorrector:
    """
    Class for checking types in input data. Also performs conversion for columns with types conflicts.

    The object is stateful: ``convert_data_for_fit`` collects information about column types and about
    the conversions that were applied, ``convert_data_for_predict`` reuses that information.
    """

    def __init__(self):
        # Threshold to convert numerical into categorical column
        self.categorical_max_uniques_th = CATEGORICAL_MAX_UNIQUE_TH

        # Bounds for the share of values in a string column that failed conversion into a number
        self.acceptable_failed_rate_bottom = ACCEPTABLE_CONVERSION_FAILED_RATE_BOTTOM
        self.acceptable_failed_rate_top = ACCEPTABLE_CONVERSION_FAILED_RATE_TOP

        # Per-column types information in the format produced by define_column_types
        self.features_columns_info = pd.DataFrame()
        self.target_columns_info = pd.DataFrame()

        # Dictionaries "column id -> new type id" for columns converted during fitting.
        # For features the value None means that the conversion failed and the column must be removed
        self.features_converted_columns = {}
        self.target_converted_columns = {}

        # Columns to delete due to types conflicts
        self.columns_to_del = []
        # Column ids for transformation due to number of unique values
        self.numerical_into_str = []
        self.categorical_into_float = []

        # Indices of columns with failed string into numerical transformation (keys only, values are None)
        self.string_columns_transformation_failed = {}

        # Does the target column contain non-numerical cells during conversion
        self.target_converting_has_errors = False

        # Lists with column types for converting calculated on source input data
        self.feature_type_ids = None
        self.target_type_ids = None
        self.log = default_log(self)

    def convert_data_for_fit(self, data: InputData):
        """ If column contain several data types - perform correction procedure

        :param data: input data, its features, target and supplementary data are updated
        :return: the same ``data`` object
        """
        # Convert features to have an ability to insert str into float table or vice versa
        data.features = data.features.astype(object)

        # Determine types for each column in features and target if it is necessary
        self.features_columns_info = define_column_types(data.features)
        self.target_columns_info = define_column_types(data.target)

        # Correct types in features table
        data.features = self.feature_types_converting(features=data.features)
        # Remain only correct columns
        data.features = self.remove_incorrect_features(data.features, self.features_converted_columns)

        # And in target(s)
        data.target = self.target_types_converting(target=data.target, task=data.task)
        data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features,
                                                                              target=data.target,
                                                                              task=data.task)

        # Launch conversion of string features, which mostly consist of numbers, into float
        self._into_numeric_features_transformation_for_fit(data)
        # Launch conversion float and integer features into categorical
        self._into_categorical_features_transformation_for_fit(data)
        # Save info about features and target types
        self.feature_type_ids = data.supplementary_data.col_type_ids['features'].copy()
        self.target_type_ids = data.supplementary_data.col_type_ids.get(
            'target', np.empty((self.feature_type_ids.shape[0], 1), dtype=float)
        ).copy()

        self._retain_columns_info_without_types_conflicts(data)
        return data

    def convert_data_for_predict(self, data: InputData):
        """ Prepare data for predict stage. Include only column types transformation

        :param data: input data, its features, target and supplementary data are updated
        :return: the same ``data`` object
        """
        # Ordering is important because after removing incorrect features - indices are obsolete
        data.features = data.features.astype(object)
        data.features = self.remove_incorrect_features(data.features, self.features_converted_columns)
        data.features = apply_type_transformation(data.features, self.feature_type_ids, self.log)
        if data.target is not None:
            data.target = apply_type_transformation(data.target, self.target_type_ids, self.log)
        data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features,
                                                                              target=data.target,
                                                                              task=data.task)

        # Convert column types
        self._into_numeric_features_transformation_for_predict(data)
        self._into_categorical_features_transformation_for_predict(data)
        self._retain_columns_info_without_types_conflicts(data)
        return data

    def remove_incorrect_features(self, table: np.ndarray, converted_columns: dict):
        """
        Remove from the table columns whose conflicts with types were not resolved

        The ids of removed columns are also stored in ``self.columns_to_del``.

        :param table: tabular dataset based on which new dataset will be generated
        :param converted_columns: dictionary "column id -> new type id", columns with value None are removed
        :return: new table without the removed columns
        """
        self.columns_to_del = [col_id for col_id, new_type_id in converted_columns.items() if new_type_id is None]
        table = np.delete(table, self.columns_to_del, 1)
        return table

    def feature_types_converting(self, features: np.ndarray) -> np.ndarray:
        """ Convert all elements in the data in every feature column into one type

        Only columns with several types, which contain strings or floats, are processed.
        The conversion is performed in-place.

        :param features: tabular features array
        :return: the same ``features`` array
        """
        mixed_types_columns = _find_mixed_types_columns(self.features_columns_info)
        cols_with_strings_or_floats = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER, _FLOAT_NUMBER])
        # The function is called once per column, results are written into ``features`` and ``self``
        cols_with_strings_or_floats.apply(self._convert_feature_into_one_type, features=features)

        return features

    def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray:
        """ Convert all elements in every target column into one type

        Only columns with several types, which contain strings, are processed.
        The conversion is performed in-place.

        :param target: tabular target array
        :param task: task to solve
        :return: the same ``target`` array
        """
        mixed_types_columns = _find_mixed_types_columns(self.target_columns_info)
        cols_with_strings = _select_from_rows_if_any(mixed_types_columns, [_STR_NUMBER])
        cols_with_strings.apply(self._convert_target_into_one_type, target=target, task=task)

        return target

    def prepare_column_types_info(self, predictors: np.ndarray, target: np.ndarray = None,
                                  task: Task = None) -> dict:
        """ Prepare information about columns in a form of dictionary

        Dictionary always has key 'features'. Key 'target' is added only if target is passed
        and the task is not time series forecasting.

        :param predictors: tabular features array
        :param target: tabular target array
        :param task: task to solve
        """
        # ``task`` is optional in the signature, so it must not be dereferenced unconditionally
        is_ts_forecasting = task is not None and task.task_type is TaskTypesEnum.ts_forecasting

        if self.features_columns_info.empty:
            # Information about column types is empty - there is a need to launch algorithm to collect info
            self.features_columns_info = define_column_types(predictors)
            predictors = self.feature_types_converting(features=predictors)
        if self.target_columns_info.empty and target is not None and not is_ts_forecasting:
            self.target_columns_info = define_column_types(target)
            target = self.target_types_converting(target=target, task=task)

        feature_type_ids = _generate_list_with_types(self.features_columns_info, self.features_converted_columns)
        self._check_columns_vs_types_number(predictors, feature_type_ids)

        if target is None or is_ts_forecasting:
            return {'features': feature_type_ids}
        else:
            target_type_ids = _generate_list_with_types(self.target_columns_info, self.target_converted_columns)
            self._check_columns_vs_types_number(target, target_type_ids)
            return {'features': feature_type_ids, 'target': target_type_ids}

    def _retain_columns_info_without_types_conflicts(self, data: InputData):
        """ Update information in supplementary info - retain info only about remained columns.
        Such columns have no conflicts with types converting.
        """
        if self.string_columns_transformation_failed:
            self.log.warning(f'Columns with indices {self.string_columns_transformation_failed} were '
                             f'removed during mixed types column converting due to conflicts.')

            data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed)

            data.supplementary_data.col_type_ids['features'] = np.delete(
                data.supplementary_data.col_type_ids['features'],
                list(self.string_columns_transformation_failed)
            )

    def _check_columns_vs_types_number(self, table: np.ndarray, col_type_ids: Sequence):
        """ Log a warning if the number of table columns differs from the number of column types """
        # Check if columns number correct
        _, n_cols = table.shape
        if n_cols != len(col_type_ids):
            # There is an incorrect types calculation
            self.log.warning('Columns number and types numbers do not match.')

    @staticmethod
    def _remove_pseudo_str_values_from_str_column(data: InputData, columns: pd.Index):
        """ Removes from truly str column all pseudo str values

        Every cell that can be converted into float is replaced with nan (in-place).

        :param data: input data with features to process
        :param columns: ids of feature columns to process
        """
        for col_id in columns:
            for row_id, item in enumerate(data.features[:, col_id]):
                try:
                    float(item)
                except (TypeError, ValueError):
                    # ValueError - truly string value, TypeError - value like None. Both are not numeric
                    continue
                else:
                    # item is numeric, remove its value
                    data.features[row_id, col_id] = np.nan

    def _convert_feature_into_one_type(self, column_info: pd.Series, features: np.ndarray):
        """ Determine new type for current feature column based on the string ratio. And then convert column into it.

        The new type id (or None if the conversion failed) is stored in ``self.features_converted_columns``.

        :param column_info: information about types in the column, its name is the index of column in dataset
        :param features: tabular features array, the column is converted in-place
        """
        new_type_id = None
        if len(column_info[_TYPES]) == 2 and TYPE_TO_ID[type(None)] in column_info[_TYPES]:
            # Column contain only one data type and nans
            non_nan_type_lst = [x for x in column_info[_TYPES] if x != TYPE_TO_ID[type(None)]]
            new_type_id = non_nan_type_lst[0]
        else:
            string_objects_number = column_info[_STR_NUMBER]
            all_elements_number = string_objects_number + column_info[[_INT_NUMBER, _FLOAT_NUMBER]].sum()
            string_ratio = string_objects_number / all_elements_number

            # At least one string in the column - convert the whole column into string
            if string_ratio > 0:
                suggested_type = str
            else:
                suggested_type = _obtain_new_column_type(column_info)

            try:
                converted = features[:, column_info.name].astype(suggested_type)
                # If there were nans in the column - paste nan
                if column_info[_NAN_NUMBER]:
                    converted = converted.astype(object)
                    converted[column_info[_NAN_IDS]] = np.nan
                    del column_info[_NAN_IDS]
                features[:, column_info.name] = converted
            except ValueError:
                # Cannot convert string objects into int or float (for example 'a' into int)
                prefix = (f'Feature column with index {column_info.name} contains '
                          f'the following data types: {column_info[_TYPES]}.')
                self.log.warning(f'{prefix} String cannot be converted into {suggested_type}. Drop column.')
            else:
                new_type_id = TYPE_TO_ID[suggested_type]
        self.features_converted_columns[column_info.name] = new_type_id

    def _convert_target_into_one_type(self, column_info: pd.Series, target: np.ndarray,
                                      task: Task) -> None:
        """ Convert target columns into one type based on column proportions of object and task

        The new type id is stored in ``self.target_converted_columns``.

        :param column_info: information about types in the column, its name is the index of column in target
        :param target: tabular target array, the column is converted in-place
        :param task: task to solve
        """
        if task.task_type is TaskTypesEnum.classification:
            # For classification labels are string if at least one element is a string
            suggested_type = str
        else:
            suggested_type = _obtain_new_column_type(column_info)
        self.target_converted_columns[column_info.name] = TYPE_TO_ID[suggested_type]

        mixed_column = target[:, column_info.name]
        try:
            target[:, column_info.name] = mixed_column.astype(suggested_type)
        except ValueError:
            # Cannot convert string objects into int or float (for example 'a' into int)
            converted_column = pd.to_numeric(mixed_column, errors='coerce')

            prefix = (f'Target column with index {column_info.name} contains '
                      f'the following data types: {column_info[_TYPES]}.')
            log_message = f'{prefix} String cannot be converted into {suggested_type}. Set unconverted values to NaN.'
            self.log.debug(log_message)
            self.target_converting_has_errors = True
            target[:, column_info.name] = converted_column

    def _into_categorical_features_transformation_for_fit(self, data: InputData):
        """
        Perform automated categorical features determination. If feature column
        contains int or float values with few unique values (more than 2 and less than
        ``self.categorical_max_uniques_th``), then convert it into string column
        """
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])
        numeric_type_ids = np.flatnonzero(is_numeric_type)
        num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
        nuniques = num_df.nunique(dropna=True)

        # reduce dataframe to include only categorical features
        num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)]
        cat_col_ids = num_df.columns
        # Convert into string
        data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy()
        # Columns need to be transformed into categorical (string) ones
        self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str))
        # Update information about column types (in-place)
        feature_type_ids[cat_col_ids] = TYPE_TO_ID[str]

    def _into_categorical_features_transformation_for_predict(self, data: InputData):
        """ Apply conversion into categorical string column for every signed column """
        # Get numerical columns
        num_df = pd.DataFrame(data.features[:, self.numerical_into_str], columns=self.numerical_into_str)

        # Convert and apply categorical transformation
        data.features[:, self.numerical_into_str] = num_df.apply(convert_num_column_into_string_array).to_numpy()

        # Update information about column types (in-place)
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        feature_type_ids[self.numerical_into_str] = TYPE_TO_ID[str]

    def _into_numeric_features_transformation_for_fit(self, data: InputData):
        """
        Automatically determine categorical features which should be converted into float

        Depending on the share of non-nan values that cannot be converted into a number, a string column is
        converted into float, cleaned from the numeric values or marked for removal.
        """
        is_str_type = data.supplementary_data.col_type_ids['features'] == TYPE_TO_ID[str]
        str_col_ids = np.flatnonzero(is_str_type)
        str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids)
        orig_nans_cnt = str_cols_df.isna().sum(axis=0)

        converted_str_cols_df = str_cols_df.apply(pd.to_numeric, errors='coerce')
        conv_nans_cnt = converted_str_cols_df.isna().sum(axis=0)

        # New nans are the values which cannot be converted into a number
        failed_objects_cnt = conv_nans_cnt - orig_nans_cnt
        non_nan_all_objects_cnt = len(data.features) - orig_nans_cnt
        failed_ratio = failed_objects_cnt / non_nan_all_objects_cnt

        # Check if the majority of objects can be converted into numerical
        is_numeric = failed_ratio < self.acceptable_failed_rate_bottom
        is_numeric_ids = is_numeric[is_numeric].index
        data.features[:, is_numeric_ids] = converted_str_cols_df[is_numeric_ids].to_numpy()
        self.categorical_into_float.extend(is_numeric_ids.difference(self.categorical_into_float))

        # Update information about column types (in-place)
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        feature_type_ids[is_numeric_ids] = TYPE_TO_ID[float]

        # The columns consist mostly of truly str values and has a few ints/floats in it
        is_mixed = (self.acceptable_failed_rate_top <= failed_ratio) & (failed_ratio != 1)
        self._remove_pseudo_str_values_from_str_column(data, is_mixed[is_mixed].index)

        # If column contains a lot of '?' or 'x' as nans equivalents
        # add it to remove list
        is_of_mistakes = (
            (self.acceptable_failed_rate_bottom <= failed_ratio) &
            (failed_ratio < self.acceptable_failed_rate_top))
        self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index))

    def _into_numeric_features_transformation_for_predict(self, data: InputData):
        """ Apply conversion into float string column for every signed column """
        # Columns marked for removal are not converted
        str_col_ids = np.setdiff1d(
            self.categorical_into_float,
            list(self.string_columns_transformation_failed)
        ).astype(int)
        str_cols_df = pd.DataFrame(data.features[:, str_col_ids], columns=str_col_ids)
        data.features[:, str_col_ids] = str_cols_df.apply(pd.to_numeric, errors='coerce').to_numpy()

        # Update information about column types (in-place)
        feature_type_ids = data.supplementary_data.col_type_ids['features']
        feature_type_ids[str_col_ids] = TYPE_TO_ID[float]
359
360
def define_column_types(table: Optional[np.ndarray]) -> pd.DataFrame:
    """ Collect information about cell types for every column of the table.

    The result has one column per table column and the following rows: unique type ids,
    amounts of float, int, str and nan cells, and indices of nan cells.
    """
    nan_type_id = TYPE_TO_ID[type(None)]

    def _cell_type_id(cell):
        return TYPE_TO_ID[type(cell)]

    # Replace every cell with the id of its type (nan cells are treated as None)
    type_ids_table = pd.DataFrame(table, copy=True).replace({np.nan: None}).applymap(_cell_type_id)

    # Row with unique type ids per column
    unique_types_row = type_ids_table.apply(pd.unique, result_type='reduce').to_frame(_TYPES).T

    # Rows with amount of cells of each counted type per column
    type_id_to_row_name = {
        TYPE_TO_ID[float]: _FLOAT_NUMBER,
        TYPE_TO_ID[int]: _INT_NUMBER,
        TYPE_TO_ID[str]: _STR_NUMBER,
        nan_type_id: _NAN_NUMBER
    }
    counts_rows = type_ids_table.apply(pd.value_counts, dropna=False)
    # Keep rows only for the counted type ids, absent ones are filled with zeros
    counts_rows = counts_rows.reindex(type_id_to_row_name.keys(), copy=False)
    counts_rows = counts_rows.replace(np.nan, 0)
    # Type ids in the index are replaced with string names
    counts_rows = counts_rows.rename(index=type_id_to_row_name, copy=False)
    counts_rows = counts_rows.astype(int)

    # Row with indices of nan cells per column
    is_nan_cell = type_ids_table == nan_type_id
    nan_ids_row = is_nan_cell.apply(np.flatnonzero, result_type='reduce').to_frame(_NAN_IDS).T

    return pd.concat([unique_types_row, counts_rows, nan_ids_row])
396
397
def _find_mixed_types_columns(columns_info: pd.DataFrame) -> pd.DataFrame:
    """ Keep only the columns whose list of unique types has more than one element """
    types_per_column = columns_info.loc[_TYPES]
    is_mixed = types_per_column.apply(len) > 1
    return columns_info.loc[:, is_mixed]
402
403
def _select_from_rows_if_any(frame: pd.DataFrame, rows_to_select: List[str]) -> pd.DataFrame:
    """ Keep only the columns which have at least one truthy value in the given rows """
    checked_rows = frame.loc[rows_to_select]
    keep_column = checked_rows.any()
    return frame.loc[:, keep_column]
407
408
def apply_type_transformation(table: np.ndarray, col_type_ids: Sequence, log: LoggerAdapter):
    """
    Convert every column of the table into the type defined by its type id.
    Used on predict stage, when column types were already determined
    during fit
    """
    # int and str columns keep their types, bool, nan and float columns are converted into float
    type_id_to_type = {TYPE_TO_ID[int]: int, TYPE_TO_ID[str]: str}
    type_id_to_type.update({TYPE_TO_ID[kind]: float for kind in (bool, type(None), float)})

    desired_types = pd.Series(col_type_ids).map(type_id_to_type)
    frame = pd.DataFrame(table, copy=False)
    converted = frame.apply(_convert_predict_column_into_desired_type, types_sr=desired_types, log=log)
    return converted.to_numpy()
422
423
def convert_num_column_into_string_array(numerical_column: pd.Series) -> pd.Series:
    """ Convert all non-nan values of pandas column into strings, nans are kept as is

    A column with object dtype is modified in-place and returned. A column with any other dtype
    is not modified, a converted copy with object dtype is returned instead.

    :param numerical_column: column with numerical values
    :return: column with string values and nans
    """
    # convert only non-nans values
    true_nums = numerical_column[numerical_column.notna()]
    # Strings are built from the source dtype to keep their formatting
    string_values = true_nums.astype(str)
    if numerical_column.dtype != object:
        # Strings cannot be stored in a numerical column without changing its dtype,
        # so the dtype is changed explicitly instead of relying on implicit upcasting during assignment
        numerical_column = numerical_column.astype(object)
    numerical_column[true_nums.index] = string_values
    return numerical_column
430
431
def _obtain_new_column_type(column_info: pd.Series):
    """ Suggest int or float type for the column: float if it has float or nan values, int otherwise """
    # Integer type cannot be used if at least one float or nan value is in the column
    has_float_or_nan = column_info[[_FLOAT_NUMBER, _NAN_NUMBER]].any()
    return float if has_float_or_nan else int
439
440
def _convert_predict_column_into_desired_type(current_column: pd.Series, types_sr: pd.Series, log: LoggerAdapter):
    """ Convert the column into the type stored in ``types_sr`` under the column name.

    If the conversion of the whole column raises ValueError, then the values are converted one by one.
    """
    desired_type = types_sr.loc[current_column.name]
    try:
        result = current_column.astype(desired_type)
        # The check is True only when both '.' and ',' are among the column values
        if desired_type is str and np.isin(['.', ','], current_column).all():
            # Most likely case: '20,000' must be converted into '20.000'
            log.warning(f'Column {current_column.name} contains both "." and ",". Standardize it.')
    except ValueError:
        result = current_column.apply(_process_predict_column_values_one_by_one, current_type=desired_type)
    return result
454
455
def _generate_list_with_types(columns_types_info: pd.DataFrame,
                              converted_columns: Dict[int, Optional[int]]) -> np.ndarray:
    """ Build array with one type id per remained column

    :param columns_types_info: dataframe with initial column types
    :param converted_columns: dictionary with transformed column types
    """
    nan_type_id = TYPE_TO_ID[type(None)]
    str_type_id = TYPE_TO_ID[str]
    float_type_id = TYPE_TO_ID[float]

    remained_type_ids = []
    for column_id, type_ids in columns_types_info.loc[_TYPES].items():
        if len(type_ids) == 1:
            # The only type in the column
            remained_type_ids.append(type_ids[0])
            continue

        if len(type_ids) == 2 and nan_type_id in type_ids:
            # One type and nans - take the type
            not_nan_type_ids = [type_id for type_id in type_ids if type_id != nan_type_id]
            remained_type_ids.append(not_nan_type_ids[0])
            continue

        if str_type_id not in type_ids:
            # Mixed-types with float and integer
            remained_type_ids.append(float_type_id)
            continue

        # Mixed-types column with string - take the type chosen during conversion.
        # None means that the column is not kept, so no type is added for it
        converted_type_id = converted_columns[column_id]
        if converted_type_id is not None:
            remained_type_ids.append(converted_type_id)

    return np.array(remained_type_ids)
484
485
def _process_predict_column_values_one_by_one(value, current_type: type):
    """ Try to convert single value into desirable type.
    If not successful replace with np.nan

    Strings with ',' as decimal separator are also supported: '10,6' is converted into 10.6 for float
    and into 10 for int.

    :param value: value to convert
    :param current_type: desirable type of the value
    :return: converted value or np.nan
    """
    try:
        return current_type(value)
    except (TypeError, ValueError):
        # TypeError is raised for values like None, ValueError - for strings like '10.6' or 'a' with int
        pass

    if isinstance(value, str) and ('.' in value or ',' in value):
        try:
            number = float(value.replace(',', '.'))
            # Since "10.6" can not be converted to 10 straightforward using int()
            return int(number) if current_type is int else current_type(number)
        except (TypeError, ValueError, OverflowError):
            # OverflowError is raised by int() for infinite float
            pass
    return np.nan
502