FEDOT

Форк
0
/
synth_dataset_generator.py 
126 строк · 5.5 Кб
1
from typing import Dict
2

3
import numpy as np
4
from sklearn import datasets
5

6

7
def classification_dataset(samples_amount: int, features_amount: int, classes_amount: int,
                           features_options: Dict, noise_fraction: float = 0.1,
                           full_shuffle: bool = True, weights: list = None):
    """Build a random ``n-class`` classification dataset via
    ``sklearn.datasets.make_classification``.

    Args:
        samples_amount: number of samples to generate (passed as ``n_samples``)
        features_amount: number of features per sample (passed as ``n_features``)
        classes_amount: number of classes (passed as ``n_classes``)
        features_options: dictionary describing the feature composition; every key
            listed below is required and a missing one raises ``KeyError``

            .. details:: expected ``features_options`` keys:

                - ``informative`` -> forwarded as ``n_informative``
                - ``redundant`` -> forwarded as ``n_redundant``
                - ``repeated`` -> forwarded as ``n_repeated``
                - ``clusters_per_class`` -> forwarded as ``n_clusters_per_class``

        noise_fraction: fraction of noisy labels (passed as ``flip_y``)
        full_shuffle: whether samples and features are shuffled (passed as ``shuffle``)
        weights: per-class sample proportions (passed as ``weights``);
            ``None`` is forwarded unchanged

    Returns:
        tuple: ``(features, target)`` exactly as produced by ``make_classification``
    """

    # Keys are read in the same order as the keyword arguments are listed,
    # so the first missing option is the one reported by the KeyError.
    generator_kwargs = dict(
        n_samples=samples_amount,
        n_features=features_amount,
        n_informative=features_options['informative'],
        n_redundant=features_options['redundant'],
        n_repeated=features_options['repeated'],
        n_classes=classes_amount,
        n_clusters_per_class=features_options['clusters_per_class'],
        weights=weights,
        flip_y=noise_fraction,
        shuffle=full_shuffle,
    )

    x_data, y_data = datasets.make_classification(**generator_kwargs)
    return x_data, y_data
45

46

47
def regression_dataset(samples_amount: int, features_amount: int, features_options: Dict,
                       n_targets: int, noise: float = 0.0, shuffle: bool = True):
    """Build a random regression dataset via ``sklearn.datasets.make_regression``.

    Args:
        samples_amount: number of samples to generate (passed as ``n_samples``)
        features_amount: number of features per sample (passed as ``n_features``)
        features_options: dictionary describing the feature composition; both keys
            listed below are required and a missing one raises ``KeyError``

            .. details:: expected ``features_options`` keys:

                - ``informative`` -> forwarded as ``n_informative``
                - ``bias`` -> forwarded as ``bias``

        n_targets: number of target variables (passed as ``n_targets``)
        noise: standard deviation of the gaussian noise applied to the output
            (passed as ``noise``)
        shuffle: whether samples and features are shuffled (passed as ``shuffle``)

    Returns:
        tuple: ``(features, target)`` exactly as produced by ``make_regression``
    """

    generator_kwargs = dict(
        n_samples=samples_amount,
        n_features=features_amount,
        n_informative=features_options['informative'],
        bias=features_options['bias'],
        n_targets=n_targets,
        noise=noise,
        shuffle=shuffle,
    )

    x_data, y_data = datasets.make_regression(**generator_kwargs)
    return x_data, y_data
77

78

79
def gauss_quantiles_dataset(samples_amount: int, features_amount: int,
                            classes_amount: int, full_shuffle=True, **kwargs):
    """Build a random n-class classification dataset via
    ``sklearn.datasets.make_gaussian_quantiles``.

    Args:
        samples_amount: number of samples to generate (passed as ``n_samples``)
        features_amount: number of features per sample (passed as ``n_features``)
        classes_amount: number of classes (passed as ``n_classes``)
        full_shuffle: whether the samples are shuffled (passed as ``shuffle``)
        kwargs: may contain ``gauss_params``, a two-item iterable unpacked as
            ``(mean, cov)``; when the key is absent ``mean=None`` and ``cov=1.``
            are used

    Returns:
        tuple: ``(features, target)`` exactly as produced by ``make_gaussian_quantiles``
    """

    # Fall back to the (mean, cov) pair used when the caller gives no parameters.
    distribution_mean, distribution_cov = kwargs.get('gauss_params', (None, 1.))

    x_data, y_data = datasets.make_gaussian_quantiles(
        n_samples=samples_amount,
        n_features=features_amount,
        n_classes=classes_amount,
        shuffle=full_shuffle,
        mean=distribution_mean,
        cov=distribution_cov,
    )
    return x_data, y_data
107

108

109
def generate_synthetic_data(length: int = 2200, periods: int = 5):
    """Produce a noisy one-dimensional sine wave with no missing values.

    Args:
        length: number of points in the resulting array
        periods: number of sine periods covered; the argument grid runs from
            ``-periods * pi`` to ``periods * pi``

    Returns:
        array: sine values plus gaussian noise (mean ``0.0``, std ``0.1``)
    """

    half_span = periods * np.pi
    grid = np.linspace(-half_span, half_span, length)
    wave = np.sin(grid)

    # Noise is drawn from numpy's global random state, one value per point.
    noise = np.random.normal(loc=0.0, scale=0.1, size=length)

    return wave + noise
127

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.