4
from sklearn import datasets
7
def classification_dataset(samples_amount: int, features_amount: int, classes_amount: int,
                           features_options: Dict, noise_fraction: float = 0.1,
                           full_shuffle: bool = True, weights: list = None):
    """Generates a random dataset for ``n-class`` classification problem
    using scikit-learn API.

    Args:
        samples_amount: Total amount of samples in the resulted dataset.
        features_amount: Total amount of features per sample.
        classes_amount: The amount of classes in the dataset.
        features_options: The dictionary containing features options in key-value format

            .. details:: possible ``features_options`` variants:

                - ``informative`` -> the amount of informative features
                - ``redundant`` -> the amount of redundant features
                - ``repeated`` -> the amount of features that repeat the informative features
                - ``clusters_per_class`` -> the amount of clusters for each class

        noise_fraction: the fraction of noisy labels in the dataset
        full_shuffle: if true then all features and samples will be shuffled
        weights: The proportions of samples assigned to each class. If None, then classes are balanced

    Returns:
        array: features and target as numpy-arrays

    Raises:
        KeyError: if ``features_options`` lacks one of the keys listed above.
    """

    # Every argument of this wrapper is forwarded explicitly so that none of
    # them (in particular ``weights`` and ``full_shuffle``) is silently ignored.
    features, target = datasets.make_classification(
        n_samples=samples_amount,
        n_features=features_amount,
        n_informative=features_options['informative'],
        n_redundant=features_options['redundant'],
        n_repeated=features_options['repeated'],
        n_classes=classes_amount,
        n_clusters_per_class=features_options['clusters_per_class'],
        weights=weights,
        flip_y=noise_fraction,
        shuffle=full_shuffle,
    )

    return features, target
47
def regression_dataset(samples_amount: int, features_amount: int, features_options: Dict,
                       n_targets: int, noise: float = 0.0, shuffle: bool = True):
    """Generates a random dataset for regression problem using scikit-learn API.

    Args:
        samples_amount: total amount of samples in the resulted dataset
        features_amount: total amount of features per sample
        features_options: the dictionary containing features options in key-value format

            .. details:: possible ``features_options`` variants:

                - ``informative`` -> the amount of informative features
                - ``bias`` -> bias term in the underlying linear model

        n_targets: the amount of target variables
        noise: the standard deviation of the gaussian noise applied to the output
        shuffle: if ``True`` then all features and samples will be shuffled

    Returns:
        array: features and target as numpy-arrays

    Raises:
        KeyError: if ``features_options`` lacks ``informative`` or ``bias``.
    """

    # ``n_targets``, ``noise`` and ``shuffle`` are part of the public signature
    # and therefore have to reach scikit-learn; otherwise they would be no-ops.
    features, target = datasets.make_regression(
        n_samples=samples_amount,
        n_features=features_amount,
        n_informative=features_options['informative'],
        bias=features_options['bias'],
        n_targets=n_targets,
        noise=noise,
        shuffle=shuffle,
    )

    return features, target
79
def gauss_quantiles_dataset(samples_amount: int, features_amount: int,
                            classes_amount: int, full_shuffle=True, **kwargs):
    """Generates a random dataset for n-class classification problem
    based on multi-dimensional gaussian distribution quantiles
    using scikit-learn API.

    Args:
        samples_amount: total amount of samples in the resulted dataset
        features_amount: total amount of features per sample
        classes_amount: the amount of classes in the dataset
        full_shuffle: if ``True`` then all features and samples will be shuffled
        kwargs: Optional['gauss_params'] mean and covariance values of the distribution,
            given as a ``(mean, cov)`` pair

    Returns:
        array: features and target as numpy-arrays
    """

    # Fall back to scikit-learn's own defaults (mean=None, cov=1.0) when the
    # caller does not supply ``gauss_params``, so ``mean``/``cov`` are always bound.
    mean, cov = kwargs.get('gauss_params', (None, 1.0))

    features, target = datasets.make_gaussian_quantiles(
        mean=mean,
        cov=cov,
        n_samples=samples_amount,
        n_features=features_amount,
        n_classes=classes_amount,
        shuffle=full_shuffle,
    )

    return features, target
109
def generate_synthetic_data(length: int = 2200, periods: int = 5, *, noise_std: float = 0.1):
    """The function generates a synthetic one-dimensional array without omissions

    The result is a sine wave sampled on ``length`` evenly spaced points of
    ``[-periods * pi, periods * pi]`` with additive gaussian noise.

    Args:
        length: the length of the array
        periods: the number of periods in the sine wave
        noise_std: the standard deviation of the zero-mean gaussian noise added
            to the sine wave (``0.0`` yields the clean sine wave)

    Returns:
        array: an array without gaps
    """

    sine_grid = np.linspace(-periods * np.pi, periods * np.pi, length)
    sinusoidal_data = np.sin(sine_grid)

    # The global NumPy random state is used on purpose, so callers that
    # seed it with ``np.random.seed`` keep getting reproducible output.
    random_noise = np.random.normal(loc=0.0, scale=noise_std, size=length)

    # Combining a sine wave and random noise
    synthetic_data = sinusoidal_data + random_noise
    return synthetic_data