This is page 2 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ ├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── 
adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── test_importance.py │ ├── test_learning_curves.py │ ├── test_metric.py │ ├── test_shap.py │ └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py ``` # Files -------------------------------------------------------------------------------- /supervised/preprocessing/label_binarizer.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | 4 | class LabelBinarizer(object): 5 | def __init__(self): 6 | self._new_columns = [] 7 | self._uniq_values = None 8 | self._old_column = None 9 | self._old_column_dtype = None 10 | 11 | def fit(self, X, column): 12 | self._old_column = column 13 | self._old_column_dtype = str(X[column].dtype) 14 | self._uniq_values = np.unique(X[column].values) 15 | # self._uniq_values = [str(u) for u in self._uniq_values] 16 | 17 | if len(self._uniq_values) == 2: 18 | self._new_columns.append(column + "_" + str(self._uniq_values[1])) 19 | else: 20 | for v in self._uniq_values: 21 | self._new_columns.append(column + "_" + str(v)) 22 | 23 | def transform(self, X, column): 
24 | if len(self._uniq_values) == 2: 25 | X[column + "_" + str(self._uniq_values[1])] = ( 26 | X[column] == self._uniq_values[1] 27 | ).astype(int) 28 | else: 29 | for v in self._uniq_values: 30 | X[column + "_" + str(v)] = (X[column] == v).astype(int) 31 | 32 | X.drop(column, axis=1, inplace=True) 33 | return X 34 | 35 | def inverse_transform(self, X): 36 | if self._old_column is None: 37 | return X 38 | 39 | old_col = (X[self._new_columns[0]] * 0).astype(self._old_column_dtype) 40 | 41 | for unique_value in self._uniq_values: 42 | new_col = f"{self._old_column}_{unique_value}" 43 | if new_col not in self._new_columns: 44 | old_col[:] = unique_value 45 | else: 46 | old_col[X[new_col] == 1] = unique_value 47 | 48 | X[self._old_column] = old_col 49 | X.drop(self._new_columns, axis=1, inplace=True) 50 | return X 51 | 52 | def to_json(self): 53 | self._uniq_values = [str(i) for i in list(self._uniq_values)] 54 | data_json = { 55 | "new_columns": list(self._new_columns), 56 | "unique_values": self._uniq_values, 57 | "old_column": self._old_column, 58 | "old_column_dtype": self._old_column_dtype, 59 | } 60 | 61 | if ( 62 | "True" in self._uniq_values 63 | and "False" in self._uniq_values 64 | and len(self._uniq_values) == 2 65 | ): 66 | self._uniq_values = [False, True] 67 | 68 | return data_json 69 | 70 | def from_json(self, data_json): 71 | self._new_columns = data_json.get("new_columns", None) 72 | self._uniq_values = data_json.get("unique_values", None) 73 | self._old_column = data_json.get("old_column", None) 74 | self._old_column_dtype = data_json.get("old_column_dtype", None) 75 | 76 | if ( 77 | "True" in self._uniq_values 78 | and "False" in self._uniq_values 79 | and len(self._uniq_values) == 2 80 | ): 81 | self._uniq_values = [False, True] 82 | ``` -------------------------------------------------------------------------------- /tests/data/iris_classes_missing_values_missing_target.csv: -------------------------------------------------------------------------------- ``` 1 | feature_1,feature_2,feature_3,feature_4,class 2 | 5.1,3.5,1.4,0.2,1 3 | 4.9,3.0,1.4,0.2,1 4 | 4.7,3.2,1.3,,1 5 | 4.6,3.1,1.5,,1 6 | 5.0,3.6,1.4,0.2,1 7 | ,3.9,1.7,0.4,1 8 | 4.6,3.4,1.4,0.3,1 9 | 5.0,3.4,1.5,0.2,1 10 | 4.4,,1.4,0.2,1 11 | 4.9,3.1,1.5,0.1,1 12 | 5.4,3.7,1.5,0.2,1 13 | 4.8,3.4,,0.2,1 14 | 4.8,3.0,1.4,0.1,1 15 | 4.3,3.0,1.1,0.1,1 16 | 5.8,4.0,1.2,0.2,1 17 | 5.7,4.4,1.5,0.4,1 18 | 5.4,3.9,1.3,0.4,1 19 | 5.1,3.5,1.4,0.3, 20 | 5.7,3.8,1.7,0.3,1 21 | 5.1,3.8,1.5,0.3,1 22 | 5.4,3.4,1.7,0.2,1 23 | 5.1,3.7,1.5,0.4,1 24 | 4.6,3.6,1.0,0.2,1 25 | 5.1,3.3,1.7,0.5,1 26 | 4.8,3.4,1.9,0.2,1 27 | 5.0,3.0,1.6,0.2,1 28 | 5.0,3.4,1.6,0.4,1 29 | 5.2,3.5,1.5,0.2,1 30 | 5.2,3.4,1.4,0.2,1 31 | 4.7,3.2,1.6,0.2,1 32 | 4.8,3.1,1.6,0.2,1 33 | 5.4,3.4,1.5,0.4,1 34 | 5.2,4.1,1.5,0.1,1 35 | 5.5,4.2,1.4,0.2,1 36 | 4.9,3.1,1.5,0.1,1 37 | 5.0,3.2,1.2,0.2,1 38 | 5.5,3.5,1.3,0.2,1 39 | 4.9,3.1,1.5,0.1,1 40 | 4.4,3.0,1.3,0.2,1 41 | 5.1,3.4,1.5,0.2,1 42 | 5.0,3.5,1.3,0.3,1 43 | 4.5,2.3,1.3,0.3,1 44 | 4.4,3.2,1.3,0.2,1 45 | 5.0,3.5,1.6,0.6,1 46 | 5.1,3.8,1.9,0.4,1 47 | 4.8,3.0,1.4,0.3,1 48 | 5.1,3.8,1.6,0.2,1 49 | 4.6,3.2,1.4,0.2,1 50 | 5.3,3.7,1.5,0.2,1 51 | 5.0,3.3,1.4,0.2,1 52 | 7.0,3.2,4.7,1.4,2 53 | 6.4,3.2,4.5,1.5,2 54 | 6.9,3.1,4.9,1.5, 55 | 5.5,2.3,4.0,1.3,2 56 | 6.5,2.8,4.6,1.5,2 57 | 5.7,2.8,4.5,1.3,2 58 | 6.3,3.3,4.7,1.6,2 59 | 4.9,2.4,3.3,1.0,2 60 | 6.6,2.9,4.6,1.3,2 61 | 5.2,2.7,3.9,1.4,2 62 | 5.0,2.0,3.5,1.0,2 63 | 5.9,3.0,4.2,1.5,2 64 | 6.0,2.2,4.0,1.0,2 65 | 6.1,2.9,4.7,1.4,2 66 | 5.6,2.9,3.6,1.3,2 67 | 6.7,3.1,4.4,1.4,2 
68 | 5.6,3.0,4.5,1.5,2 69 | 5.8,2.7,4.1,1.0,2 70 | 6.2,2.2,4.5,1.5,2 71 | 5.6,2.5,3.9,1.1,2 72 | 5.9,3.2,4.8,1.8,2 73 | 6.1,2.8,4.0,1.3,2 74 | 6.3,2.5,4.9,1.5,2 75 | 6.1,2.8,4.7,1.2,2 76 | 6.4,2.9,4.3,1.3,2 77 | 6.6,3.0,4.4,1.4,2 78 | 6.8,2.8,4.8,1.4,2 79 | 6.7,3.0,5.0,1.7,2 80 | 6.0,2.9,4.5,1.5,2 81 | 5.7,2.6,3.5,1.0,2 82 | 5.5,2.4,3.8,1.1,2 83 | 5.5,2.4,3.7,1.0,2 84 | 5.8,2.7,3.9,1.2,2 85 | 6.0,2.7,5.1,1.6,2 86 | 5.4,3.0,4.5,1.5,2 87 | 6.0,3.4,4.5,1.6,2 88 | 6.7,3.1,4.7,1.5,2 89 | 6.3,2.3,4.4,1.3,2 90 | 5.6,3.0,4.1,1.3,2 91 | 5.5,2.5,4.0,1.3,2 92 | 5.5,2.6,4.4,1.2,2 93 | 6.1,3.0,4.6,1.4,2 94 | 5.8,2.6,4.0,1.2,2 95 | 5.0,2.3,3.3,1.0,2 96 | 5.6,2.7,4.2,1.3,2 97 | 5.7,3.0,4.2,1.2,2 98 | 5.7,2.9,4.2,1.3,2 99 | 6.2,2.9,4.3,1.3,2 100 | 5.1,2.5,3.0,1.1,2 101 | 5.7,2.8,4.1,1.3,2 102 | 6.3,3.3,6.0,2.5,121 103 | 5.8,2.7,5.1,1.9,121 104 | 7.1,3.0,5.9,2.1,121 105 | 6.3,2.9,5.6,1.8,121 106 | 6.5,3.0,5.8,2.2,121 107 | 7.6,3.0,6.6,2.1,121 108 | 4.9,2.5,4.5,1.7,121 109 | 7.3,2.9,6.3,1.8,121 110 | 6.7,2.5,5.8,1.8,121 111 | 7.2,3.6,6.1,2.5,121 112 | 6.5,3.2,5.1,2.0,121 113 | 6.4,2.7,5.3,1.9,121 114 | 6.8,3.0,5.5,2.1,121 115 | 5.7,2.5,5.0,2.0,121 116 | 5.8,2.8,5.1,2.4,121 117 | 6.4,3.2,5.3,2.3,121 118 | 6.5,3.0,5.5,1.8,121 119 | 7.7,3.8,6.7,2.2,121 120 | 7.7,2.6,6.9,2.3,121 121 | 6.0,2.2,5.0,1.5,121 122 | 6.9,3.2,5.7,2.3,121 123 | 5.6,2.8,4.9,2.0,121 124 | 7.7,2.8,6.7,2.0,121 125 | 6.3,2.7,4.9,1.8,121 126 | 6.7,3.3,5.7,2.1,121 127 | 7.2,3.2,6.0,1.8,121 128 | 6.2,2.8,4.8,1.8,121 129 | 6.1,3.0,4.9,1.8,121 130 | 6.4,2.8,5.6,2.1,121 131 | 7.2,3.0,5.8,1.6,121 132 | 7.4,2.8,6.1,1.9,121 133 | 7.9,3.8,6.4,2.0,121 134 | 6.4,2.8,5.6,2.2,121 135 | 6.3,2.8,5.1,1.5,121 136 | 6.1,2.6,5.6,1.4,121 137 | 7.7,3.0,6.1,2.3,121 138 | 6.3,3.4,5.6,2.4,121 139 | 6.4,3.1,5.5,1.8,121 140 | 6.0,3.0,4.8,1.8,121 141 | 6.9,3.1,5.4,2.1,121 142 | 6.7,3.1,5.6,2.4,121 143 | 6.9,3.1,5.1,2.3,121 144 | 5.8,2.7,5.1,1.9,121 145 | 6.8,3.2,5.9,2.3,121 146 | 6.7,3.3,5.7,2.5,121 147 | 6.7,3.0,5.2,2.3,121 148 | 6.3,2.5,5.0,1.9,121 149 | 6.5,3.0,5.2,2.0,121 150 | 6.2,3.4,5.4,2.3,121 151 | 5.9,3.0,5.1,1.8,121 152 | 153 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_knn.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import numpy as np 4 | from numpy.testing import assert_almost_equal 5 | from sklearn import datasets 6 | 7 | from supervised.algorithms.knn import KNeighborsAlgorithm, KNeighborsRegressorAlgorithm 8 | from supervised.utils.metric import Metric 9 | 10 | 11 | class KNeighborsRegressorAlgorithmTest(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.X, cls.y = datasets.make_regression( 15 | n_samples=100, 16 | n_features=5, 17 | n_informative=4, 18 | shuffle=False, 19 | random_state=0 20 | ) 21 | 22 | def test_reproduce_fit(self): 23 | metric = Metric({"name": "mse"}) 24 | params = {"seed": 1, "ml_task": "regression"} 25 | prev_loss = None 26 | for _ in range(2): 27 | model = KNeighborsRegressorAlgorithm(params) 28 | model.fit(self.X, self.y) 29 | y_predicted = model.predict(self.X) 30 | loss = metric(self.y, y_predicted) 31 | if prev_loss is not None: 32 | assert_almost_equal(prev_loss, loss) 33 | prev_loss = loss 34 | 35 | 36 | class KNeighborsAlgorithmTest(unittest.TestCase): 37 | @classmethod 38 | def setUpClass(cls): 39 | cls.X, cls.y = datasets.make_classification( 40 | n_samples=100, 41 | n_features=5, 42 | n_informative=4, 43 | n_redundant=1, 44 | n_classes=2, 45 | 
n_clusters_per_class=3, 46 | n_repeated=0, 47 | shuffle=False, 48 | random_state=0, 49 | ) 50 | 51 | def test_reproduce_fit(self): 52 | metric = Metric({"name": "logloss"}) 53 | params = {"seed": 1, "ml_task": "binary_classification"} 54 | prev_loss = None 55 | for _ in range(2): 56 | model = KNeighborsAlgorithm(params) 57 | model.fit(self.X, self.y) 58 | y_predicted = model.predict(self.X) 59 | loss = metric(self.y, y_predicted) 60 | if prev_loss is not None: 61 | assert_almost_equal(prev_loss, loss) 62 | prev_loss = loss 63 | 64 | def test_fit_predict(self): 65 | metric = Metric({"name": "logloss"}) 66 | params = {"ml_task": "binary_classification"} 67 | la = KNeighborsAlgorithm(params) 68 | 69 | la.fit(self.X, self.y) 70 | y_predicted = la.predict(self.X) 71 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 72 | 73 | def test_is_fitted(self): 74 | params = {"ml_task": "binary_classification"} 75 | model = KNeighborsAlgorithm(params) 76 | self.assertFalse(model.is_fitted()) 77 | model.fit(self.X, self.y) 78 | self.assertTrue(model.is_fitted()) 79 | 80 | def test_classes_attribute(self): 81 | params = {"ml_task": "binary_classification"} 82 | model = KNeighborsAlgorithm(params) 83 | model.fit(self.X,self.y) 84 | 85 | try: 86 | classes = model._classes 87 | except AttributeError: 88 | classes = None 89 | 90 | self.assertTrue(np.array_equal(np.unique(self.y), classes)) 91 | ``` -------------------------------------------------------------------------------- /supervised/utils/importance.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | import warnings 4 | 5 | import pandas as pd 6 | from sklearn.inspection import permutation_importance 7 | 8 | from supervised.algorithms.registry import ( 9 | BINARY_CLASSIFICATION, 10 | MULTICLASS_CLASSIFICATION, 11 | ) 12 | from supervised.utils.subsample import subsample 13 | 14 | logger = logging.getLogger(__name__) 15 | from supervised.utils.config import LOG_LEVEL 16 | 17 | logger.setLevel(LOG_LEVEL) 18 | 19 | from sklearn.metrics import log_loss, make_scorer 20 | 21 | 22 | def log_loss_eps(y_true, y_pred): 23 | ll = log_loss(y_true, y_pred) 24 | return ll 25 | 26 | 27 | log_loss_scorer = make_scorer(log_loss_eps, greater_is_better=False, response_method="predict_proba") 28 | 29 | 30 | class PermutationImportance: 31 | @staticmethod 32 | def compute_and_plot( 33 | model, 34 | X_validation, 35 | y_validation, 36 | model_file_path, 37 | learner_name, 38 | metric_name=None, 39 | ml_task=None, 40 | n_jobs=-1, 41 | ): 42 | # for scoring check https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 43 | if ml_task == BINARY_CLASSIFICATION: 44 | scoring = log_loss_scorer 45 | elif ml_task == MULTICLASS_CLASSIFICATION: 46 | scoring = log_loss_scorer 47 | else: 48 | scoring = "neg_mean_squared_error" 49 | 50 | try: 51 | with warnings.catch_warnings(): 52 | warnings.simplefilter("ignore") 53 | # subsample validation data to speed-up importance computation 54 | # in the case of large number of columns, it can take a lot of time 55 | rows, cols = X_validation.shape 56 | if cols > 5000 and rows > 100: 57 | X_vald, _, y_vald, _ = subsample( 58 | X_validation, y_validation, train_size=100, ml_task=ml_task 59 | ) 60 | elif cols > 50 and rows * cols > 200000 and rows > 1000: 61 | X_vald, _, y_vald, _ = subsample( 62 | X_validation, y_validation, train_size=1000, ml_task=ml_task 63 | ) 64 | else: 65 | X_vald = X_validation 66 | y_vald = y_validation 67 | 68 | 
importance = permutation_importance( 69 | model, 70 | X_vald, 71 | y_vald, 72 | scoring=scoring, 73 | n_jobs=n_jobs, 74 | random_state=12, 75 | n_repeats=5, # default 76 | ) 77 | 78 | sorted_idx = importance["importances_mean"].argsort() 79 | 80 | # save detailed importance 81 | df_imp = pd.DataFrame( 82 | { 83 | "feature": X_vald.columns[sorted_idx], 84 | "mean_importance": importance["importances_mean"][sorted_idx], 85 | } 86 | ) 87 | df_imp.to_csv( 88 | os.path.join(model_file_path, f"{learner_name}_importance.csv"), 89 | index=False, 90 | ) 91 | except Exception as e: 92 | print(str(e)) 93 | print("Problem during computing permutation importance. Skipping ...") 94 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_models_needed_for_predict.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import tempfile 4 | import unittest 5 | 6 | from supervised import AutoML 7 | from supervised.exceptions import AutoMLException 8 | 9 | 10 | class AutoMLModelsNeededForPredictTest(unittest.TestCase): 11 | # models_needed_on_predict 12 | 13 | def test_models_needed_on_predict(self): 14 | with tempfile.TemporaryDirectory() as tmpdir: 15 | params = { 16 | "saved": [ 17 | "model_1", 18 | "model_2", 19 | "model_3", 20 | "unused_model", 21 | "Ensemble", 22 | "model_4_Stacked", 23 | "Stacked_Ensemble", 24 | ], 25 | "stacked": ["Ensemble", "model_1", "model_2"], 26 | } 27 | with open(os.path.join(tmpdir, "params.json"), "w") as fout: 28 | fout.write(json.dumps(params)) 29 | os.mkdir(os.path.join(tmpdir, "Ensemble")) 30 | with open(os.path.join(tmpdir, "Ensemble", "ensemble.json"), "w") as fout: 31 | params = { 32 | "selected_models": [ 33 | {"model": "model_2"}, 34 | {"model": "model_3"}, 35 | ] 36 | } 37 | fout.write(json.dumps(params)) 38 | os.mkdir(os.path.join(tmpdir, "Stacked_Ensemble")) 39 | with open( 40 | os.path.join(tmpdir, "Stacked_Ensemble", "ensemble.json"), "w" 41 | ) as fout: 42 | params = { 43 | "selected_models": [ 44 | {"model": "Ensemble"}, 45 | {"model": "model_4_Stacked"}, 46 | ] 47 | } 48 | fout.write(json.dumps(params)) 49 | 50 | automl = AutoML(results_path=tmpdir) 51 | with self.assertRaises(AutoMLException) as context: 52 | l = automl.models_needed_on_predict("missing_model") 53 | l = automl.models_needed_on_predict("model_1") 54 | self.assertTrue("model_1" in l) 55 | self.assertTrue(len(l) == 1) 56 | l = automl.models_needed_on_predict("model_3") 57 | self.assertTrue("model_3" in l) 58 | self.assertTrue(len(l) == 1) 59 | l = automl.models_needed_on_predict("Ensemble") 60 | self.assertTrue("model_2" in l) 61 | self.assertTrue("model_3" in l) 62 | self.assertTrue("Ensemble" in l) 63 | self.assertTrue(len(l) == 3) 64 | l = automl.models_needed_on_predict("model_4_Stacked") 65 | self.assertTrue("model_1" in l) 66 | self.assertTrue("model_2" in l) 67 | self.assertTrue("model_3" in l) 68 | self.assertTrue("Ensemble" in l) 69 | self.assertTrue("model_4_Stacked" in l) 70 | self.assertTrue(len(l) == 5) 71 | l = automl.models_needed_on_predict("Stacked_Ensemble") 72 | self.assertTrue("model_1" in l) 73 | self.assertTrue("model_2" in l) 74 | self.assertTrue("model_3" in l) 75 | self.assertTrue("Ensemble" in l) 76 | self.assertTrue("model_4_Stacked" in l) 77 | self.assertTrue("Stacked_Ensemble" in l) 78 | self.assertTrue(len(l) == 6) 79 | ``` -------------------------------------------------------------------------------- 
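Editorial sketch (not part of the repository dump): the test above builds `params.json` and the ensemble descriptors by hand; on a real run the same dependency lookup is available from a fitted `AutoML` object. The results path, the 60-second budget, and the assumption that an `Ensemble` model gets built within that budget are illustrative only.

```python
from sklearn.datasets import make_classification

from supervised import AutoML

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

automl = AutoML(
    results_path="automl_demo",  # illustrative output directory
    mode="Compete",              # mode that enables ensembling and stacking
    total_time_limit=60,
)
automl.fit(X, y)

# List every model directory needed to run predict() with the "Ensemble"
# model: its selected base models plus the ensemble itself (assuming an
# Ensemble was produced within the time limit).
print(automl.models_needed_on_predict("Ensemble"))
```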
/tests/tests_automl/test_golden_features.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import shutil 4 | import unittest 5 | 6 | import pandas as pd 7 | from sklearn import datasets 8 | 9 | from supervised import AutoML 10 | 11 | 12 | class AutoMLGoldenFeaturesTest(unittest.TestCase): 13 | automl_dir = "automl_tests" 14 | rows = 50 15 | 16 | def tearDown(self): 17 | shutil.rmtree(self.automl_dir, ignore_errors=True) 18 | 19 | def test_no_golden_features(self): 20 | N_COLS = 10 21 | X, y = datasets.make_classification( 22 | n_samples=100, 23 | n_features=N_COLS, 24 | n_informative=6, 25 | n_redundant=1, 26 | n_classes=2, 27 | n_clusters_per_class=3, 28 | n_repeated=0, 29 | shuffle=False, 30 | random_state=0, 31 | ) 32 | 33 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 34 | 35 | automl = AutoML( 36 | results_path=self.automl_dir, 37 | total_time_limit=50, 38 | algorithms=["Xgboost"], 39 | train_ensemble=False, 40 | golden_features=False, 41 | explain_level=0, 42 | start_random_models=1, 43 | ) 44 | automl.fit(X, y) 45 | 46 | self.assertEqual(len(automl._models), 1) 47 | 48 | def test_golden_features(self): 49 | N_COLS = 10 50 | X, y = datasets.make_classification( 51 | n_samples=100, 52 | n_features=N_COLS, 53 | n_informative=6, 54 | n_redundant=1, 55 | n_classes=2, 56 | n_clusters_per_class=3, 57 | n_repeated=0, 58 | shuffle=False, 59 | random_state=0, 60 | ) 61 | 62 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 63 | 64 | automl = AutoML( 65 | results_path=self.automl_dir, 66 | total_time_limit=50, 67 | algorithms=["Xgboost"], 68 | train_ensemble=False, 69 | golden_features=True, 70 | explain_level=0, 71 | start_random_models=1, 72 | ) 73 | automl.fit(X, y) 74 | 75 | self.assertEqual(len(automl._models), 2) 76 | 77 | # there should be 10 golden features 78 | with open(os.path.join(self.automl_dir, "golden_features.json")) as fin: 79 | d = json.loads(fin.read()) 80 | self.assertEqual(len(d["new_features"]), 10) 81 | 82 | def test_golden_features_count(self): 83 | N_COLS = 10 84 | X, y = datasets.make_classification( 85 | n_samples=100, 86 | n_features=N_COLS, 87 | n_informative=6, 88 | n_redundant=1, 89 | n_classes=2, 90 | n_clusters_per_class=3, 91 | n_repeated=0, 92 | shuffle=False, 93 | random_state=0, 94 | ) 95 | 96 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 97 | 98 | automl = AutoML( 99 | results_path=self.automl_dir, 100 | total_time_limit=50, 101 | algorithms=["Xgboost"], 102 | train_ensemble=False, 103 | golden_features=50, 104 | explain_level=0, 105 | start_random_models=1, 106 | ) 107 | automl.fit(X, y) 108 | 109 | self.assertEqual(len(automl._models), 2) 110 | 111 | # there should be 50 golden features 112 | with open(os.path.join(self.automl_dir, "golden_features.json")) as fin: 113 | d = json.loads(fin.read()) 114 | self.assertEqual(len(d["new_features"]), 50) 115 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_sample_weight.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | 10 | iris = datasets.load_iris() 11 | housing = datasets.fetch_california_housing() 12 | # limit data size for faster tests 13 | housing.data = 
housing.data[:500] 14 | housing.target = housing.target[:500] 15 | breast_cancer = datasets.load_breast_cancer() 16 | 17 | 18 | class AutoMLSampleWeightTest(unittest.TestCase): 19 | automl_dir = "AutoMLSampleWeightTest" 20 | 21 | def tearDown(self): 22 | shutil.rmtree(self.automl_dir, ignore_errors=True) 23 | 24 | def test_iris_dataset_sample_weight(self): 25 | """Tests AutoML in the iris dataset (Multiclass classification) 26 | without and with sample weight""" 27 | model = AutoML( 28 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 29 | ) 30 | score_1 = model.fit(iris.data, iris.target).score(iris.data, iris.target) 31 | self.assertGreater(score_1, 0.5) 32 | 33 | shutil.rmtree(self.automl_dir, ignore_errors=True) 34 | model = AutoML( 35 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 36 | ) 37 | sample_weight = np.ones(iris.data.shape[0]) 38 | score_2 = model.fit(iris.data, iris.target, sample_weight=sample_weight).score( 39 | iris.data, iris.target, sample_weight=sample_weight 40 | ) 41 | assert_almost_equal(score_1, score_2) 42 | 43 | def test_housing_dataset(self): 44 | """Tests AutoML in the housing dataset (Regression) 45 | without and with sample weight""" 46 | model = AutoML( 47 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 48 | ) 49 | score_1 = model.fit(housing.data, housing.target).score( 50 | housing.data, housing.target 51 | ) 52 | self.assertGreater(score_1, 0.5) 53 | 54 | shutil.rmtree(self.automl_dir, ignore_errors=True) 55 | model = AutoML( 56 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 57 | ) 58 | sample_weight = np.ones(housing.data.shape[0]) 59 | score_2 = model.fit( 60 | housing.data, housing.target, sample_weight=sample_weight 61 | ).score(housing.data, housing.target, sample_weight=sample_weight) 62 | assert_almost_equal(score_1, score_2) 63 | 64 | def test_breast_cancer_dataset(self): 65 | """Tests AutoML in the breast cancer (binary classification) 66 | without and with sample weight""" 67 | model = AutoML( 68 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 69 | ) 70 | score_1 = model.fit(breast_cancer.data, breast_cancer.target).score( 71 | breast_cancer.data, breast_cancer.target 72 | ) 73 | self.assertGreater(score_1, 0.5) 74 | 75 | shutil.rmtree(self.automl_dir, ignore_errors=True) 76 | model = AutoML( 77 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 78 | ) 79 | sample_weight = np.ones(breast_cancer.data.shape[0]) 80 | score_2 = model.fit( 81 | breast_cancer.data, breast_cancer.target, sample_weight=sample_weight 82 | ).score(breast_cancer.data, breast_cancer.target, sample_weight=sample_weight) 83 | assert_almost_equal(score_1, score_2) 84 | ``` -------------------------------------------------------------------------------- /supervised/callbacks/total_time_constraint.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import time 3 | 4 | import numpy as np 5 | 6 | from supervised.callbacks.callback import Callback 7 | from supervised.exceptions import NotTrainedException 8 | from supervised.utils.config import LOG_LEVEL 9 | 10 | log = logging.getLogger(__name__) 11 | log.setLevel(LOG_LEVEL) 12 | 13 | 14 | class TotalTimeConstraint(Callback): 15 | def __init__(self, params={}): 16 | super(TotalTimeConstraint, self).__init__(params) 17 | self.name = params.get("name", "total_time_constraint") 18 | self.total_time_limit = 
params.get("total_time_limit") 19 | self.total_time_start = params.get("total_time_start") 20 | self.expected_learners_cnt = params.get("expected_learners_cnt", 1) 21 | 22 | def on_learner_train_start(self, logs): 23 | self.train_start_time = time.time() 24 | 25 | def on_learner_train_end(self, logs): 26 | if ( 27 | self.total_time_limit is not None 28 | and len(self.learners) == 1 29 | and self.expected_learners_cnt > 1 30 | # just check for the first learner 31 | # need to have more than 1 learner 32 | # otherwise it is a finish of the training 33 | ): 34 | one_fold_time = time.time() - self.train_start_time 35 | estimate_all_folds = one_fold_time * self.expected_learners_cnt 36 | 37 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 38 | 39 | # we need to add time for the rest of learners (assuming that all folds training time is the same) 40 | estimate_elapsed_time = total_elapsed_time + one_fold_time * ( 41 | self.expected_learners_cnt - 1 42 | ) 43 | 44 | if estimate_elapsed_time >= self.total_time_limit: 45 | raise NotTrainedException( 46 | "Stop training after the first fold. " 47 | f"Time needed to train on the first fold {np.round(one_fold_time)} seconds. " 48 | "The time estimate for training on all folds is larger than total_time_limit." 49 | ) 50 | if ( 51 | self.total_time_limit is not None 52 | and len(self.learners) < self.expected_learners_cnt 53 | # dont stop for last learner, we are finishing anyway 54 | ): 55 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 56 | 57 | if total_elapsed_time > self.total_time_limit + 600: 58 | # add 10 minutes of margin 59 | # margin is added because of unexpected time changes 60 | # if training on each fold will be the same 61 | # then the training will be stopped after first fold (above condition) 62 | raise NotTrainedException( 63 | "Force to stop the training. " 64 | "Total time for AutoML training already exceeded." 65 | ) 66 | 67 | def on_iteration_end(self, logs, predictions): 68 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 69 | 70 | if self.total_time_limit is not None: 71 | log.debug( 72 | f"Total elapsed time {total_elapsed_time} seconds. " 73 | + f"Time left {np.round(self.total_time_limit - total_elapsed_time, 2)} seconds." 
74 | ) 75 | # not time left, stop now 76 | if total_elapsed_time >= self.total_time_limit: 77 | self.learner.stop_training = True 78 | else: 79 | log.debug(f"Total elapsed time {total_elapsed_time} seconds") 80 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_repeated_validation.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | from supervised.algorithms.random_forest import additional 10 | from supervised.utils.common import construct_learner_name 11 | 12 | additional["max_steps"] = 1 13 | additional["trees_in_step"] = 1 14 | 15 | from supervised.algorithms.xgboost import additional 16 | 17 | additional["max_rounds"] = 1 18 | 19 | 20 | class AutoMLRepeatedValidationTest(unittest.TestCase): 21 | automl_dir = "AutoMLRepeatedValidationTest" 22 | 23 | def tearDown(self): 24 | shutil.rmtree(self.automl_dir, ignore_errors=True) 25 | 26 | def test_repeated_kfold(self): 27 | REPEATS = 3 28 | FOLDS = 2 29 | 30 | a = AutoML( 31 | results_path=self.automl_dir, 32 | total_time_limit=10, 33 | algorithms=["Random Forest"], 34 | train_ensemble=False, 35 | validation_strategy={ 36 | "validation_type": "kfold", 37 | "k_folds": FOLDS, 38 | "repeats": REPEATS, 39 | "shuffle": True, 40 | "stratify": True, 41 | }, 42 | start_random_models=1, 43 | ) 44 | 45 | X, y = datasets.make_classification( 46 | n_samples=100, 47 | n_features=5, 48 | n_informative=4, 49 | n_redundant=1, 50 | n_classes=2, 51 | n_clusters_per_class=3, 52 | n_repeated=0, 53 | shuffle=False, 54 | random_state=0, 55 | ) 56 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 57 | 58 | a.fit(X, y) 59 | 60 | result_files = os.listdir( 61 | os.path.join(self.automl_dir, "1_Default_RandomForest") 62 | ) 63 | 64 | cnt = 0 65 | for repeat in range(REPEATS): 66 | for fold in range(FOLDS): 67 | learner_name = construct_learner_name(fold, repeat, REPEATS) 68 | self.assertTrue(f"{learner_name}.random_forest" in result_files) 69 | self.assertTrue(f"{learner_name}_training.log" in result_files) 70 | cnt += 1 71 | self.assertTrue(cnt, 6) 72 | 73 | def test_repeated_split(self): 74 | REPEATS = 3 75 | FOLDS = 1 76 | 77 | a = AutoML( 78 | results_path=self.automl_dir, 79 | total_time_limit=10, 80 | algorithms=["Random Forest"], 81 | train_ensemble=False, 82 | validation_strategy={ 83 | "validation_type": "split", 84 | "repeats": REPEATS, 85 | "shuffle": True, 86 | "stratify": True, 87 | }, 88 | start_random_models=1, 89 | ) 90 | 91 | X, y = datasets.make_classification( 92 | n_samples=100, 93 | n_features=5, 94 | n_informative=4, 95 | n_redundant=1, 96 | n_classes=2, 97 | n_clusters_per_class=3, 98 | n_repeated=0, 99 | shuffle=False, 100 | random_state=0, 101 | ) 102 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 103 | 104 | a.fit(X, y) 105 | 106 | result_files = os.listdir( 107 | os.path.join(self.automl_dir, "1_Default_RandomForest") 108 | ) 109 | cnt = 0 110 | for repeat in range(REPEATS): 111 | for fold in range(FOLDS): 112 | learner_name = construct_learner_name(fold, repeat, REPEATS) 113 | self.assertTrue(f"{learner_name}.random_forest" in result_files) 114 | self.assertTrue(f"{learner_name}_training.log" in result_files) 115 | cnt += 1 116 | self.assertTrue(cnt, 3) 117 | ``` -------------------------------------------------------------------------------- 
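Editorial sketch (not part of the repository dump): the repeated-validation setup exercised by the test above maps directly onto the public `AutoML` API. A minimal example with an illustrative dataset and time budget; the `validation_strategy` keys are the same ones used in the test:

```python
import pandas as pd
from sklearn import datasets

from supervised import AutoML

X, y = datasets.make_classification(n_samples=200, n_features=5, random_state=0)
X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

automl = AutoML(
    results_path="automl_repeated_cv",  # illustrative output directory
    total_time_limit=60,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 5,
        "repeats": 3,   # every fold is re-trained 3 times on reshuffled splits
        "shuffle": True,
        "stratify": True,
    },
)
automl.fit(X, y)
```

Each model directory then contains one learner file and one training log per (fold, repeat) pair, which is exactly what the assertions above iterate over.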
/supervised/preprocessing/datetime_transformer.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class DateTimeTransformer(object): 6 | def __init__(self): 7 | self._new_columns = [] 8 | self._old_column = None 9 | self._min_datetime = None 10 | self._transforms = [] 11 | 12 | def fit(self, X, column): 13 | self._old_column = column 14 | self._min_datetime = np.min(X[column]) 15 | 16 | values = X[column].dt.year 17 | if len(np.unique(values)) > 1: 18 | self._transforms += ["year"] 19 | new_column = column + "_Year" 20 | self._new_columns += [new_column] 21 | 22 | values = X[column].dt.month 23 | if len(np.unique(values)) > 1: 24 | self._transforms += ["month"] 25 | new_column = column + "_Month" 26 | self._new_columns += [new_column] 27 | 28 | values = X[column].dt.day 29 | if len(np.unique(values)) > 1: 30 | self._transforms += ["day"] 31 | new_column = column + "_Day" 32 | self._new_columns += [new_column] 33 | 34 | values = X[column].dt.weekday 35 | if len(np.unique(values)) > 1: 36 | self._transforms += ["weekday"] 37 | new_column = column + "_WeekDay" 38 | self._new_columns += [new_column] 39 | 40 | values = X[column].dt.dayofyear 41 | if len(np.unique(values)) > 1: 42 | self._transforms += ["dayofyear"] 43 | new_column = column + "_DayOfYear" 44 | self._new_columns += [new_column] 45 | 46 | values = X[column].dt.hour 47 | if len(np.unique(values)) > 1: 48 | self._transforms += ["hour"] 49 | new_column = column + "_Hour" 50 | self._new_columns += [new_column] 51 | 52 | values = (X[column] - self._min_datetime).dt.days 53 | if len(np.unique(values)) > 1: 54 | self._transforms += ["days_diff"] 55 | new_column = column + "_Days_Diff_To_Min" 56 | self._new_columns += [new_column] 57 | 58 | def transform(self, X): 59 | column = self._old_column 60 | 61 | if "year" in self._transforms: 62 | new_column = column + "_Year" 63 | X[new_column] = X[column].dt.year 64 | 65 | if "month" in self._transforms: 66 | new_column = column + "_Month" 67 | X[new_column] = X[column].dt.month 68 | 69 | if "day" in self._transforms: 70 | new_column = column + "_Day" 71 | X[new_column] = X[column].dt.day 72 | 73 | if "weekday" in self._transforms: 74 | new_column = column + "_WeekDay" 75 | X[new_column] = X[column].dt.weekday 76 | 77 | if "dayofyear" in self._transforms: 78 | new_column = column + "_DayOfYear" 79 | X[new_column] = X[column].dt.dayofyear 80 | 81 | if "hour" in self._transforms: 82 | new_column = column + "_Hour" 83 | X[new_column] = X[column].dt.hour 84 | 85 | if "days_diff" in self._transforms: 86 | new_column = column + "_Days_Diff_To_Min" 87 | X[new_column] = (X[column] - self._min_datetime).dt.days 88 | 89 | X.drop(column, axis=1, inplace=True) 90 | return X 91 | 92 | def to_json(self): 93 | data_json = { 94 | "new_columns": list(self._new_columns), 95 | "old_column": self._old_column, 96 | "min_datetime": str(self._min_datetime), 97 | "transforms": list(self._transforms), 98 | } 99 | return data_json 100 | 101 | def from_json(self, data_json): 102 | self._new_columns = data_json.get("new_columns", None) 103 | self._old_column = data_json.get("old_column", None) 104 | d = data_json.get("min_datetime", None) 105 | self._min_datetime = None if d is None else pd.to_datetime(d) 106 | self._transforms = data_json.get("transforms", []) 107 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_linear.py: 
-------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.linear import LinearAlgorithm, LinearRegressorAlgorithm 9 | from supervised.utils.metric import Metric 10 | 11 | 12 | class LinearRegressorAlgorithmTest(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | cls.X, cls.y = datasets.make_regression( 16 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 17 | ) 18 | 19 | def test_reproduce_fit(self): 20 | metric = Metric({"name": "mse"}) 21 | params = {"seed": 1, "ml_task": "regression"} 22 | prev_loss = None 23 | for _ in range(3): 24 | model = LinearRegressorAlgorithm(params) 25 | model.fit(self.X, self.y) 26 | y_predicted = model.predict(self.X) 27 | loss = metric(self.y, y_predicted) 28 | if prev_loss is not None: 29 | assert_almost_equal(prev_loss, loss) 30 | prev_loss = loss 31 | 32 | 33 | class LinearAlgorithmTest(unittest.TestCase): 34 | @classmethod 35 | def setUpClass(cls): 36 | cls.X, cls.y = datasets.make_classification( 37 | n_samples=100, 38 | n_features=5, 39 | n_informative=4, 40 | n_redundant=1, 41 | n_classes=2, 42 | n_clusters_per_class=3, 43 | n_repeated=0, 44 | shuffle=False, 45 | random_state=0, 46 | ) 47 | 48 | def test_reproduce_fit(self): 49 | metric = Metric({"name": "logloss"}) 50 | params = {"seed": 1, "ml_task": "binary_classification"} 51 | prev_loss = None 52 | for _ in range(3): 53 | model = LinearAlgorithm(params) 54 | model.fit(self.X, self.y) 55 | y_predicted = model.predict(self.X) 56 | loss = metric(self.y, y_predicted) 57 | if prev_loss is not None: 58 | assert_almost_equal(prev_loss, loss) 59 | prev_loss = loss 60 | 61 | def test_fit_predict(self): 62 | metric = Metric({"name": "logloss"}) 63 | params = {"ml_task": "binary_classification"} 64 | la = LinearAlgorithm(params) 65 | 66 | la.fit(self.X, self.y) 67 | y_predicted = la.predict(self.X) 68 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 69 | 70 | def test_copy(self): 71 | metric = Metric({"name": "logloss"}) 72 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 73 | model.fit(self.X, self.y) 74 | y_predicted = model.predict(self.X) 75 | loss = metric(self.y, y_predicted) 76 | 77 | model2 = LinearAlgorithm({}) 78 | model2 = model.copy() 79 | self.assertEqual(type(model), type(model2)) 80 | y_predicted = model2.predict(self.X) 81 | loss2 = metric(self.y, y_predicted) 82 | assert_almost_equal(loss, loss2) 83 | 84 | def test_save_and_load(self): 85 | metric = Metric({"name": "logloss"}) 86 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 87 | model.fit(self.X, self.y) 88 | y_predicted = model.predict(self.X) 89 | loss = metric(self.y, y_predicted) 90 | 91 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 92 | 93 | model.save(filename) 94 | model2 = LinearAlgorithm({"ml_task": "binary_classification"}) 95 | model2.load(filename) 96 | # Finished with the file, delete it 97 | os.remove(filename) 98 | 99 | y_predicted = model2.predict(self.X) 100 | loss2 = metric(self.y, y_predicted) 101 | assert_almost_equal(loss, loss2) 102 | 103 | def test_is_fitted(self): 104 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 105 | self.assertFalse(model.is_fitted()) 106 | model.fit(self.X, self.y) 107 | self.assertTrue(model.is_fitted()) 108 | ``` 
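Editorial sketch (not part of the repository dump): a quick usage illustration of the `DateTimeTransformer` listed a little earlier on this page. The column name and timestamps are invented for the example; the transformer keeps only the datetime components that actually vary in the fitted data:

```python
import pandas as pd

from supervised.preprocessing.datetime_transformer import DateTimeTransformer

df = pd.DataFrame(
    {"when": pd.to_datetime(["2021-03-01 10:00", "2021-06-15 18:30", "2022-01-05 07:45"])}
)

dt = DateTimeTransformer()
dt.fit(df, "when")      # records which components (year, month, hour, ...) vary
df = dt.transform(df)   # adds e.g. when_Year, when_Month, when_Hour and drops "when"

state = dt.to_json()    # plain dict with the fitted state, as persisted by the framework
restored = DateTimeTransformer()
restored.from_json(state)  # restored.transform() now reproduces the same columns
```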
-------------------------------------------------------------------------------- /supervised/algorithms/knn.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | import sklearn 4 | from sklearn.base import ClassifierMixin, RegressorMixin 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 7 | 8 | from supervised.algorithms.registry import ( 9 | BINARY_CLASSIFICATION, 10 | MULTICLASS_CLASSIFICATION, 11 | REGRESSION, 12 | AlgorithmsRegistry, 13 | ) 14 | from supervised.algorithms.sklearn import SklearnAlgorithm 15 | from supervised.utils.config import LOG_LEVEL 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(LOG_LEVEL) 19 | 20 | 21 | KNN_ROWS_LIMIT = 1000 22 | 23 | 24 | class KNNFit(SklearnAlgorithm): 25 | def file_extension(self): 26 | return "k_neighbors" 27 | 28 | def is_fitted(self): 29 | return ( 30 | hasattr(self.model, "n_samples_fit_") 31 | and self.model.n_samples_fit_ is not None 32 | and self.model.n_samples_fit_ > 0 33 | ) 34 | 35 | def fit( 36 | self, 37 | X, 38 | y, 39 | sample_weight=None, 40 | X_validation=None, 41 | y_validation=None, 42 | sample_weight_validation=None, 43 | log_to_file=None, 44 | max_time=None, 45 | ): 46 | rows_limit = self.params.get("rows_limit", KNN_ROWS_LIMIT) 47 | if X.shape[0] > rows_limit: 48 | X1, _, y1, _ = train_test_split( 49 | X, y, train_size=rows_limit, stratify=y, random_state=1234 50 | ) 51 | self.model.fit(X1, y1) 52 | else: 53 | self.model.fit(X, y) 54 | 55 | @property 56 | def _classes(self): 57 | # Returns the unique classes based on the fitted model 58 | if hasattr(self.model, "classes_"): 59 | return self.model.classes_ 60 | else: 61 | return None 62 | 63 | 64 | class KNeighborsAlgorithm(ClassifierMixin, KNNFit): 65 | algorithm_name = "k-Nearest Neighbors" 66 | algorithm_short_name = "Nearest Neighbors" 67 | 68 | def __init__(self, params): 69 | super(KNeighborsAlgorithm, self).__init__(params) 70 | logger.debug("KNeighborsAlgorithm.__init__") 71 | self.library_version = sklearn.__version__ 72 | self.max_iters = 1 73 | self.model = KNeighborsClassifier( 74 | n_neighbors=params.get("n_neighbors", 3), 75 | weights=params.get("weights", "uniform"), 76 | algorithm="kd_tree", 77 | n_jobs=params.get("n_jobs", -1), 78 | ) 79 | 80 | 81 | class KNeighborsRegressorAlgorithm(RegressorMixin, KNNFit): 82 | algorithm_name = "k-Nearest Neighbors" 83 | algorithm_short_name = "Nearest Neighbors" 84 | 85 | def __init__(self, params): 86 | super(KNeighborsRegressorAlgorithm, self).__init__(params) 87 | logger.debug("KNeighborsRegressorAlgorithm.__init__") 88 | self.library_version = sklearn.__version__ 89 | self.max_iters = 1 90 | self.model = KNeighborsRegressor( 91 | n_neighbors=params.get("n_neighbors", 3), 92 | weights=params.get("weights", "uniform"), 93 | algorithm="ball_tree", 94 | n_jobs=params.get("n_jobs", -1), 95 | ) 96 | 97 | 98 | knn_params = {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]} 99 | 100 | default_params = {"n_neighbors": 5, "weights": "uniform"} 101 | 102 | additional = {"max_rows_limit": 100000, "max_cols_limit": 100} 103 | 104 | required_preprocessing = [ 105 | "missing_values_inputation", 106 | "convert_categorical", 107 | "datetime_transform", 108 | "text_transform", 109 | "scale", 110 | "target_as_integer", 111 | ] 112 | 113 | AlgorithmsRegistry.add( 114 | BINARY_CLASSIFICATION, 115 | KNeighborsAlgorithm, 116 | knn_params, 117 | 
required_preprocessing, 118 | additional, 119 | default_params, 120 | ) 121 | AlgorithmsRegistry.add( 122 | MULTICLASS_CLASSIFICATION, 123 | KNeighborsAlgorithm, 124 | knn_params, 125 | required_preprocessing, 126 | additional, 127 | default_params, 128 | ) 129 | 130 | AlgorithmsRegistry.add( 131 | REGRESSION, 132 | KNeighborsRegressorAlgorithm, 133 | knn_params, 134 | required_preprocessing, 135 | additional, 136 | default_params, 137 | ) 138 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_time_constraints.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import time 3 | import unittest 4 | 5 | from supervised import AutoML 6 | from supervised.tuner.time_controller import TimeController 7 | 8 | 9 | class AutoMLTimeConstraintsTest(unittest.TestCase): 10 | automl_dir = "automl_tests" 11 | 12 | def tearDown(self): 13 | shutil.rmtree(self.automl_dir, ignore_errors=True) 14 | 15 | def test_set_total_time_limit(self): 16 | model_type = "Xgboost" 17 | automl = AutoML( 18 | results_path=self.automl_dir, total_time_limit=100, algorithms=[model_type] 19 | ) 20 | 21 | automl._time_ctrl = TimeController( 22 | time.time(), 100, None, ["simple_algorithms", "not_so_random"], "Xgboost" 23 | ) 24 | 25 | time_spend = 0 26 | for i in range(12): 27 | automl._start_time -= 10 28 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 29 | if automl._time_ctrl.enough_time(model_type, "not_so_random"): 30 | time_spend += 10 31 | 32 | self.assertTrue(time_spend < 100) 33 | 34 | def test_set_model_time_limit(self): 35 | model_type = "Xgboost" 36 | automl = AutoML( 37 | results_path=self.automl_dir, model_time_limit=10, algorithms=[model_type] 38 | ) 39 | automl._time_ctrl = TimeController( 40 | time.time(), None, 10, ["simple_algorithms", "not_so_random"], "Xgboost" 41 | ) 42 | 43 | for i in range(12): 44 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 45 | # should be always true 46 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 47 | 48 | def test_set_model_time_limit_omit_total_time(self): 49 | model_type = "Xgboost" 50 | automl = AutoML( 51 | results_path=self.automl_dir, 52 | total_time_limit=10, 53 | model_time_limit=10, 54 | algorithms=[model_type], 55 | ) 56 | automl._time_ctrl = TimeController( 57 | time.time(), 10, 10, ["simple_algorithms", "not_so_random"], "Xgboost" 58 | ) 59 | 60 | for i in range(12): 61 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 62 | # should be always true 63 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 64 | 65 | def test_enough_time_to_train(self): 66 | model_type = "Xgboost" 67 | model_type_2 = "LightGBM" 68 | 69 | model_type = "Xgboost" 70 | automl = AutoML( 71 | results_path=self.automl_dir, 72 | total_time_limit=10, 73 | model_time_limit=10, 74 | algorithms=[model_type, model_type_2], 75 | ) 76 | automl._time_ctrl = TimeController( 77 | time.time(), 78 | 10, 79 | 10, 80 | ["simple_algorithms", "not_so_random"], 81 | [model_type, model_type_2], 82 | ) 83 | 84 | for i in range(5): 85 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 1) 86 | # should be always true 87 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 88 | 89 | for i in range(5): 90 | automl._time_ctrl.log_time( 91 | f"LightGBM_{i}", model_type_2, "not_so_random", 1 92 | ) 93 | # 
should be always true 94 | self.assertTrue( 95 | automl._time_ctrl.enough_time(model_type_2, "not_so_random") 96 | ) 97 | 98 | def test_expected_learners_cnt(self): 99 | automl = AutoML(results_path=self.automl_dir) 100 | automl._validation_strategy = {"k_folds": 7, "repeats": 6} 101 | self.assertEqual(automl._expected_learners_cnt(), 42) 102 | 103 | automl._validation_strategy = {"k_folds": 7} 104 | self.assertEqual(automl._expected_learners_cnt(), 7) 105 | automl._validation_strategy = {} 106 | self.assertEqual(automl._expected_learners_cnt(), 1) 107 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing_missing.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 5 | 6 | 7 | class PreprocessingMissingValues(object): 8 | FILL_NA_MIN = "na_fill_min_1" 9 | FILL_NA_MEAN = "na_fill_mean" 10 | FILL_NA_MEDIAN = "na_fill_median" 11 | FILL_DATETIME = "na_fill_datetime" 12 | 13 | NA_EXCLUDE = "na_exclude" 14 | MISSING_VALUE = "_missing_value_" 15 | REMOVE_COLUMN = "remove_column" 16 | 17 | def __init__(self, columns=[], na_fill_method=FILL_NA_MEDIAN): 18 | self._columns = columns 19 | # fill method 20 | self._na_fill_method = na_fill_method 21 | # fill parameters stored as a dict, feature -> fill value 22 | self._na_fill_params = {} 23 | self._datetime_columns = [] 24 | 25 | def fit(self, X): 26 | X = self._fit_na_fill(X) 27 | 28 | def _fit_na_fill(self, X): 29 | for column in self._columns: 30 | if np.sum(pd.isnull(X[column]) == True) == 0: 31 | continue 32 | self._na_fill_params[column] = self._get_fill_value(X[column]) 33 | if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME: 34 | self._datetime_columns += [column] 35 | 36 | def _get_fill_value(self, x): 37 | # categorical type 38 | if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL: 39 | if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN: 40 | return ( 41 | PreprocessingMissingValues.MISSING_VALUE 42 | ) # add new categorical value 43 | return PreprocessingUtils.get_most_frequent(x) 44 | # datetime 45 | if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME: 46 | return PreprocessingUtils.get_most_frequent(x) 47 | # text 48 | if PreprocessingUtils.get_type(x) == PreprocessingUtils.TEXT: 49 | return PreprocessingMissingValues.MISSING_VALUE 50 | 51 | # numerical type 52 | if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN: 53 | return PreprocessingUtils.get_min(x) - 1.0 54 | if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN: 55 | return PreprocessingUtils.get_mean(x) 56 | return PreprocessingUtils.get_median(x) 57 | 58 | def transform(self, X): 59 | X = self._transform_na_fill(X) 60 | # this is additional run through columns, 61 | # in case of transforming data with new columns with missing values 62 | # X = self._make_sure_na_filled(X) # disbaled for now 63 | return X 64 | 65 | def _transform_na_fill(self, X): 66 | for column, value in self._na_fill_params.items(): 67 | ind = pd.isnull(X.loc[:, column]) 68 | X.loc[ind, column] = value 69 | return X 70 | 71 | def _make_sure_na_filled(self, X): 72 | self._fit_na_fill(X) 73 | return self._transform_na_fill(X) 74 | 75 | def to_json(self): 76 | # prepare json with all parameters 77 | if len(self._na_fill_params) == 0: 78 | return {} 79 | params = { 80 | 
"fill_method": self._na_fill_method, 81 | "fill_params": self._na_fill_params, 82 | "datetime_columns": list(self._datetime_columns), 83 | } 84 | for col in self._datetime_columns: 85 | params["fill_params"][col] = str(params["fill_params"][col]) 86 | return params 87 | 88 | def from_json(self, params): 89 | if params is not None: 90 | self._na_fill_method = params.get("fill_method", None) 91 | self._na_fill_params = params.get("fill_params", {}) 92 | self._datetime_columns = params.get("datetime_columns", []) 93 | for col in self._datetime_columns: 94 | self._na_fill_params[col] = pd.to_datetime(self._na_fill_params[col]) 95 | else: 96 | self._na_fill_method, self._na_fill_params = None, None 97 | self._datetime_columns = [] 98 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/scale.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | from sklearn import preprocessing 3 | 4 | 5 | class Scale(object): 6 | SCALE_NORMAL = "scale_normal" 7 | SCALE_LOG_AND_NORMAL = "scale_log_and_normal" 8 | 9 | def __init__(self, columns=[], scale_method=SCALE_NORMAL): 10 | self.scale_method = scale_method 11 | self.columns = columns 12 | self.scale = preprocessing.StandardScaler( 13 | copy=True, with_mean=True, with_std=True 14 | ) 15 | self.X_min_values = None # it is used in SCALE_LOG_AND_NORMAL 16 | 17 | def fit(self, X): 18 | if len(self.columns): 19 | for c in self.columns: 20 | X[c] = X[c].astype(float) 21 | 22 | if self.scale_method == self.SCALE_NORMAL: 23 | self.scale.fit(X[self.columns]) 24 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 25 | self.X_min_values = np.min(X[self.columns], axis=0) 26 | self.scale.fit(np.log(X[self.columns] - self.X_min_values + 1)) 27 | 28 | def transform(self, X): 29 | if len(self.columns): 30 | for c in self.columns: 31 | X[c] = X[c].astype(float) 32 | if self.scale_method == self.SCALE_NORMAL: 33 | X.loc[:, self.columns] = self.scale.transform(X[self.columns]) 34 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 35 | X[self.columns] = np.log( 36 | np.clip( 37 | X[self.columns] - self.X_min_values + 1, a_min=1, a_max=None 38 | ) 39 | ) 40 | X.loc[:, self.columns] = self.scale.transform(X[self.columns]) 41 | return X 42 | 43 | def inverse_transform(self, X): 44 | if len(self.columns): 45 | if self.scale_method == self.SCALE_NORMAL: 46 | X.loc[:, self.columns] = self.scale.inverse_transform(X[self.columns]) 47 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 48 | X[self.columns] = X[self.columns].astype("float64") 49 | 50 | X[self.columns] = self.scale.inverse_transform(X[self.columns]) 51 | X[self.columns] = np.exp(X[self.columns]) 52 | 53 | X.loc[:, self.columns] += self.X_min_values - 1 54 | return X 55 | 56 | def to_json(self): 57 | if len(self.columns) == 0: 58 | return None 59 | data_json = { 60 | "scale": list(self.scale.scale_), 61 | "mean": list(self.scale.mean_), 62 | "var": list(self.scale.var_), 63 | "n_samples_seen": int(self.scale.n_samples_seen_), 64 | "n_features_in": int(self.scale.n_features_in_), 65 | "columns": self.columns, 66 | "scale_method": self.scale_method, 67 | } 68 | if self.X_min_values is not None: 69 | data_json["X_min_values"] = list(self.X_min_values) 70 | return data_json 71 | 72 | def from_json(self, data_json): 73 | self.scale = preprocessing.StandardScaler( 74 | copy=True, with_mean=True, with_std=True 75 | ) 76 | self.scale.scale_ = data_json.get("scale") 77 | if self.scale.scale_ is 
not None: 78 | self.scale.scale_ = np.array(self.scale.scale_) 79 | self.scale.mean_ = data_json.get("mean") 80 | if self.scale.mean_ is not None: 81 | self.scale.mean_ = np.array(self.scale.mean_) 82 | self.scale.var_ = data_json.get("var") 83 | if self.scale.var_ is not None: 84 | self.scale.var_ = np.array(self.scale.var_) 85 | self.scale.n_samples_seen_ = int(data_json.get("n_samples_seen")) 86 | self.scale.n_features_in_ = int(data_json.get("n_features_in")) 87 | self.columns = data_json.get("columns", []) 88 | self.scale.feature_names_in_ = data_json.get("columns") 89 | self.scale_method = data_json.get("scale_method") 90 | self.X_min_values = data_json.get("X_min_values") 91 | if self.X_min_values is not None: 92 | self.X_min_values = np.array(self.X_min_values) 93 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/kmeans_transformer.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import time 3 | 4 | import joblib 5 | import numpy as np 6 | from sklearn.cluster import MiniBatchKMeans 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | from supervised.exceptions import AutoMLException 10 | 11 | 12 | class KMeansTransformer(object): 13 | def __init__(self, results_path=None, model_name=None, k_fold=None): 14 | self._new_features = [] 15 | self._input_columns = [] 16 | self._error = None 17 | self._kmeans = None 18 | self._scale = None 19 | self._model_name = model_name 20 | self._k_fold = k_fold 21 | 22 | if results_path is not None: 23 | self._result_file = os.path.join( 24 | self._model_name, f"kmeans_fold_{k_fold}.joblib" 25 | ) 26 | self._result_path = os.path.join(results_path, self._result_file) 27 | # self.try_load() 28 | 29 | def fit(self, X, y): 30 | if self._new_features: 31 | return 32 | if self._error is not None and self._error: 33 | raise AutoMLException( 34 | "KMeans Features not created due to error (please check errors.md). " 35 | + self._error 36 | ) 37 | return 38 | if X.shape[1] == 0: 39 | self._error = f"KMeans not created. No continous features. Input data shape: {X.shape}, {y.shape}" 40 | raise AutoMLException("KMeans Features not created. No continous features.") 41 | 42 | start_time = time.time() 43 | 44 | n_clusters = int(np.log10(X.shape[0]) * 8) 45 | n_clusters = max(8, n_clusters) 46 | n_clusters = min(n_clusters, X.shape[1]) 47 | 48 | self._input_columns = X.columns.tolist() 49 | # scale data 50 | self._scale = StandardScaler(copy=True, with_mean=True, with_std=True) 51 | X = self._scale.fit_transform(X) 52 | 53 | # Kmeans 54 | self._kmeans = kmeans = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++") 55 | self._kmeans.fit(X) 56 | self._create_new_features_names() 57 | 58 | # print( 59 | # f"Created {len(self._new_features)} KMeans Features in {np.round(time.time() - start_time,2)} seconds." 
60 | # ) 61 | 62 | def _create_new_features_names(self): 63 | n_clusters = self._kmeans.cluster_centers_.shape[0] 64 | self._new_features = [f"Dist_Cluster_{i}" for i in range(n_clusters)] 65 | self._new_features += ["Cluster"] 66 | 67 | def transform(self, X): 68 | if self._kmeans is None: 69 | raise AutoMLException("KMeans not fitted") 70 | 71 | # scale 72 | X_scaled = self._scale.transform(X[self._input_columns]) 73 | 74 | # kmeans 75 | distances = self._kmeans.transform(X_scaled) 76 | clusters = self._kmeans.predict(X_scaled) 77 | 78 | X[self._new_features[:-1]] = distances 79 | X[self._new_features[-1]] = clusters 80 | 81 | return X 82 | 83 | def to_json(self): 84 | self.save() 85 | data_json = { 86 | "new_features": self._new_features, 87 | "result_file": self._result_file, 88 | "input_columns": self._input_columns, 89 | } 90 | if self._error is not None and self._error: 91 | data_json["error"] = self._error 92 | return data_json 93 | 94 | def from_json(self, data_json, results_path): 95 | self._new_features = data_json.get("new_features", []) 96 | self._input_columns = data_json.get("input_columns", []) 97 | self._result_file = data_json.get("result_file") 98 | self._result_path = os.path.join(results_path, self._result_file) 99 | self._error = data_json.get("error") 100 | self.try_load() 101 | 102 | def save(self): 103 | joblib.dump( 104 | {"kmeans": self._kmeans, "scale": self._scale}, 105 | self._result_path, 106 | compress=True, 107 | ) 108 | 109 | def try_load(self): 110 | if os.path.exists(self._result_path): 111 | data = joblib.load(self._result_path) 112 | self._kmeans = data["kmeans"] 113 | self._scale = data["scale"] 114 | 115 | self._create_new_features_names() 116 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_handle_imbalance.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | from supervised.algorithms.random_forest import additional 9 | from supervised.algorithms.registry import MULTICLASS_CLASSIFICATION 10 | 11 | additional["max_steps"] = 1 12 | additional["trees_in_step"] = 1 13 | 14 | from supervised.algorithms.xgboost import additional 15 | 16 | additional["max_rounds"] = 1 17 | 18 | 19 | class AutoMLHandleImbalanceTest(unittest.TestCase): 20 | automl_dir = "AutoMLHandleImbalanceTest" 21 | 22 | def tearDown(self): 23 | shutil.rmtree(self.automl_dir, ignore_errors=True) 24 | 25 | def test_handle_drastic_imbalance(self): 26 | a = AutoML( 27 | results_path=self.automl_dir, 28 | total_time_limit=10, 29 | algorithms=["Random Forest"], 30 | train_ensemble=False, 31 | validation_strategy={ 32 | "validation_type": "kfold", 33 | "k_folds": 10, 34 | "shuffle": True, 35 | "stratify": True, 36 | }, 37 | start_random_models=1, 38 | ) 39 | 40 | rows = 100 41 | X = pd.DataFrame( 42 | { 43 | "f1": np.random.rand(rows), 44 | "f2": np.random.rand(rows), 45 | "f3": np.random.rand(rows), 46 | } 47 | ) 48 | y = np.ones(rows) 49 | 50 | y[:8] = 0 51 | y[10:12] = 2 52 | y = pd.Series(np.array(y), name="target") 53 | a._ml_task = MULTICLASS_CLASSIFICATION 54 | a._handle_drastic_imbalance(X, y) 55 | 56 | self.assertEqual(X.shape[0], 130) 57 | self.assertEqual(X.shape[1], 3) 58 | self.assertEqual(y.shape[0], 130) 59 | 60 | def test_handle_drastic_imbalance_sample_weight(self): 61 | a = AutoML( 62 | results_path=self.automl_dir, 63 | total_time_limit=10, 64 
| algorithms=["Random Forest"], 65 | train_ensemble=False, 66 | validation_strategy={ 67 | "validation_type": "kfold", 68 | "k_folds": 10, 69 | "shuffle": True, 70 | "stratify": True, 71 | }, 72 | start_random_models=1, 73 | ) 74 | 75 | rows = 100 76 | X = pd.DataFrame( 77 | { 78 | "f1": np.random.rand(rows), 79 | "f2": np.random.rand(rows), 80 | "f3": np.random.rand(rows), 81 | } 82 | ) 83 | y = np.ones(rows) 84 | sample_weight = pd.Series(np.array(range(rows)), name="sample_weight") 85 | 86 | y[:1] = 0 87 | y[10:11] = 2 88 | 89 | y = pd.Series(np.array(y), name="target") 90 | a._ml_task = MULTICLASS_CLASSIFICATION 91 | a._handle_drastic_imbalance(X, y, sample_weight) 92 | 93 | self.assertEqual(X.shape[0], 138) 94 | self.assertEqual(X.shape[1], 3) 95 | self.assertEqual(y.shape[0], 138) 96 | 97 | self.assertEqual(np.sum(sample_weight[100:119]), 0) 98 | self.assertEqual(np.sum(sample_weight[119:138]), 19 * 10) 99 | 100 | def test_imbalance_dont_change_data_after_fit(self): 101 | a = AutoML( 102 | results_path=self.automl_dir, 103 | total_time_limit=5, 104 | train_ensemble=False, 105 | validation_strategy={ 106 | "validation_type": "kfold", 107 | "k_folds": 10, 108 | "shuffle": True, 109 | "stratify": True, 110 | }, 111 | start_random_models=1, 112 | explain_level=0, 113 | ) 114 | 115 | rows = 100 116 | X = pd.DataFrame( 117 | { 118 | "f1": np.random.rand(rows), 119 | "f2": np.random.rand(rows), 120 | "f3": np.random.rand(rows), 121 | } 122 | ) 123 | y = np.ones(rows) 124 | 125 | y[:8] = 0 126 | y[10:12] = 2 127 | sample_weight = np.ones(rows) 128 | 129 | a.fit(X, y, sample_weight=sample_weight) 130 | 131 | # original data **without** inserted samples to handle imbalance 132 | self.assertEqual(X.shape[0], rows) 133 | self.assertEqual(y.shape[0], rows) 134 | self.assertEqual(sample_weight.shape[0], rows) 135 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_random_forest.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.random_forest import ( 9 | RandomForestAlgorithm, 10 | RandomForestRegressorAlgorithm, 11 | additional, 12 | regression_additional, 13 | ) 14 | from supervised.utils.metric import Metric 15 | 16 | additional["trees_in_step"] = 1 17 | regression_additional["trees_in_step"] = 1 18 | additional["max_steps"] = 1 19 | regression_additional["max_steps"] = 1 20 | 21 | 22 | class RandomForestRegressorAlgorithmTest(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.X, cls.y = datasets.make_regression( 26 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 27 | ) 28 | 29 | def test_reproduce_fit(self): 30 | metric = Metric({"name": "mse"}) 31 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"} 32 | prev_loss = None 33 | for _ in range(3): 34 | model = RandomForestRegressorAlgorithm(params) 35 | model.fit(self.X, self.y) 36 | y_predicted = model.predict(self.X) 37 | loss = metric(self.y, y_predicted) 38 | if prev_loss is not None: 39 | assert_almost_equal(prev_loss, loss) 40 | prev_loss = loss 41 | 42 | 43 | class RandomForestAlgorithmTest(unittest.TestCase): 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.X, cls.y = datasets.make_classification( 47 | n_samples=100, 48 | n_features=5, 49 | n_informative=4, 50 | n_redundant=1, 51 | 
n_classes=2, 52 | n_clusters_per_class=3, 53 | n_repeated=0, 54 | shuffle=False, 55 | random_state=0, 56 | ) 57 | 58 | def test_reproduce_fit(self): 59 | metric = Metric({"name": "logloss"}) 60 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"} 61 | prev_loss = None 62 | for _ in range(3): 63 | model = RandomForestAlgorithm(params) 64 | model.fit(self.X, self.y) 65 | y_predicted = model.predict(self.X) 66 | loss = metric(self.y, y_predicted) 67 | if prev_loss is not None: 68 | assert_almost_equal(prev_loss, loss) 69 | prev_loss = loss 70 | 71 | def test_fit_predict(self): 72 | metric = Metric({"name": "logloss"}) 73 | params = {"ml_task": "binary_classification"} 74 | rf = RandomForestAlgorithm(params) 75 | 76 | rf.fit(self.X, self.y) 77 | y_predicted = rf.predict(self.X) 78 | self.assertTrue(metric(self.y, y_predicted) < 1.5) 79 | 80 | def test_copy(self): 81 | metric = Metric({"name": "logloss"}) 82 | rf = RandomForestAlgorithm({"ml_task": "binary_classification"}) 83 | rf.fit(self.X, self.y) 84 | y_predicted = rf.predict(self.X) 85 | loss = metric(self.y, y_predicted) 86 | 87 | rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"}) 88 | rf2 = rf.copy() 89 | self.assertEqual(type(rf), type(rf2)) 90 | y_predicted = rf2.predict(self.X) 91 | loss2 = metric(self.y, y_predicted) 92 | assert_almost_equal(loss, loss2) 93 | 94 | def test_save_and_load(self): 95 | metric = Metric({"name": "logloss"}) 96 | rf = RandomForestAlgorithm({"ml_task": "binary_classification"}) 97 | rf.fit(self.X, self.y) 98 | y_predicted = rf.predict(self.X) 99 | loss = metric(self.y, y_predicted) 100 | 101 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 102 | 103 | rf.save(filename) 104 | rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"}) 105 | rf2.load(filename) 106 | # Finished with the file, delete it 107 | os.remove(filename) 108 | 109 | y_predicted = rf2.predict(self.X) 110 | loss2 = metric(self.y, y_predicted) 111 | assert_almost_equal(loss, loss2) 112 | 113 | def test_is_fitted(self): 114 | model = RandomForestAlgorithm({"ml_task": "binary_classification"}) 115 | self.assertFalse(model.is_fitted()) 116 | model.fit(self.X, self.y) 117 | self.assertTrue(model.is_fitted()) 118 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_extra_trees.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.extra_trees import ( 9 | ExtraTreesAlgorithm, 10 | ExtraTreesRegressorAlgorithm, 11 | additional, 12 | regression_additional, 13 | ) 14 | from supervised.utils.metric import Metric 15 | 16 | additional["trees_in_step"] = 1 17 | regression_additional["trees_in_step"] = 1 18 | additional["max_steps"] = 1 19 | regression_additional["max_steps"] = 1 20 | 21 | 22 | class ExtraTreesRegressorAlgorithmTest(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.X, cls.y = datasets.make_regression( 26 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 27 | ) 28 | 29 | def test_reproduce_fit(self): 30 | metric = Metric({"name": "mse"}) 31 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"} 32 | prev_loss = None 33 | for _ in range(3): 34 | model = ExtraTreesRegressorAlgorithm(params) 35 | model.fit(self.X, self.y) 36 | 
y_predicted = model.predict(self.X) 37 | loss = metric(self.y, y_predicted) 38 | if prev_loss is not None: 39 | assert_almost_equal(prev_loss, loss) 40 | prev_loss = loss 41 | 42 | 43 | class ExtraTreesAlgorithmTest(unittest.TestCase): 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.X, cls.y = datasets.make_classification( 47 | n_samples=100, 48 | n_features=5, 49 | n_informative=4, 50 | n_redundant=1, 51 | n_classes=2, 52 | n_clusters_per_class=3, 53 | n_repeated=0, 54 | shuffle=False, 55 | random_state=0, 56 | ) 57 | 58 | def test_reproduce_fit(self): 59 | metric = Metric({"name": "logloss"}) 60 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"} 61 | prev_loss = None 62 | for _ in range(3): 63 | model = ExtraTreesAlgorithm(params) 64 | model.fit(self.X, self.y) 65 | y_predicted = model.predict(self.X) 66 | loss = metric(self.y, y_predicted) 67 | if prev_loss is not None: 68 | assert_almost_equal(prev_loss, loss) 69 | prev_loss = loss 70 | 71 | def test_fit_predict(self): 72 | metric = Metric({"name": "logloss"}) 73 | params = {"trees_in_step": 50, "ml_task": "binary_classification"} 74 | rf = ExtraTreesAlgorithm(params) 75 | 76 | rf.fit(self.X, self.y) 77 | y_predicted = rf.predict(self.X) 78 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 79 | 80 | def test_copy(self): 81 | metric = Metric({"name": "logloss"}) 82 | rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 83 | rf.fit(self.X, self.y) 84 | y_predicted = rf.predict(self.X) 85 | loss = metric(self.y, y_predicted) 86 | 87 | rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 88 | rf2 = rf.copy() 89 | self.assertEqual(type(rf), type(rf2)) 90 | y_predicted = rf2.predict(self.X) 91 | loss2 = metric(self.y, y_predicted) 92 | assert_almost_equal(loss, loss2) 93 | 94 | def test_save_and_load(self): 95 | metric = Metric({"name": "logloss"}) 96 | rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 97 | rf.fit(self.X, self.y) 98 | y_predicted = rf.predict(self.X) 99 | loss = metric(self.y, y_predicted) 100 | 101 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 102 | 103 | rf.save(filename) 104 | rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 105 | rf2.load(filename) 106 | # Finished with the file, delete it 107 | os.remove(filename) 108 | 109 | y_predicted = rf2.predict(self.X) 110 | loss2 = metric(self.y, y_predicted) 111 | assert_almost_equal(loss, loss2) 112 | 113 | def test_is_fitted(self): 114 | params = {"trees_in_step": 50, "ml_task": "binary_classification"} 115 | model = ExtraTreesAlgorithm(params) 116 | self.assertFalse(model.is_fitted()) 117 | model.fit(self.X, self.y) 118 | self.assertTrue(model.is_fitted()) 119 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_lightgbm.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from numpy.testing import assert_almost_equal 8 | from sklearn import datasets 9 | 10 | from supervised.algorithms.lightgbm import LightgbmAlgorithm, additional 11 | from supervised.utils.metric import Metric 12 | 13 | additional["max_rounds"] = 1 14 | 15 | 16 | class LightgbmAlgorithmTest(unittest.TestCase): 17 | @classmethod 18 | def setUpClass(cls): 19 | cls.X, cls.y = datasets.make_classification( 20 | n_samples=100, 21 | n_features=5, 22 | n_informative=4, 23 | n_redundant=1, 24 | 
n_classes=2, 25 | n_clusters_per_class=3, 26 | n_repeated=0, 27 | shuffle=False, 28 | random_state=0, 29 | ) 30 | cls.params = { 31 | "metric": "binary_logloss", 32 | "num_leaves": "2", 33 | "learning_rate": 0.1, 34 | "feature_fraction": 0.8, 35 | "bagging_fraction": 0.8, 36 | "bagging_freq": 1, 37 | "seed": 1, 38 | "early_stopping_rounds": 0, 39 | } 40 | 41 | def test_reproduce_fit(self): 42 | metric = Metric({"name": "logloss"}) 43 | prev_loss = None 44 | for i in range(3): 45 | model = LightgbmAlgorithm(self.params) 46 | model.fit(self.X, self.y) 47 | y_predicted = model.predict(self.X) 48 | loss = metric(self.y, y_predicted) 49 | if prev_loss is not None: 50 | assert_almost_equal(prev_loss, loss) 51 | prev_loss = loss 52 | 53 | def test_fit_predict(self): 54 | metric = Metric({"name": "logloss"}) 55 | lgb = LightgbmAlgorithm(self.params) 56 | lgb.fit(self.X, self.y) 57 | y_predicted = lgb.predict(self.X) 58 | loss = metric(self.y, y_predicted) 59 | self.assertTrue(loss < 0.7) 60 | 61 | def test_copy(self): 62 | # train model #1 63 | metric = Metric({"name": "logloss"}) 64 | lgb = LightgbmAlgorithm(self.params) 65 | lgb.fit(self.X, self.y) 66 | y_predicted = lgb.predict(self.X) 67 | loss = metric(self.y, y_predicted) 68 | # create model #2 69 | lgb2 = LightgbmAlgorithm(self.params) 70 | # model #2 is set to None, while initialized 71 | self.assertTrue(lgb2.model is None) 72 | # do a copy and use it for predictions 73 | lgb2 = lgb.copy() 74 | self.assertEqual(type(lgb), type(lgb2)) 75 | y_predicted = lgb2.predict(self.X) 76 | loss2 = metric(self.y, y_predicted) 77 | self.assertEqual(loss, loss2) 78 | 79 | def test_save_and_load(self): 80 | metric = Metric({"name": "logloss"}) 81 | lgb = LightgbmAlgorithm(self.params) 82 | lgb.fit(self.X, self.y) 83 | y_predicted = lgb.predict(self.X) 84 | loss = metric(self.y, y_predicted) 85 | 86 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 87 | lgb.save(filename) 88 | lgb2 = LightgbmAlgorithm({}) 89 | self.assertTrue(lgb.uid != lgb2.uid) 90 | self.assertTrue(lgb2.model is None) 91 | lgb2.load(filename) 92 | # Finished with the file, delete it 93 | os.remove(filename) 94 | 95 | y_predicted = lgb2.predict(self.X) 96 | loss2 = metric(self.y, y_predicted) 97 | assert_almost_equal(loss, loss2) 98 | 99 | def test_get_metric_name(self): 100 | model = LightgbmAlgorithm(self.params) 101 | self.assertEqual(model.get_metric_name(), "logloss") 102 | 103 | def test_restricted_characters_in_feature_name(self): 104 | df = pd.DataFrame( 105 | { 106 | "y": np.random.randint(0, 2, size=100), 107 | "[test1]": np.random.uniform(0, 1, size=100), 108 | "test2 < 1": np.random.uniform(0, 1, size=100), 109 | } 110 | ) 111 | 112 | y = df.iloc[:, 0] 113 | X = df.iloc[:, 1:] 114 | 115 | metric = Metric({"name": "logloss"}) 116 | params = {"objective": "binary:logistic", "eval_metric": "logloss"} 117 | lgb = LightgbmAlgorithm(self.params) 118 | lgb.fit(X, y) 119 | lgb.predict(X) 120 | 121 | def test_is_fitted(self): 122 | model = LightgbmAlgorithm(self.params) 123 | self.assertFalse(model.is_fitted()) 124 | model.fit(self.X, self.y) 125 | self.assertTrue(model.is_fitted()) 126 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing_utils.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from scipy import stats 4 | from sklearn import preprocessing 5 | 6 | 7 | class 
PreprocessingUtilsException(Exception): 8 | pass 9 | 10 | 11 | class PreprocessingUtils(object): 12 | CATEGORICAL = "categorical" 13 | CONTINOUS = "continous" 14 | DISCRETE = "discrete" 15 | DATETIME = "datetime" 16 | TEXT = "text" 17 | 18 | @staticmethod 19 | def get_type(x): 20 | if len(x.shape) > 1: 21 | if x.shape[1] != 1: 22 | raise PreprocessingUtilsException( 23 | "Please select one column to get its type" 24 | ) 25 | col_type = str(x.dtype) 26 | 27 | data_type = PreprocessingUtils.CATEGORICAL 28 | if col_type.startswith("float"): 29 | data_type = PreprocessingUtils.CONTINOUS 30 | elif col_type.startswith("int") or col_type.startswith("uint"): 31 | data_type = PreprocessingUtils.DISCRETE 32 | elif col_type.startswith("datetime"): 33 | data_type = PreprocessingUtils.DATETIME 34 | elif col_type.startswith("category"): 35 | # do not check the additional condition for text feature 36 | # treat it as categorical 37 | return PreprocessingUtils.CATEGORICAL 38 | 39 | if data_type == PreprocessingUtils.CATEGORICAL: 40 | # check maybe this categorical is a text 41 | # it is a text, if: 42 | # has more than 200 unique values 43 | # more than half of rows is unique 44 | unique_cnt = len(np.unique(x[~pd.isnull(x)])) 45 | if unique_cnt > 200 and unique_cnt > int(0.5 * x.shape[0]): 46 | data_type = PreprocessingUtils.TEXT 47 | 48 | return data_type 49 | 50 | @staticmethod 51 | def is_categorical(x_org): 52 | x = x_org[~pd.isnull(x_org)] 53 | return PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL 54 | 55 | @staticmethod 56 | def is_datetime(x_org): 57 | x = x_org[~pd.isnull(x_org)] 58 | return PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME 59 | 60 | @staticmethod 61 | def is_text(x_org): 62 | x = x_org[~pd.isnull(x_org)] 63 | return PreprocessingUtils.get_type(x) == PreprocessingUtils.TEXT 64 | 65 | @staticmethod 66 | def is_0_1(x_org): 67 | x = x_org[~pd.isnull(x_org)] 68 | u = np.unique(x) 69 | if len(u) != 2: 70 | return False 71 | return 0 in u and 1 in u 72 | 73 | @staticmethod 74 | def num_class(x_org): 75 | x = x_org[~pd.isnull(x_org)] 76 | u = np.unique(x) 77 | return len(u) 78 | 79 | @staticmethod 80 | def is_scale_needed(x_org): 81 | x = x_org[~pd.isnull(x_org)] 82 | abs_avg = np.abs(np.mean(x)) 83 | stddev = np.std(x) 84 | if abs_avg > 0.5 or stddev > 1.5: 85 | return True 86 | return False 87 | 88 | @staticmethod 89 | def is_log_scale_needed(x_org): 90 | x_full = np.array(x_org[~pd.isnull(x_org)]) 91 | # first scale on raw data 92 | x = preprocessing.scale(x_full) 93 | # second scale on log data 94 | x_log = preprocessing.scale(np.log(x_full - np.min(x_full) + 1)) 95 | 96 | # the old approach, let's check how new approach will work 97 | # original_skew = np.abs(stats.skew(x)) 98 | # log_skew = np.abs(stats.skew(x_log)) 99 | # return log_skew < original_skew 100 | ######################################################################## 101 | # p is probability of being normal distributions 102 | k2, p1 = stats.normaltest(x) 103 | k2, p2 = stats.normaltest(x_log) 104 | 105 | return p2 > p1 106 | 107 | @staticmethod 108 | def is_na(x): 109 | return np.sum(pd.isnull(x) == True) > 0 110 | 111 | @staticmethod 112 | def get_most_frequent(x): 113 | a = x.value_counts() 114 | first = sorted(dict(a).items(), key=lambda x: -x[1])[0] 115 | return first[0] 116 | 117 | @staticmethod 118 | def get_min(x): 119 | v = np.amin(np.nanmin(x)) 120 | if pd.isnull(v): 121 | return 0 122 | return float(v) 123 | 124 | @staticmethod 125 | def get_mean(x): 126 | v = np.nanmean(x) 
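        # Editor's note: np.nanmean ignores NaN entries; if the column is entirely
        # NaN the result itself is NaN, which the check below maps to a 0.0 fill value.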
127 | if pd.isnull(v): 128 | return 0 129 | return float(v) 130 | 131 | @staticmethod 132 | def get_median(x): 133 | v = np.nanmedian(x) 134 | if pd.isnull(v): 135 | return 0 136 | return float(v) 137 | ``` -------------------------------------------------------------------------------- /tests/tests_fairness/test_binary_classification.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | 9 | 10 | class FairnessInBinaryClassificationTest(unittest.TestCase): 11 | automl_dir = "automl_fairness_testing" 12 | 13 | def tearDown(self): 14 | shutil.rmtree(self.automl_dir, ignore_errors=True) 15 | 16 | def test_init(self): 17 | X = np.random.uniform(size=(30, 2)) 18 | y = np.random.randint(0, 2, size=(30,)) 19 | S = pd.DataFrame({"sensitive": ["A", "B"] * 15}) 20 | 21 | automl = AutoML( 22 | results_path=self.automl_dir, 23 | model_time_limit=10, 24 | algorithms=["Xgboost"], 25 | explain_level=0, 26 | train_ensemble=False, 27 | stack_models=False, 28 | validation_strategy={"validation_type": "split"}, 29 | start_random_models=1, 30 | ) 31 | 32 | automl.fit(X, y, sensitive_features=S) 33 | 34 | self.assertGreater(len(automl._models), 0) 35 | 36 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 37 | self.assertEqual(len(sensitive_features_names), 1) 38 | self.assertTrue("sensitive" in sensitive_features_names) 39 | 40 | self.assertTrue(automl._models[0].get_fairness_metric("sensitive") is not None) 41 | self.assertTrue(len(automl._models[0].get_fairness_optimization()) > 1) 42 | self.assertTrue(automl._models[0].get_worst_fairness() is not None) 43 | self.assertTrue(automl._models[0].get_best_fairness() is not None) 44 | 45 | def test_arguments(self): 46 | X = np.random.uniform(size=(30, 2)) 47 | y = np.random.randint(0, 2, size=(30,)) 48 | S = pd.DataFrame({"sensitive": ["A", "B"] * 15}) 49 | 50 | automl = AutoML( 51 | results_path=self.automl_dir, 52 | model_time_limit=10, 53 | algorithms=["Xgboost"], 54 | privileged_groups=[{"sensitive": "A"}], 55 | underprivileged_groups=[{"sensitive": "B"}], 56 | fairness_metric="demographic_parity_ratio", 57 | fairness_threshold=0.2, 58 | explain_level=0, 59 | train_ensemble=False, 60 | stack_models=False, 61 | validation_strategy={"validation_type": "split"}, 62 | start_random_models=1, 63 | ) 64 | 65 | automl.fit(X, y, sensitive_features=S) 66 | 67 | self.assertGreater(len(automl._models), 0) 68 | 69 | def test_wrong_metric_name(self): 70 | X = np.random.uniform(size=(30, 2)) 71 | y = np.random.randint(0, 2, size=(30,)) 72 | S = pd.DataFrame({"sensitive": ["A", "B"] * 15}) 73 | 74 | with self.assertRaises(ValueError) as context: 75 | automl = AutoML( 76 | results_path=self.automl_dir, 77 | model_time_limit=10, 78 | algorithms=["Xgboost"], 79 | privileged_groups=[{"sensitive": "A"}], 80 | underprivileged_groups=[{"sensitive": "B"}], 81 | fairness_metric="wrong_metric_name", 82 | fairness_threshold=0.2, 83 | explain_level=0, 84 | train_ensemble=False, 85 | stack_models=False, 86 | validation_strategy={"validation_type": "split"}, 87 | start_random_models=1, 88 | ) 89 | automl.fit(X, y, sensitive_features=S) 90 | self.assertTrue("is not allowed" in str(context.exception)) 91 | 92 | def test_two_sensitive_features(self): 93 | X = np.random.uniform(size=(30, 2)) 94 | y = np.random.randint(0, 2, size=(30,)) 95 | S = pd.DataFrame( 96 | { 97 | "sensitive_1": ["White", 
"Black"] * 15, 98 | "sensitive_2": ["Male", "Female"] * 15, 99 | } 100 | ) 101 | 102 | automl = AutoML( 103 | results_path=self.automl_dir, 104 | model_time_limit=10, 105 | algorithms=["Xgboost"], 106 | explain_level=0, 107 | train_ensemble=False, 108 | stack_models=False, 109 | start_random_models=1, 110 | ) 111 | 112 | automl.fit(X, y, sensitive_features=S) 113 | 114 | self.assertGreater(len(automl._models), 0) 115 | 116 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 117 | self.assertEqual(len(sensitive_features_names), 2) 118 | ``` -------------------------------------------------------------------------------- /supervised/fairness/plots.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | 4 | 5 | class FairnessPlots: 6 | @staticmethod 7 | def binary_classification( 8 | fairness_metric, 9 | col_name, 10 | metrics, 11 | selection_rates, 12 | max_selection_rate, 13 | fairness_threshold, 14 | ): 15 | figures = [] 16 | # selection rate figure 17 | fair_selection_rate = max_selection_rate * fairness_threshold 18 | 19 | fig = plt.figure(figsize=(10, 7)) 20 | ax1 = fig.add_subplot(1, 1, 1) 21 | bars = ax1.bar(metrics.index[1:], metrics["Selection Rate"][1:]) 22 | 23 | ax1.spines[["right", "top", "left"]].set_visible(False) 24 | ax1.yaxis.set_visible(False) 25 | _ = ax1.bar_label(bars, padding=5) 26 | 27 | if fairness_metric == "demographic_parity_ratio": 28 | ax1.axhline(y=fair_selection_rate, zorder=0, color="grey", ls="--", lw=1.5) 29 | _ = ax1.text( 30 | y=fair_selection_rate, 31 | x=-0.6, 32 | s="Fairness threshold", 33 | ha="center", 34 | fontsize=12, 35 | bbox=dict(facecolor="white", edgecolor="grey", ls="--"), 36 | ) 37 | _ = ax1.text( 38 | y=1.2 * fair_selection_rate, 39 | x=-0.6, 40 | s="Fair", 41 | ha="center", 42 | fontsize=12, 43 | ) 44 | _ = ax1.text( 45 | y=0.8 * fair_selection_rate, 46 | x=-0.6, 47 | s="Unfair", 48 | ha="center", 49 | fontsize=12, 50 | ) 51 | 52 | ax1.axhspan( 53 | fairness_threshold * max_selection_rate, 54 | 1.25 * np.max(selection_rates[1:]), 55 | color="green", 56 | alpha=0.05, 57 | ) 58 | ax1.axhspan( 59 | 0, fairness_threshold * max_selection_rate, color="red", alpha=0.05 60 | ) 61 | 62 | figures += [ 63 | { 64 | "title": f"Selection Rate for {col_name}", 65 | "fname": f"selection_rate_{col_name}.png", 66 | "figure": fig, 67 | } 68 | ] 69 | 70 | fig, axes = plt.subplots(figsize=(10, 5), ncols=2, sharey=True) 71 | fig.tight_layout() 72 | bars = axes[0].barh( 73 | metrics.index[1:], 74 | metrics["False Negative Rate"][1:], 75 | zorder=10, 76 | color="tab:orange", 77 | ) 78 | xmax = 1.2 * max( 79 | metrics["False Negative Rate"][1:].max(), 80 | metrics["False Positive Rate"][1:].max(), 81 | ) 82 | axes[0].set_xlim(0, xmax) 83 | axes[0].invert_xaxis() 84 | axes[0].set_title("False Negative Rate") 85 | _ = axes[0].bar_label(bars, padding=5) 86 | 87 | bars = axes[1].barh( 88 | metrics.index[1:], 89 | metrics["False Positive Rate"][1:], 90 | zorder=10, 91 | color="tab:blue", 92 | ) 93 | axes[1].tick_params(axis="y", colors="tab:orange") # tick color 94 | axes[1].set_xlim(0, xmax) 95 | axes[1].set_title("False Positive Rate") 96 | _ = axes[1].bar_label(bars, padding=5) 97 | _ = plt.subplots_adjust(wspace=0, top=0.85, bottom=0.1, left=0.18, right=0.95) 98 | 99 | figures += [ 100 | { 101 | "title": f"False Rates for {col_name}", 102 | "fname": f"false_rates_{col_name}.png", 103 | "figure": fig, 104 | } 105 | ] 106 | 107 | 
return figures 108 | 109 | @staticmethod 110 | def regression(fairness_metric, col_name, metrics, fairness_metric_name): 111 | figures = [] 112 | metric_name = fairness_metric.split("@")[1].upper() 113 | 114 | fig = plt.figure(figsize=(10, 7)) 115 | ax1 = fig.add_subplot(1, 1, 1) 116 | bars = ax1.bar(metrics.index[1:], metrics[metric_name][1:]) 117 | 118 | ax1.spines[["right", "top"]].set_visible(False) 119 | # ax1.yaxis.set_visible(False) 120 | ax1.set_ylabel(metric_name) 121 | _ = ax1.bar_label(bars, padding=5) 122 | 123 | figures += [ 124 | { 125 | "title": f"{metric_name} for {col_name}", 126 | "fname": f"{metric_name}_{col_name}.png", 127 | "figure": fig, 128 | } 129 | ] 130 | 131 | return figures 132 | ``` -------------------------------------------------------------------------------- /supervised/validation/validator_custom.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | 4 | import joblib 5 | import numpy as np 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | from supervised.exceptions import AutoMLException 10 | from supervised.utils.utils import load_data 11 | from supervised.validation.validator_base import BaseValidator 12 | 13 | 14 | class CustomValidator(BaseValidator): 15 | def __init__(self, params): 16 | BaseValidator.__init__(self, params) 17 | 18 | cv_path = self.params.get("cv_path") 19 | 20 | if cv_path is None: 21 | raise AutoMLException("You need to specify `cv` as list or iterable") 22 | 23 | self.cv = joblib.load(cv_path) 24 | self.cv = list(self.cv) 25 | 26 | self._results_path = self.params.get("results_path") 27 | self._X_path = self.params.get("X_path") 28 | self._y_path = self.params.get("y_path") 29 | self._sample_weight_path = self.params.get("sample_weight_path") 30 | self._sensitive_features_path = self.params.get("sensitive_features_path") 31 | 32 | if self._X_path is None or self._y_path is None: 33 | raise AutoMLException("No data path set in CustomValidator params") 34 | 35 | folds_path = os.path.join(self._results_path, "folds") 36 | 37 | if not os.path.exists(folds_path): 38 | os.mkdir(folds_path) 39 | 40 | print("Custom validation strategy") 41 | for fold_cnt, (train_index, validation_index) in enumerate(self.cv): 42 | print(f"Split {fold_cnt}.") 43 | print(f"Train {train_index.shape[0]} samples.") 44 | print(f"Validation {validation_index.shape[0]} samples.") 45 | train_index_file = os.path.join( 46 | self._results_path, 47 | "folds", 48 | f"fold_{fold_cnt}_train_indices.npy", 49 | ) 50 | validation_index_file = os.path.join( 51 | self._results_path, 52 | "folds", 53 | f"fold_{fold_cnt}_validation_indices.npy", 54 | ) 55 | 56 | np.save(train_index_file, train_index) 57 | np.save(validation_index_file, validation_index) 58 | 59 | else: 60 | log.debug("Folds split already done, reuse it") 61 | 62 | def get_split(self, k, repeat=0): 63 | try: 64 | train_index_file = os.path.join( 65 | self._results_path, "folds", f"fold_{k}_train_indices.npy" 66 | ) 67 | validation_index_file = os.path.join( 68 | self._results_path, "folds", f"fold_{k}_validation_indices.npy" 69 | ) 70 | 71 | train_index = np.load(train_index_file) 72 | validation_index = np.load(validation_index_file) 73 | 74 | X = load_data(self._X_path) 75 | y = load_data(self._y_path) 76 | y = y["target"] 77 | 78 | sample_weight = None 79 | if self._sample_weight_path is not None: 80 | sample_weight = load_data(self._sample_weight_path) 81 | sample_weight = sample_weight["sample_weight"] 82 | 83 | 
sensitive_features = None 84 | if self._sensitive_features_path is not None: 85 | sensitive_features = load_data(self._sensitive_features_path) 86 | 87 | train_data = {"X": X.iloc[train_index], "y": y.iloc[train_index]} 88 | validation_data = { 89 | "X": X.iloc[validation_index], 90 | "y": y.iloc[validation_index], 91 | } 92 | if sample_weight is not None: 93 | train_data["sample_weight"] = sample_weight.iloc[train_index] 94 | validation_data["sample_weight"] = sample_weight.iloc[validation_index] 95 | if sensitive_features is not None: 96 | train_data["sensitive_features"] = sensitive_features.iloc[train_index] 97 | validation_data["sensitive_features"] = sensitive_features.iloc[ 98 | validation_index 99 | ] 100 | 101 | except Exception as e: 102 | import traceback 103 | 104 | print(traceback.format_exc()) 105 | raise AutoMLException("Problem with custom validation. " + str(e)) 106 | return (train_data, validation_data) 107 | 108 | def get_n_splits(self): 109 | return len(self.cv) 110 | 111 | def get_repeats(self): 112 | return 1 113 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_integration.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | 10 | 11 | class AutoMLIntegrationTest(unittest.TestCase): 12 | automl_dir = "AutoMLIntegrationTest" 13 | 14 | def tearDown(self): 15 | shutil.rmtree(self.automl_dir, ignore_errors=True) 16 | 17 | def test_integration(self): 18 | a = AutoML( 19 | results_path=self.automl_dir, 20 | total_time_limit=1, 21 | explain_level=0, 22 | start_random_models=1, 23 | ) 24 | 25 | X, y = datasets.make_classification( 26 | n_samples=100, 27 | n_features=5, 28 | n_informative=4, 29 | n_redundant=1, 30 | n_classes=2, 31 | n_clusters_per_class=3, 32 | n_repeated=0, 33 | shuffle=False, 34 | random_state=0, 35 | ) 36 | 37 | a.fit(X, y) 38 | p = a.predict(X) 39 | self.assertIsInstance(p, np.ndarray) 40 | self.assertEqual(len(p), X.shape[0]) 41 | 42 | def test_one_column_input_regression(self): 43 | a = AutoML( 44 | results_path=self.automl_dir, 45 | total_time_limit=5, 46 | explain_level=0, 47 | start_random_models=1, 48 | ) 49 | 50 | X, y = datasets.make_regression(n_features=1) 51 | 52 | a.fit(X, y) 53 | p = a.predict(X) 54 | 55 | self.assertIsInstance(p, np.ndarray) 56 | self.assertEqual(len(p), X.shape[0]) 57 | 58 | def test_one_column_input_bin_class(self): 59 | a = AutoML( 60 | results_path=self.automl_dir, 61 | total_time_limit=5, 62 | explain_level=0, 63 | start_random_models=1, 64 | ) 65 | 66 | X = pd.DataFrame({"feature_1": np.random.rand(100)}) 67 | y = (np.random.rand(X.shape[0]) > 0.5).astype(int) 68 | 69 | a.fit(X, y) 70 | p = a.predict(X) 71 | 72 | self.assertIsInstance(p, np.ndarray) 73 | self.assertEqual(len(p), X.shape[0]) 74 | 75 | def test_different_input_types(self): 76 | """Test the different data input types for AutoML""" 77 | model = AutoML( 78 | total_time_limit=10, 79 | explain_level=0, 80 | start_random_models=1, 81 | algorithms=["Linear"], 82 | verbose=0, 83 | ) 84 | X, y = datasets.make_regression() 85 | 86 | # First test - X and y as numpy arrays 87 | 88 | pred = model.fit(X, y).predict(X) 89 | 90 | self.assertIsInstance(pred, np.ndarray) 91 | self.assertEqual(len(pred), X.shape[0]) 92 | 93 | del model 94 | 95 | model = AutoML( 96 | total_time_limit=10, 97 | explain_level=0, 
98 | start_random_models=1, 99 | algorithms=["Linear"], 100 | verbose=0, 101 | ) 102 | # Second test - X and y as pandas dataframe 103 | X_pandas = pd.DataFrame(X) 104 | y_pandas = pd.DataFrame(y) 105 | pred_pandas = model.fit(X_pandas, y_pandas).predict(X_pandas) 106 | 107 | self.assertIsInstance(pred_pandas, np.ndarray) 108 | self.assertEqual(len(pred_pandas), X.shape[0]) 109 | 110 | del model 111 | 112 | model = AutoML( 113 | total_time_limit=10, 114 | explain_level=0, 115 | start_random_models=1, 116 | algorithms=["Linear"], 117 | verbose=0, 118 | ) 119 | # Third test - X and y as lists 120 | X_list = pd.DataFrame(X).values.tolist() 121 | y_list = pd.DataFrame(y).values.tolist() 122 | pred_list = model.fit(X_pandas, y_pandas).predict(X_pandas) 123 | 124 | self.assertIsInstance(pred_list, np.ndarray) 125 | self.assertEqual(len(pred_list), X.shape[0]) 126 | 127 | def test_integration_float16_data(self): 128 | a = AutoML( 129 | results_path=self.automl_dir, 130 | total_time_limit=1, 131 | explain_level=0, 132 | start_random_models=1, 133 | ) 134 | 135 | X, y = datasets.make_classification( 136 | n_samples=100, 137 | n_features=5, 138 | n_informative=4, 139 | n_redundant=1, 140 | n_classes=2, 141 | n_clusters_per_class=3, 142 | n_repeated=0, 143 | shuffle=False, 144 | random_state=0, 145 | ) 146 | X = pd.DataFrame(X) 147 | X = X.astype(float) 148 | a.fit(X, y) 149 | p = a.predict(X) 150 | self.assertIsInstance(p, np.ndarray) 151 | self.assertEqual(len(p), X.shape[0]) 152 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/algorithm.py: -------------------------------------------------------------------------------- ```python 1 | import uuid 2 | 3 | import numpy as np 4 | 5 | from sklearn.base import BaseEstimator 6 | 7 | from supervised.utils.common import construct_learner_name 8 | from supervised.utils.importance import PermutationImportance 9 | from supervised.utils.shap import PlotSHAP 10 | 11 | 12 | class BaseAlgorithm(BaseEstimator): 13 | """ 14 | This is an abstract class. 15 | All algorithms inherit from BaseAlgorithm. 
16 | """ 17 | 18 | algorithm_name = "Unknown" 19 | algorithm_short_name = "Unknown" 20 | 21 | def __init__(self, params): 22 | self.params = params 23 | self.stop_training = False 24 | self.library_version = None 25 | self.model = None 26 | self.uid = params.get("uid", str(uuid.uuid4())) 27 | self.ml_task = params.get("ml_task") 28 | self.model_file_path = None 29 | self.name = "amazing_learner" 30 | 31 | def set_learner_name(self, fold, repeat, repeats): 32 | self.name = construct_learner_name(fold, repeat, repeats) 33 | 34 | def is_fitted(self): 35 | # base class method 36 | return False 37 | 38 | def reload(self): 39 | if not self.is_fitted() and self.model_file_path is not None: 40 | self.load(self.model_file_path) 41 | 42 | def fit( 43 | self, 44 | X, 45 | y, 46 | sample_weight=None, 47 | X_validation=None, 48 | y_validation=None, 49 | sample_weight_validation=None, 50 | log_to_file=None, 51 | max_time=None, 52 | ): 53 | pass 54 | 55 | def predict(self, X): 56 | pass 57 | 58 | # needed for feature importance 59 | def predict_proba(self, X): 60 | y = self.predict(X) 61 | if "num_class" in self.params and self.params["num_class"] > 2: 62 | return y 63 | return np.column_stack((1 - y, y)) 64 | 65 | def update(self, update_params): 66 | pass 67 | 68 | def copy(self): 69 | pass 70 | 71 | def save(self, model_file_path): 72 | pass 73 | 74 | def load(self, model_file_path): 75 | pass 76 | 77 | def get_fname(self): 78 | return f"{self.name}.{self.file_extension()}" 79 | 80 | def interpret( 81 | self, 82 | X_train, 83 | y_train, 84 | X_validation, 85 | y_validation, 86 | model_file_path, 87 | learner_name, 88 | target_name=None, 89 | class_names=None, 90 | metric_name=None, 91 | ml_task=None, 92 | explain_level=2, 93 | ): 94 | # do not produce feature importance for Baseline 95 | if self.algorithm_short_name == "Baseline": 96 | return 97 | if explain_level > 0: 98 | PermutationImportance.compute_and_plot( 99 | self, 100 | X_validation, 101 | y_validation, 102 | model_file_path, 103 | learner_name, 104 | metric_name, 105 | ml_task, 106 | self.params.get("n_jobs", -1), 107 | ) 108 | if explain_level > 1: 109 | PlotSHAP.compute( 110 | self, 111 | X_train, 112 | y_train, 113 | X_validation, 114 | y_validation, 115 | model_file_path, 116 | learner_name, 117 | class_names, 118 | ml_task, 119 | ) 120 | 121 | def get_metric_name(self): 122 | return None 123 | 124 | def get_params(self): 125 | params = { 126 | "library_version": self.library_version, 127 | "algorithm_name": self.algorithm_name, 128 | "algorithm_short_name": self.algorithm_short_name, 129 | "uid": self.uid, 130 | "params": self.params, 131 | "name": self.name, 132 | } 133 | if hasattr(self, "best_ntree_limit") and self.best_ntree_limit is not None: 134 | params["best_ntree_limit"] = self.best_ntree_limit 135 | return params 136 | 137 | def set_params(self, json_desc, learner_path): 138 | self.library_version = json_desc.get("library_version", self.library_version) 139 | self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name) 140 | self.algorithm_short_name = json_desc.get( 141 | "algorithm_short_name", self.algorithm_short_name 142 | ) 143 | self.uid = json_desc.get("uid", self.uid) 144 | self.params = json_desc.get("params", self.params) 145 | self.name = json_desc.get("name", self.name) 146 | self.model_file_path = learner_path 147 | 148 | if hasattr(self, "best_ntree_limit"): 149 | self.best_ntree_limit = json_desc.get( 150 | "best_ntree_limit", self.best_ntree_limit 151 | ) 152 | ``` 
-------------------------------------------------------------------------------- /tests/data/iris_missing_values_missing_target.csv: -------------------------------------------------------------------------------- ``` 1 | feature_1,feature_2,feature_3,feature_4,class 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3.0,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,,Iris-setosa 5 | 4.6,3.1,1.5,,Iris-setosa 6 | 5.0,3.6,1.4,0.2,Iris-setosa 7 | ,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5.0,3.4,1.5,0.2,Iris-setosa 10 | 4.4,,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,,0.2,Iris-setosa 14 | 4.8,3.0,1.4,0.1,Iris-setosa 15 | 4.3,3.0,1.1,0.1,Iris-setosa 16 | 5.8,4.0,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3, 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1.0,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5.0,3.0,1.6,0.2,Iris-setosa 28 | 5.0,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5.0,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3.0,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5.0,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5.0,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3.0,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5.0,3.3,1.4,0.2,Iris-setosa 52 | 7.0,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5, 55 | 5.5,2.3,4.0,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1.0,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5.0,2.0,3.5,1.0,Iris-versicolor 63 | 5.9,3.0,4.2,1.5,Iris-versicolor 64 | 6.0,2.2,4.0,1.0,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3.0,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1.0,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4.0,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3.0,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3.0,5.0,1.7,Iris-versicolor 80 | 6.0,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1.0,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1.0,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6.0,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3.0,4.5,1.5,Iris-versicolor 87 | 6.0,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3.0,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4.0,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3.0,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4.0,1.2,Iris-versicolor 95 | 5.0,2.3,3.3,1.0,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 
5.7,3.0,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3.0,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6.0,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3.0,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3.0,5.8,2.2,Iris-virginica 107 | 7.6,3.0,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2.0,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3.0,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5.0,2.0,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3.0,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6.0,2.2,5.0,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2.0,Iris-virginica 124 | 7.7,2.8,6.7,2.0,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6.0,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3.0,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3.0,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2.0,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3.0,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6.0,3.0,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3.0,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5.0,1.9,Iris-virginica 149 | 6.5,3.0,5.2,2.0,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3.0,5.1,1.8,Iris-virginica 152 | 153 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing_categorical.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from supervised.preprocessing.label_binarizer import LabelBinarizer 5 | from supervised.preprocessing.label_encoder import LabelEncoder 6 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 7 | 8 | 9 | class PreprocessingCategorical(object): 10 | CONVERT_ONE_HOT = "categorical_to_onehot" 11 | CONVERT_INTEGER = "categorical_to_int" 12 | 13 | FEW_CATEGORIES = "few_categories" 14 | MANY_CATEGORIES = "many_categories" 15 | 16 | def __init__(self, columns=[], method=CONVERT_INTEGER): 17 | self._convert_method = method 18 | self._convert_params = {} 19 | self._columns = columns 20 | self._enc = None 21 | 22 | def fit(self, X, y=None): 23 | self._fit_categorical_convert(X) 24 | 25 | def _fit_categorical_convert(self, X): 26 | for column in self._columns: 27 | if PreprocessingUtils.get_type(X[column]) != PreprocessingUtils.CATEGORICAL: 28 | # no need to convert, already a number 29 | continue 30 | # limit categories - it is needed when doing one hot encoding 31 | # this code is also used in predict.py file 32 | # and transform_utils.py 33 | # TODO it needs refactoring !!! 
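            # Editor's note: columns with more than 200 distinct values are never
            # one-hot encoded; they fall back to integer encoding below, even when
            # CONVERT_ONE_HOT was requested.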
34 | too_much_categories = len(np.unique(list(X[column].values))) > 200 35 | lbl = None 36 | if ( 37 | self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT 38 | and not too_much_categories 39 | ): 40 | lbl = LabelBinarizer() 41 | lbl.fit(X, column) 42 | else: 43 | lbl = LabelEncoder() 44 | lbl.fit(X[column]) 45 | 46 | if lbl is not None: 47 | self._convert_params[column] = lbl.to_json() 48 | 49 | def transform(self, X): 50 | for column, lbl_params in self._convert_params.items(): 51 | if "unique_values" in lbl_params and "new_columns" in lbl_params: 52 | # convert to one hot 53 | lbl = LabelBinarizer() 54 | lbl.from_json(lbl_params) 55 | X = lbl.transform(X, column) 56 | else: 57 | # convert to integer 58 | lbl = LabelEncoder() 59 | lbl.from_json(lbl_params) 60 | transformed_values = lbl.transform(X.loc[:, column]) 61 | # check for pandas FutureWarning: Setting an item 62 | # of incompatible dtype is deprecated and will raise 63 | # in a future error of pandas. 64 | if transformed_values.dtype != X.loc[:, column].dtype and \ 65 | (X.loc[:, column].dtype == bool or X.loc[:, column].dtype == int): 66 | X = X.astype({column: transformed_values.dtype}) 67 | if isinstance(X[column].dtype, pd.CategoricalDtype): 68 | X[column] = X[column].astype('object') 69 | X.loc[:, column] = transformed_values 70 | 71 | return X 72 | 73 | def inverse_transform(self, X): 74 | for column, lbl_params in self._convert_params.items(): 75 | if "unique_values" in lbl_params and "new_columns" in lbl_params: 76 | # convert to one hot 77 | lbl = LabelBinarizer() 78 | lbl.from_json(lbl_params) 79 | X = lbl.inverse_transform(X, column) # should raise exception 80 | else: 81 | # convert to integer 82 | lbl = LabelEncoder() 83 | lbl.from_json(lbl_params) 84 | transformed_values = lbl.inverse_transform(X.loc[:, column]) 85 | # check for pandas FutureWarning: Setting an item 86 | # of incompatible dtype is deprecated and will raise 87 | # in a future error of pandas. 
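            # Editor's note: the decoded labels (e.g. the original strings) may not fit
            # the current int/bool column dtype, so the column is up-cast first to avoid
            # the pandas incompatible-dtype assignment warning described above.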
88 | if transformed_values.dtype != X.loc[:, column].dtype and \ 89 | (X.loc[:, column].dtype == bool or X.loc[:, column].dtype == int): 90 | X = X.astype({column: transformed_values.dtype}) 91 | X.loc[:, column] = transformed_values 92 | 93 | return X 94 | 95 | def to_json(self): 96 | params = {} 97 | 98 | if len(self._convert_params) == 0: 99 | return {} 100 | params = { 101 | "convert_method": self._convert_method, 102 | "convert_params": self._convert_params, 103 | "columns": self._columns, 104 | } 105 | return params 106 | 107 | def from_json(self, params): 108 | if params is not None: 109 | self._convert_method = params.get("convert_method", None) 110 | self._columns = params.get("columns", []) 111 | self._convert_params = params.get("convert_params", {}) 112 | 113 | else: 114 | self._convert_method, self._convert_params = None, None 115 | self._columns = [] 116 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_label_encoder.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised.preprocessing.label_encoder import LabelEncoder 8 | 9 | 10 | class LabelEncoderTest(unittest.TestCase): 11 | def test_fit(self): 12 | # training data 13 | d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} 14 | df = pd.DataFrame(data=d) 15 | le = LabelEncoder() 16 | # check first column 17 | le.fit(df["col1"]) 18 | data_json = le.to_json() 19 | # values from column should be in data json 20 | self.assertTrue("a" in data_json) 21 | self.assertTrue("c" in data_json) 22 | self.assertTrue("b" not in data_json) 23 | # there is alphabetical order for values 24 | self.assertEqual(0, data_json["a"]) 25 | self.assertEqual(1, data_json["c"]) 26 | 27 | # check next column 28 | le.fit(df["col2"]) 29 | data_json = le.to_json() 30 | self.assertEqual(0, data_json["d"]) 31 | self.assertEqual(1, data_json["e"]) 32 | self.assertEqual(2, data_json["w"]) 33 | 34 | def test_transform(self): 35 | # training data 36 | d = {"col1": ["a", "a", "c"]} 37 | df = pd.DataFrame(data=d) 38 | # fit encoder 39 | le = LabelEncoder() 40 | le.fit(df["col1"]) 41 | # test data 42 | d_test = {"col2": ["c", "c", "a"]} 43 | df_test = pd.DataFrame(data=d_test) 44 | # transform 45 | y = le.transform(df_test["col2"]) 46 | self.assertEqual(y[0], 1) 47 | self.assertEqual(y[1], 1) 48 | self.assertEqual(y[2], 0) 49 | 50 | def test_transform_with_new_values(self): 51 | # training data 52 | d = {"col1": ["a", "a", "c"]} 53 | df = pd.DataFrame(data=d) 54 | # fit encoder 55 | le = LabelEncoder() 56 | le.fit(df["col1"]) 57 | # test data 58 | d_test = {"col2": ["c", "a", "d", "f"]} 59 | df_test = pd.DataFrame(data=d_test) 60 | # transform 61 | y = le.transform(df_test["col2"]) 62 | self.assertEqual(y[0], 1) 63 | self.assertEqual(y[1], 0) 64 | self.assertEqual(y[2], 2) 65 | self.assertEqual(y[3], 3) 66 | 67 | def test_to_and_from_json(self): 68 | # training data 69 | d = {"col1": ["a", "a", "c"]} 70 | df = pd.DataFrame(data=d) 71 | # fit encoder 72 | le = LabelEncoder() 73 | le.fit(df["col1"]) 74 | 75 | # new encoder 76 | new_le = LabelEncoder() 77 | new_le.from_json(le.to_json()) 78 | 79 | # test data 80 | d_test = {"col2": ["c", "c", "a"]} 81 | df_test = pd.DataFrame(data=d_test) 82 | # transform 83 | y = new_le.transform(df_test["col2"]) 84 | self.assertEqual(y[0], 1) 85 | self.assertEqual(y[1], 1) 86 | self.assertEqual(y[2], 0) 87 | 88 | 
def test_to_and_from_json_booleans(self): 89 | # training data 90 | d = {"col1": [True, False, True]} 91 | df = pd.DataFrame(data=d) 92 | # fit encoder 93 | le = LabelEncoder() 94 | le.fit(df["col1"]) 95 | 96 | # new encoder 97 | new_le = LabelEncoder() 98 | new_le.from_json(json.loads(json.dumps(le.to_json(), indent=4))) 99 | 100 | # test data 101 | d_test = {"col2": [True, False, True]} 102 | df_test = pd.DataFrame(data=d_test) 103 | # transform 104 | y = new_le.transform(df_test["col2"]) 105 | 106 | self.assertEqual(y[0], 1) 107 | self.assertEqual(y[1], 0) 108 | self.assertEqual(y[2], 1) 109 | 110 | def test_fit_on_numeric_categories(self): 111 | # categories are as strings 112 | # but they represent numbers 113 | # we force encoder to sort them by numeric values 114 | # it is needed for computing predictions for many classes 115 | 116 | # training data 117 | d = {"col1": ["1", "10", "2"]} 118 | df = pd.DataFrame(data=d) 119 | le = LabelEncoder(try_to_fit_numeric=True) 120 | # check first column 121 | le.fit(df["col1"]) 122 | data_json = le.to_json() 123 | print(data_json) 124 | # values from column should be in data json 125 | self.assertTrue("1" in data_json) 126 | self.assertTrue("10" in data_json) 127 | self.assertTrue("2" in data_json) 128 | # there is numeric order for values 129 | self.assertEqual(0, data_json["1"]) 130 | self.assertEqual(1, data_json["2"]) 131 | self.assertEqual(2, data_json["10"]) 132 | p = le.transform(df["col1"]) 133 | p2 = le.transform(np.array(df["col1"].values)) 134 | self.assertEqual(p[0], 0) 135 | self.assertEqual(p[1], 2) 136 | self.assertEqual(p[2], 1) 137 | 138 | self.assertEqual(p[0], p2[0]) 139 | self.assertEqual(p[1], p2[1]) 140 | self.assertEqual(p[2], p2[2]) 141 | 142 | new_le = LabelEncoder() 143 | new_le.from_json(json.loads(json.dumps(le.to_json(), indent=4))) 144 | p2 = new_le.transform(df["col1"]) 145 | 146 | self.assertEqual(p[0], p2[0]) 147 | self.assertEqual(p[1], p2[1]) 148 | self.assertEqual(p[2], p2[2]) 149 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_nn.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | from sklearn import preprocessing 8 | 9 | from supervised.algorithms.nn import MLPAlgorithm, MLPRegressorAlgorithm 10 | from supervised.utils.metric import Metric 11 | 12 | 13 | class MLPAlgorithmTest(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.X, cls.y = datasets.make_classification( 17 | n_samples=100, 18 | n_features=5, 19 | n_informative=4, 20 | n_redundant=1, 21 | n_classes=2, 22 | n_clusters_per_class=3, 23 | n_repeated=0, 24 | shuffle=False, 25 | random_state=1, 26 | ) 27 | 28 | cls.params = { 29 | "dense_1_size": 8, 30 | "dense_2_size": 4, 31 | "learning_rate": 0.01, 32 | "ml_task": "binary_classification", 33 | } 34 | 35 | def test_fit_predict(self): 36 | metric = Metric({"name": "logloss"}) 37 | nn = MLPAlgorithm(self.params) 38 | nn.fit(self.X, self.y) 39 | y_predicted = nn.predict_proba(self.X) 40 | loss = metric(self.y, y_predicted) 41 | self.assertLess(loss, 2) 42 | 43 | def test_copy(self): 44 | # train model #1 45 | metric = Metric({"name": "logloss"}) 46 | nn = MLPAlgorithm(self.params) 47 | nn.fit(self.X, self.y) 48 | y_predicted = nn.predict(self.X) 49 | loss = metric(self.y, y_predicted) 50 | # create model #2 51 | nn2 = 
MLPAlgorithm(self.params) 52 | # do a copy and use it for predictions 53 | nn2 = nn.copy() 54 | self.assertEqual(type(nn), type(nn2)) 55 | y_predicted = nn2.predict(self.X) 56 | loss2 = metric(self.y, y_predicted) 57 | self.assertEqual(loss, loss2) 58 | 59 | # the loss of model #2 should not change 60 | y_predicted = nn2.predict(self.X) 61 | loss4 = metric(self.y, y_predicted) 62 | assert_almost_equal(loss2, loss4) 63 | 64 | def test_save_and_load(self): 65 | metric = Metric({"name": "logloss"}) 66 | nn = MLPAlgorithm(self.params) 67 | nn.fit(self.X, self.y) 68 | y_predicted = nn.predict(self.X) 69 | loss = metric(self.y, y_predicted) 70 | 71 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 72 | 73 | nn.save(filename) 74 | json_desc = nn.get_params() 75 | nn2 = MLPAlgorithm(json_desc["params"]) 76 | nn2.load(filename) 77 | # Finished with the file, delete it 78 | os.remove(filename) 79 | 80 | y_predicted = nn2.predict(self.X) 81 | loss2 = metric(self.y, y_predicted) 82 | assert_almost_equal(loss, loss2) 83 | 84 | 85 | class MLPRegressorAlgorithmTest(unittest.TestCase): 86 | @classmethod 87 | def setUpClass(cls): 88 | cls.X, cls.y = datasets.make_regression( 89 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 90 | ) 91 | 92 | cls.params = { 93 | "dense_layers": 2, 94 | "dense_1_size": 8, 95 | "dense_2_size": 4, 96 | "dropout": 0, 97 | "learning_rate": 0.01, 98 | "momentum": 0.9, 99 | "decay": 0.001, 100 | "ml_task": "regression", 101 | } 102 | 103 | cls.y = preprocessing.scale(cls.y) 104 | 105 | def test_fit_predict(self): 106 | metric = Metric({"name": "mse"}) 107 | nn = MLPRegressorAlgorithm(self.params) 108 | nn.fit(self.X, self.y) 109 | y_predicted = nn.predict(self.X) 110 | loss = metric(self.y, y_predicted) 111 | self.assertLess(loss, 2) 112 | 113 | 114 | class MultiClassNeuralNetworkAlgorithmTest(unittest.TestCase): 115 | @classmethod 116 | def setUpClass(cls): 117 | cls.X, cls.y = datasets.make_classification( 118 | n_samples=100, 119 | n_features=5, 120 | n_informative=4, 121 | n_redundant=1, 122 | n_classes=3, 123 | n_clusters_per_class=3, 124 | n_repeated=0, 125 | shuffle=False, 126 | random_state=0, 127 | ) 128 | 129 | cls.params = { 130 | "dense_layers": 2, 131 | "dense_1_size": 8, 132 | "dense_2_size": 4, 133 | "dropout": 0, 134 | "learning_rate": 0.01, 135 | "momentum": 0.9, 136 | "decay": 0.001, 137 | "ml_task": "multiclass_classification", 138 | "num_class": 3, 139 | } 140 | 141 | lb = preprocessing.LabelEncoder() 142 | lb.fit(cls.y) 143 | cls.y = lb.transform(cls.y) 144 | 145 | def test_fit_predict(self): 146 | metric = Metric({"name": "logloss"}) 147 | nn = MLPAlgorithm(self.params) 148 | nn.fit(self.X, self.y) 149 | y_predicted = nn.predict(self.X) 150 | loss = metric(self.y, y_predicted) 151 | self.assertLess(loss, 2) 152 | 153 | def test_is_fitted(self): 154 | model = MLPAlgorithm(self.params) 155 | self.assertFalse(model.is_fitted()) 156 | model.fit(self.X, self.y) 157 | self.assertTrue(model.is_fitted()) 158 | ``` -------------------------------------------------------------------------------- /supervised/validation/validator_split.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | import warnings 4 | 5 | import numpy as np 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | from sklearn.model_selection import train_test_split 10 | 11 | from supervised.exceptions import AutoMLException 12 | from supervised.utils.utils import load_data 13 | 
from supervised.validation.validator_base import BaseValidator 14 | 15 | 16 | class SplitValidator(BaseValidator): 17 | def __init__(self, params): 18 | BaseValidator.__init__(self, params) 19 | 20 | self.train_ratio = self.params.get("train_ratio", 0.8) 21 | self.shuffle = self.params.get("shuffle", True) 22 | self.stratify = self.params.get("stratify", False) 23 | self.random_seed = self.params.get("random_seed", 1234) 24 | self.repeats = self.params.get("repeats", 1) 25 | 26 | if not self.shuffle and self.repeats > 1: 27 | warnings.warn( 28 | "Disable repeats in validation because shuffle is disabled", UserWarning 29 | ) 30 | self.repeats = 1 31 | 32 | self._results_path = self.params.get("results_path") 33 | self._X_path = self.params.get("X_path") 34 | self._y_path = self.params.get("y_path") 35 | self._sample_weight_path = self.params.get("sample_weight_path") 36 | self._sensitive_features_path = self.params.get("sensitive_features_path") 37 | 38 | if self._X_path is None or self._y_path is None: 39 | raise AutoMLException("No data path set in SplitValidator params") 40 | 41 | def get_split(self, k=0, repeat=0): 42 | X = load_data(self._X_path) 43 | y = load_data(self._y_path) 44 | y = y["target"] 45 | 46 | sample_weight = None 47 | if self._sample_weight_path is not None: 48 | sample_weight = load_data(self._sample_weight_path) 49 | sample_weight = sample_weight["sample_weight"] 50 | 51 | sensitive_features = None 52 | if self._sensitive_features_path is not None: 53 | sensitive_features = load_data(self._sensitive_features_path) 54 | 55 | stratify = None 56 | if self.stratify: 57 | stratify = y 58 | if self.shuffle == False: 59 | stratify = None 60 | 61 | input_data = [X, y] 62 | if sample_weight is not None: 63 | input_data += [sample_weight] 64 | if sensitive_features is not None: 65 | input_data += [sensitive_features] 66 | 67 | output_data = train_test_split( 68 | *input_data, 69 | train_size=self.train_ratio, 70 | test_size=1.0 - self.train_ratio, 71 | shuffle=self.shuffle, 72 | stratify=stratify, 73 | random_state=self.random_seed + repeat, 74 | ) 75 | 76 | X_train = output_data[0] 77 | X_validation = output_data[1] 78 | y_train = output_data[2] 79 | y_validation = output_data[3] 80 | if sample_weight is not None: 81 | sample_weight_train = output_data[4] 82 | sample_weight_validation = output_data[5] 83 | if sensitive_features is not None: 84 | sensitive_features_train = output_data[6] 85 | sensitive_features_validation = output_data[7] 86 | else: 87 | if sensitive_features is not None: 88 | sensitive_features_train = output_data[4] 89 | sensitive_features_validation = output_data[5] 90 | 91 | train_data = {"X": X_train, "y": y_train} 92 | validation_data = {"X": X_validation, "y": y_validation} 93 | if sample_weight is not None: 94 | train_data["sample_weight"] = sample_weight_train 95 | validation_data["sample_weight"] = sample_weight_validation 96 | if sensitive_features is not None: 97 | train_data["sensitive_features"] = sensitive_features_train 98 | validation_data["sensitive_features"] = sensitive_features_validation 99 | 100 | repeat_str = f"repeat_{repeat}_" if self.repeats > 1 else "" 101 | 102 | train_data_file = os.path.join( 103 | self._results_path, f"split_{repeat_str}train_indices.npy" 104 | ) 105 | validation_data_file = os.path.join( 106 | self._results_path, f"split_{repeat_str}validation_indices.npy" 107 | ) 108 | 109 | np.save(train_data_file, X_train.index) 110 | np.save(validation_data_file, X_validation.index) 111 | 112 | return train_data, 
validation_data 113 | 114 | def get_n_splits(self): 115 | return 1 116 | 117 | def get_repeats(self): 118 | return self.repeats 119 | 120 | 121 | """ 122 | import numpy as np 123 | import pandas as pd 124 | 125 | from sklearn.utils.fixes import bincount 126 | from sklearn.model_selection import train_test_split 127 | 128 | import logging 129 | logger = logging.getLogger('mljar') 130 | 131 | 132 | def validation_split(train, validation_train_split, stratify, shuffle, random_seed): 133 | 134 | if shuffle: 135 | else: 136 | if stratify is None: 137 | train, validation = data_split(validation_train_split, train) 138 | else: 139 | train, validation = data_split_stratified(validation_train_split, train, stratify) 140 | return train, validation 141 | 142 | 143 | """ 144 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/xgboost.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import optuna 3 | import optuna_integration 4 | import xgboost as xgb 5 | 6 | from supervised.algorithms.registry import ( 7 | MULTICLASS_CLASSIFICATION, 8 | ) 9 | from supervised.algorithms.xgboost import xgboost_eval_metric, xgboost_objective 10 | from supervised.utils.metric import ( 11 | Metric, 12 | xgboost_eval_metric_accuracy, 13 | xgboost_eval_metric_average_precision, 14 | xgboost_eval_metric_f1, 15 | xgboost_eval_metric_mse, 16 | xgboost_eval_metric_pearson, 17 | xgboost_eval_metric_r2, 18 | xgboost_eval_metric_spearman, 19 | xgboost_eval_metric_user_defined, 20 | ) 21 | 22 | EPS = 1e-8 23 | 24 | 25 | class XgboostObjective: 26 | def __init__( 27 | self, 28 | ml_task, 29 | X_train, 30 | y_train, 31 | sample_weight, 32 | X_validation, 33 | y_validation, 34 | sample_weight_validation, 35 | eval_metric, 36 | n_jobs, 37 | random_state, 38 | ): 39 | self.dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weight) 40 | self.dvalidation = xgb.DMatrix( 41 | X_validation, label=y_validation, weight=sample_weight_validation 42 | ) 43 | self.X_validation = X_validation 44 | self.y_validation = y_validation 45 | self.eval_metric = eval_metric 46 | self.n_jobs = n_jobs 47 | 48 | self.learning_rate = 0.0125 49 | self.rounds = 1000 50 | self.early_stopping_rounds = 50 51 | self.seed = random_state 52 | 53 | self.objective = "" 54 | self.eval_metric_name = "" 55 | self.num_class = ( 56 | len(np.unique(y_train)) if ml_task == MULTICLASS_CLASSIFICATION else None 57 | ) 58 | 59 | self.objective = xgboost_objective(ml_task, eval_metric.name) 60 | self.eval_metric_name = xgboost_eval_metric(ml_task, eval_metric.name) 61 | 62 | self.custom_eval_metric = None 63 | if self.eval_metric_name == "r2": 64 | self.custom_eval_metric = xgboost_eval_metric_r2 65 | elif self.eval_metric_name == "spearman": 66 | self.custom_eval_metric = xgboost_eval_metric_spearman 67 | elif self.eval_metric_name == "pearson": 68 | self.custom_eval_metric = xgboost_eval_metric_pearson 69 | elif self.eval_metric_name == "f1": 70 | self.custom_eval_metric = xgboost_eval_metric_f1 71 | elif self.eval_metric_name == "average_precision": 72 | self.custom_eval_metric = xgboost_eval_metric_average_precision 73 | elif self.eval_metric_name == "accuracy": 74 | self.custom_eval_metric = xgboost_eval_metric_accuracy 75 | elif self.eval_metric_name == "mse": 76 | self.custom_eval_metric = xgboost_eval_metric_mse 77 | elif self.eval_metric_name == "user_defined_metric": 78 | self.custom_eval_metric = xgboost_eval_metric_user_defined 79 | 
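    # Descriptive note on the method below: __call__ makes this object a valid Optuna
    # objective, so an instance can be handed directly to study.optimize(). Each trial
    # samples XGBoost hyperparameters, trains with early stopping plus the
    # XGBoostPruningCallback, and scores the validation predictions with the requested
    # eval_metric; the score is sign-flipped for metrics that are maximized (presumably
    # so a minimizing study can handle both kinds of metric), and unexpected training
    # errors make the trial return None.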
80 | def __call__(self, trial): 81 | param = { 82 | "objective": self.objective, 83 | "eval_metric": self.eval_metric_name, 84 | "tree_method": "hist", 85 | "booster": "gbtree", 86 | "eta": trial.suggest_categorical("eta", [0.0125, 0.025, 0.05, 0.1]), 87 | "max_depth": trial.suggest_int("max_depth", 2, 12), 88 | "lambda": trial.suggest_float("lambda", EPS, 10.0, log=True), 89 | "alpha": trial.suggest_float("alpha", EPS, 10.0, log=True), 90 | "colsample_bytree": min( 91 | trial.suggest_float("colsample_bytree", 0.3, 1.0 + EPS), 1.0 92 | ), 93 | "subsample": min(trial.suggest_float("subsample", 0.3, 1.0 + EPS), 1.0), 94 | "min_child_weight": trial.suggest_int("min_child_weight", 1, 100), 95 | "n_jobs": self.n_jobs, 96 | "seed": self.seed, 97 | "verbosity": 0, 98 | } 99 | if self.custom_eval_metric is not None: 100 | del param["eval_metric"] 101 | 102 | if self.num_class is not None: 103 | param["num_class"] = self.num_class 104 | try: 105 | pruning_callback = optuna_integration.XGBoostPruningCallback( 106 | trial, f"validation-{self.eval_metric_name}" 107 | ) 108 | bst = xgb.train( 109 | param, 110 | self.dtrain, 111 | self.rounds, 112 | evals=[(self.dvalidation, "validation")], 113 | early_stopping_rounds=self.early_stopping_rounds, 114 | callbacks=[pruning_callback], 115 | verbose_eval=False, 116 | custom_metric=self.custom_eval_metric, 117 | ) 118 | preds = bst.predict( 119 | self.dvalidation, iteration_range=(0, bst.best_iteration) 120 | ) 121 | score = self.eval_metric(self.y_validation, preds) 122 | if Metric.optimize_negative(self.eval_metric.name): 123 | score *= -1.0 124 | except optuna.exceptions.TrialPruned as e: 125 | raise e 126 | except Exception as e: 127 | print("Exception in XgboostObjective", str(e)) 128 | return None 129 | 130 | return score 131 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/nn.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import warnings 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | from sklearn.neural_network import MLPClassifier, MLPRegressor 9 | 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.algorithms.sklearn import SklearnAlgorithm 17 | from supervised.utils.config import LOG_LEVEL 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(LOG_LEVEL) 21 | 22 | 23 | class NNFit(SklearnAlgorithm): 24 | def file_extension(self): 25 | return "neural_network" 26 | 27 | def is_fitted(self): 28 | return ( 29 | hasattr(self.model, "n_iter_") 30 | and self.model.n_iter_ is not None 31 | and self.model.n_iter_ > 0 32 | ) 33 | 34 | def fit( 35 | self, 36 | X, 37 | y, 38 | sample_weight=None, 39 | X_validation=None, 40 | y_validation=None, 41 | sample_weight_validation=None, 42 | log_to_file=None, 43 | max_time=None, 44 | ): 45 | with warnings.catch_warnings(): 46 | warnings.simplefilter(action="ignore") 47 | # filter 48 | # X does not have valid feature names, but MLPClassifier was fitted with feature names 49 | self.model.fit(X, y) 50 | 51 | if log_to_file is not None: 52 | loss_curve = self.model.loss_curve_ 53 | result = pd.DataFrame( 54 | { 55 | "iteration": range(len(loss_curve)), 56 | "train": loss_curve, 57 | "validation": None, 58 | } 59 | ) 60 | result.to_csv(log_to_file, index=False, 
header=False) 61 | 62 | if self.params["ml_task"] != REGRESSION: 63 | self.classes_ = np.unique(y) 64 | 65 | 66 | class MLPAlgorithm(ClassifierMixin, NNFit): 67 | algorithm_name = "Neural Network" 68 | algorithm_short_name = "Neural Network" 69 | 70 | def __init__(self, params): 71 | super(MLPAlgorithm, self).__init__(params) 72 | logger.debug("MLPAlgorithm.__init__") 73 | self.max_iters = 1 74 | self.library_version = sklearn.__version__ 75 | h1 = params.get("dense_1_size", 32) 76 | h2 = params.get("dense_2_size", 16) 77 | learning_rate = params.get("learning_rate", 0.05) 78 | 79 | max_iter = 500 80 | self.model = MLPClassifier( 81 | hidden_layer_sizes=(h1, h2), 82 | activation="relu", 83 | solver="adam", 84 | learning_rate=params.get("learning_rate_type", "constant"), 85 | learning_rate_init=learning_rate, 86 | alpha=params.get("alpha", 0.0001), 87 | early_stopping=True, 88 | n_iter_no_change=50, 89 | max_iter=max_iter, 90 | random_state=params.get("seed", 123), 91 | ) 92 | 93 | def get_metric_name(self): 94 | return "logloss" 95 | 96 | 97 | class MLPRegressorAlgorithm(RegressorMixin, NNFit): 98 | algorithm_name = "Neural Network" 99 | algorithm_short_name = "Neural Network" 100 | 101 | def __init__(self, params): 102 | super(MLPRegressorAlgorithm, self).__init__(params) 103 | logger.debug("MLPRegressorAlgorithm.__init__") 104 | self.max_iters = 1 105 | self.library_version = sklearn.__version__ 106 | h1 = params.get("dense_1_size", 32) 107 | h2 = params.get("dense_2_size", 16) 108 | learning_rate = params.get("learning_rate", 0.05) 109 | momentum = params.get("momentum", 0.9) 110 | early_stopping = True 111 | max_iter = 500 112 | self.model = MLPRegressor( 113 | hidden_layer_sizes=(h1, h2), 114 | activation="relu", 115 | solver="adam", 116 | learning_rate="constant", 117 | learning_rate_init=learning_rate, 118 | momentum=momentum, 119 | early_stopping=early_stopping, 120 | max_iter=max_iter, 121 | ) 122 | 123 | def get_metric_name(self): 124 | return "mse" 125 | 126 | 127 | nn_params = { 128 | "dense_1_size": [16, 32, 64], 129 | "dense_2_size": [4, 8, 16, 32], 130 | "learning_rate": [0.01, 0.05, 0.08, 0.1], 131 | } 132 | 133 | default_nn_params = {"dense_1_size": 32, "dense_2_size": 16, "learning_rate": 0.05} 134 | 135 | additional = {"max_rows_limit": None, "max_cols_limit": None} 136 | 137 | required_preprocessing = [ 138 | "missing_values_inputation", 139 | "convert_categorical", 140 | "datetime_transform", 141 | "text_transform", 142 | "scale", 143 | "target_as_integer", 144 | ] 145 | 146 | AlgorithmsRegistry.add( 147 | BINARY_CLASSIFICATION, 148 | MLPAlgorithm, 149 | nn_params, 150 | required_preprocessing, 151 | additional, 152 | default_nn_params, 153 | ) 154 | 155 | AlgorithmsRegistry.add( 156 | MULTICLASS_CLASSIFICATION, 157 | MLPAlgorithm, 158 | nn_params, 159 | required_preprocessing, 160 | additional, 161 | default_nn_params, 162 | ) 163 | 164 | required_preprocessing = [ 165 | "missing_values_inputation", 166 | "convert_categorical", 167 | "datetime_transform", 168 | "text_transform", 169 | "scale", 170 | "target_scale", 171 | ] 172 | 173 | AlgorithmsRegistry.add( 174 | REGRESSION, 175 | MLPRegressorAlgorithm, 176 | nn_params, 177 | required_preprocessing, 178 | additional, 179 | default_nn_params, 180 | ) 181 | ``` -------------------------------------------------------------------------------- /supervised/utils/leaderboard_plots.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | 4 | import 
numpy as np 5 | import pandas as pd 6 | 7 | logger = logging.getLogger(__name__) 8 | from supervised.utils.config import LOG_LEVEL 9 | from supervised.utils.metric import Metric 10 | 11 | logger.setLevel(LOG_LEVEL) 12 | 13 | import warnings 14 | 15 | import matplotlib.pyplot as plt 16 | 17 | warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 18 | 19 | 20 | markers = { 21 | "Baseline": {"color": "tab:cyan", "marker": "8"}, 22 | "Linear": {"color": "tab:pink", "marker": "s"}, 23 | "Decision Tree": {"color": "tab:gray", "marker": "^"}, 24 | "Random Forest": {"color": "tab:green", "marker": "o"}, 25 | "Extra Trees": {"color": "tab:brown", "marker": "v"}, 26 | "LightGBM": {"color": "tab:purple", "marker": "P"}, 27 | "Xgboost": {"color": "tab:blue", "marker": "*"}, 28 | "CatBoost": {"color": "tab:orange", "marker": "D"}, 29 | "Neural Network": {"color": "tab:red", "marker": "x"}, 30 | "Nearest Neighbors": {"color": "tab:olive", "marker": "+"}, 31 | "Ensemble": {"color": "black", "marker": "p"}, 32 | } 33 | 34 | 35 | class LeaderboardPlots: 36 | performance_fname = "ldb_performance.png" 37 | performance_boxplot_fname = "ldb_performance_boxplot.png" 38 | 39 | @staticmethod 40 | def compute(ldb, model_path, fout, fairness_threshold=None): 41 | if ldb.shape[0] < 2: 42 | return 43 | # Scatter plot 44 | plt.figure(figsize=(10, 7)) 45 | for model_type in ldb.model_type.unique(): 46 | ii = ldb.model_type == model_type 47 | plt.plot( 48 | ldb.metric_value[ii], 49 | markers[model_type]["marker"], 50 | markersize=12, 51 | alpha=0.75, 52 | color=markers[model_type]["color"], 53 | label=model_type, 54 | ) 55 | # plt.plot(ldb.metric_value, "*", markersize=12, alpha=0.75) 56 | 57 | plt.xlabel("#Iteration") 58 | plt.ylabel(ldb.metric_type.iloc[0]) 59 | plt.legend() 60 | plt.title("AutoML Performance") 61 | plt.tight_layout(pad=2.0) 62 | plot_path = os.path.join(model_path, LeaderboardPlots.performance_fname) 63 | plt.savefig(plot_path) 64 | plt.close("all") 65 | 66 | fout.write("\n\n### AutoML Performance\n") 67 | fout.write(f"") 68 | 69 | # Boxplot 70 | by = "model_type" 71 | column = "metric_value" 72 | df2 = pd.DataFrame({col: vals[column] for col, vals in ldb.groupby(by)}) 73 | 74 | ascending_sort = Metric.optimize_negative(ldb.metric_type.iloc[0]) 75 | mins = df2.min().sort_values(ascending=ascending_sort) 76 | 77 | plt.figure(figsize=(10, 7)) 78 | # plt.title("") 79 | plt.ylabel(ldb.metric_type.iloc[0]) 80 | df2[mins.index].boxplot(rot=90, fontsize=12) 81 | 82 | plt.tight_layout(pad=2.0) 83 | plot_path = os.path.join(model_path, LeaderboardPlots.performance_boxplot_fname) 84 | plt.savefig(plot_path) 85 | plt.close("all") 86 | 87 | fout.write("\n\n### AutoML Performance Boxplot\n") 88 | fout.write( 89 | f"" 90 | ) 91 | 92 | if fairness_threshold is not None: 93 | fairness_metrics = [ 94 | f for f in ldb.columns if "fairness_" in f and f != "fairness_metric" 95 | ] 96 | for fm in fairness_metrics: 97 | x_axis_name = ldb.metric_type.iloc[0] 98 | y_axis_name = ldb["fairness_metric"].iloc[0] 99 | 100 | # Scatter plot 101 | plt.figure(figsize=(10, 7)) 102 | for model_type in ldb.model_type.unique(): 103 | ii = ldb.model_type == model_type 104 | plt.plot( 105 | ldb.metric_value[ii], 106 | ldb[fm][ii], 107 | markers[model_type]["marker"], 108 | markersize=12, 109 | alpha=0.75, 110 | color=markers[model_type]["color"], 111 | label=model_type, 112 | ) 113 | 114 | plt.xlabel(x_axis_name) 115 | plt.ylabel(y_axis_name) 116 | plt.legend() 117 | plt.title(f"Performance vs {fm}") 118 | 
plt.tight_layout(pad=2.0) 119 | 120 | ymin = 0 121 | ymax = max(1, ldb[fm].max() * 1.1) 122 | plt.ylim(0, ymax) 123 | if "ratio" in y_axis_name: 124 | plt.axhspan(fairness_threshold, ymax, color="green", alpha=0.05) 125 | plt.axhspan(ymin, fairness_threshold, color="red", alpha=0.05) 126 | else: 127 | # difference metric 128 | plt.axhspan(ymin, fairness_threshold, color="green", alpha=0.05) 129 | plt.axhspan(fairness_threshold, ymax, color="red", alpha=0.05) 130 | 131 | fname = f"performance_vs_{fm}.png" 132 | plot_path = os.path.join(model_path, fname) 133 | plt.savefig(plot_path) 134 | plt.close("all") 135 | 136 | fout.write(f"\n\n### Performance vs {fm}\n") 137 | fout.write(f"") 138 | ```
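A minimal usage sketch for the `LeaderboardPlots` helper above. The leaderboard values and the report filename below are made up for illustration; `compute` only needs a frame with at least two rows and the `model_type`, `metric_type` and `metric_value` columns it reads, plus an open text handle to append the markdown sections to.

```python
import pandas as pd

from supervised.utils.leaderboard_plots import LeaderboardPlots

# Hypothetical leaderboard frame with the columns compute() accesses.
ldb = pd.DataFrame(
    {
        "model_type": ["Baseline", "Xgboost", "LightGBM"],
        "metric_type": ["logloss", "logloss", "logloss"],
        "metric_value": [0.69, 0.41, 0.39],
    }
)

# Saves ldb_performance.png and ldb_performance_boxplot.png into model_path
# and appends the matching markdown sections to the report file.
with open("leaderboard_report.md", "a") as fout:
    LeaderboardPlots.compute(ldb, model_path=".", fout=fout)
```

With `fairness_threshold` left at `None`, only the scatter and box plots are produced; the per-metric fairness plots additionally expect `fairness_*` columns and a `fairness_metric` column in the leaderboard.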