This is page 5 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   └── workflows
│       ├── run-tests.yml
│       ├── test-installation-with-conda.yml
│       └── test-installation-with-pip-on-windows.yml
├── .gitignore
├── CITATION
├── examples
│   ├── notebooks
│   │   ├── basic_run.ipynb
│   │   └── Titanic.ipynb
│   └── scripts
│       ├── binary_classifier_adult_fairness.py
│       ├── binary_classifier_ensemble.py
│       ├── binary_classifier_marketing.py
│       ├── binary_classifier_random.py
│       ├── binary_classifier_Titanic.py
│       ├── binary_classifier.py
│       ├── multi_class_classifier_digits.py
│       ├── multi_class_classifier_MNIST.py
│       ├── multi_class_classifier.py
│       ├── multi_class_drug_fairness.py
│       ├── regression_acs_fairness.py
│       ├── regression_crime_fairness.py
│       ├── regression_housing_fairness.py
│       ├── regression_law_school_fairness.py
│       ├── regression.py
│       └── tabular_mar_2021.py
├── LICENSE
├── MANIFEST.in
├── pytest.ini
├── README.md
├── requirements_dev.txt
├── requirements.txt
├── setup.py
├── supervised
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── algorithm.py
│   │   ├── baseline.py
│   │   ├── catboost.py
│   │   ├── decision_tree.py
│   │   ├── extra_trees.py
│   │   ├── factory.py
│   │   ├── knn.py
│   │   ├── lightgbm.py
│   │   ├── linear.py
│   │   ├── nn.py
│   │   ├── random_forest.py
│   │   ├── registry.py
│   │   ├── sklearn.py
│   │   └── xgboost.py
│   ├── automl.py
│   ├── base_automl.py
│   ├── callbacks
│   │   ├── __init__.py
│   │   ├── callback_list.py
│   │   ├── callback.py
│   │   ├── early_stopping.py
│   │   ├── learner_time_constraint.py
│   │   ├── max_iters_constraint.py
│   │   ├── metric_logger.py
│   │   ├── terminate_on_nan.py
│   │   └── total_time_constraint.py
│   ├── ensemble.py
│   ├── exceptions.py
│   ├── fairness
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── plots.py
│   │   ├── report.py
│   │   └── utils.py
│   ├── model_framework.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── datetime_transformer.py
│   │   ├── encoding_selector.py
│   │   ├── exclude_missing_target.py
│   │   ├── goldenfeatures_transformer.py
│   │   ├── kmeans_transformer.py
│   │   ├── label_binarizer.py
│   │   ├── label_encoder.py
│   │   ├── preprocessing_categorical.py
│   │   ├── preprocessing_missing.py
│   │   ├── preprocessing_utils.py
│   │   ├── preprocessing.py
│   │   ├── scale.py
│   │   └── text_transformer.py
│   ├── tuner
│   │   ├── __init__.py
│   │   ├── data_info.py
│   │   ├── hill_climbing.py
│   │   ├── mljar_tuner.py
│   │   ├── optuna
│   │   │   ├── __init__.py
│   │   │   ├── catboost.py
│   │   │   ├── extra_trees.py
│   │   │   ├── knn.py
│   │   │   ├── lightgbm.py
│   │   │   ├── nn.py
│   │   │   ├── random_forest.py
│   │   │   ├── tuner.py
│   │   │   └── xgboost.py
│   │   ├── preprocessing_tuner.py
│   │   ├── random_parameters.py
│   │   └── time_controller.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── additional_metrics.py
│   │   ├── additional_plots.py
│   │   ├── automl_plots.py
│   │   ├── common.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── data_validation.py
│   │   ├── importance.py
│   │   ├── jsonencoder.py
│   │   ├── leaderboard_plots.py
│   │   ├── learning_curves.py
│   │   ├── metric.py
│   │   ├── shap.py
│   │   ├── subsample.py
│   │   └── utils.py
│   └── validation
│       ├── __init__.py
│       ├── validation_step.py
│       ├── validator_base.py
│       ├── validator_custom.py
│       ├── validator_kfold.py
│       └── validator_split.py
└── tests
    ├── __init__.py
    ├── checks
    │   ├── __init__.py
    │   ├── check_automl_with_regression.py
    │   ├── run_ml_tests.py
    │   └── run_performance_tests.py
    ├── conftest.py
    ├── data
    │   ├── 179.csv
    │   ├── 24.csv
    │   ├── 3.csv
    │   ├── 31.csv
    │   ├── 38.csv
    │   ├── 44.csv
    │   ├── 720.csv
    │   ├── 737.csv
    │   ├── acs_income_1k.csv
    │   ├── adult_missing_values_missing_target_500rows.csv
    │   ├── boston_housing.csv
    │   ├── CrimeData
    │   │   ├── cities.json
    │   │   ├── crimedata.csv
    │   │   └── README.md
    │   ├── Drug
    │   │   ├── Drug_Consumption.csv
    │   │   └── README.md
    │   ├── housing_regression_missing_values_missing_target.csv
    │   ├── iris_classes_missing_values_missing_target.csv
    │   ├── iris_missing_values_missing_target.csv
    │   ├── LawSchool
    │   │   ├── bar_pass_prediction.csv
    │   │   └── README.md
    │   ├── PortugeseBankMarketing
    │   │   └── Data_FinalProject.csv
    │   └── Titanic
    │       ├── test_with_Survived.csv
    │       └── train.csv
    ├── README.md
    ├── tests_algorithms
    │   ├── __init__.py
    │   ├── test_baseline.py
    │   ├── test_catboost.py
    │   ├── test_decision_tree.py
    │   ├── test_extra_trees.py
    │   ├── test_factory.py
    │   ├── test_knn.py
    │   ├── test_lightgbm.py
    │   ├── test_linear.py
    │   ├── test_nn.py
    │   ├── test_random_forest.py
    │   ├── test_registry.py
    │   └── test_xgboost.py
    ├── tests_automl
    │   ├── __init__.py
    │   ├── test_adjust_validation.py
    │   ├── test_automl_init.py
    │   ├── test_automl_report.py
    │   ├── test_automl_sample_weight.py
    │   ├── test_automl_time_constraints.py
    │   ├── test_automl.py
    │   ├── test_data_types.py
    │   ├── test_dir_change.py
    │   ├── test_explain_levels.py
    │   ├── test_golden_features.py
    │   ├── test_handle_imbalance.py
    │   ├── test_integration.py
    │   ├── test_joblib_version.py
    │   ├── test_models_needed_for_predict.py
    │   ├── test_prediction_after_load.py
    │   ├── test_repeated_validation.py
    │   ├── test_restore.py
    │   ├── test_stack_models_constraints.py
    │   ├── test_targets.py
    │   └── test_update_errors_report.py
    ├── tests_callbacks
    │   ├── __init__.py
    │   └── test_total_time_constraint.py
    ├── tests_ensemble
    │   ├── __init__.py
    │   └── test_save_load.py
    ├── tests_fairness
    │   ├── __init__.py
    │   ├── test_binary_classification.py
    │   ├── test_multi_class_classification.py
    │   └── test_regression.py
    ├── tests_preprocessing
    │   ├── __init__.py
    │   ├── disable_eda.py
    │   ├── test_categorical_integers.py
    │   ├── test_datetime_transformer.py
    │   ├── test_encoding_selector.py
    │   ├── test_exclude_missing.py
    │   ├── test_goldenfeatures_transformer.py
    │   ├── test_label_binarizer.py
    │   ├── test_label_encoder.py
    │   ├── test_preprocessing_missing.py
    │   ├── test_preprocessing_utils.py
    │   ├── test_preprocessing.py
    │   ├── test_scale.py
    │   └── test_text_transformer.py
    ├── tests_tuner
    │   ├── __init__.py
    │   ├── test_hill_climbing.py
    │   ├── test_time_controller.py
    │   └── test_tuner.py
    ├── tests_utils
    │   ├── __init__.py
    │   ├── test_compute_additional_metrics.py
    │   ├── test_importance.py
    │   ├── test_learning_curves.py
    │   ├── test_metric.py
    │   ├── test_shap.py
    │   └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py
```

# Files

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_preprocessing.py:
--------------------------------------------------------------------------------

```python
import unittest

import numpy as np
import pandas as pd

from supervised.preprocessing.preprocessing import Preprocessing
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues


class PreprocessingTest(unittest.TestCase):
    def test_constructor_preprocessing_step(self):
        preprocessing_params = {}
        ps = Preprocessing(preprocessing_params)

        self.assertTrue(len(ps._missing_values) == 0)
        self.assertTrue(len(ps._categorical) == 0)
        self.assertTrue(ps._categorical_y is None)

    def test_exclude_missing_targets_all_good(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing()
        X_train, y_train = ps._exclude_missing_targets(X_train, y_train)

        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(4, y_train.shape[0])

    def test_exclude_missing_targets(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, np.nan, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing()
        X_train, y_train = ps._exclude_missing_targets(X_train, y_train)

        self.assertEqual(3, X_train.shape[0])
        self.assertEqual(3, y_train.shape[0])

    def test_run_exclude_missing_targets(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, np.nan, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing()
        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)
        self.assertEqual(3, X_train.shape[0])
        self.assertEqual(3, y_train.shape[0])

    def test_run_all_good(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col3": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            }
        }

        ps = Preprocessing(preprocessing_params)

        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)

        for col in ["col1", "col2", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)

        params_json = ps.to_json()
        self.assertEqual(len(params_json), 1)  # should store params only
        self.assertTrue("params" in params_json)

    def test_run_fill_median_convert_integer(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col3": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            }
        }

        ps = Preprocessing(preprocessing_params)
        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)

        for col in ["col1", "col2", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)
        self.assertEqual(X_train["col1"][2], 1)
        self.assertEqual(X_train["col2"][2], 0)
        self.assertEqual(X_train["col4"][0], 0)
        self.assertEqual(X_train["col4"][1], 0)
        self.assertEqual(X_train["col4"][2], 1)
        self.assertEqual(X_train["col4"][3], 2)

        params_json = ps.to_json()

        self.assertTrue("missing_values" in params_json)
        self.assertTrue("categorical" in params_json)
        self.assertTrue("categorical_y" not in params_json)

        self.assertTrue("fill_params" in params_json["missing_values"][0])
        self.assertEqual(
            "na_fill_median", params_json["missing_values"][0]["fill_method"]
        )
        self.assertTrue("convert_params" in params_json["categorical"][0])
        self.assertEqual(
            "categorical_to_int", params_json["categorical"][0]["convert_method"]
        )

    def test_run_fill_median_convert_integer_validation_dataset(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, 1, np.nan, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col3": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            }
        }

        ps = Preprocessing(preprocessing_params)

        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)
        X_test, y_test, _ = ps.transform(X_test, y_test)

        for col in ["col1", "col2", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)
            self.assertTrue(col in X_test.columns)

        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(2, X_test.shape[0])
        self.assertEqual(2, y_test.shape[0])

    def test_run_on_y_only(self):
        d = {"y": ["a", "b", "a", "b"]}
        df = pd.DataFrame(data=d)
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "target_preprocessing": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ]
        }

        ps = Preprocessing(preprocessing_params)
        _, y_train, _ = ps.fit_and_transform(None, y_train)

        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(0, y_train[0])
        self.assertEqual(1, y_train[1])

    def test_run_on_y_only_validation(self):
        d = {"y": ["a", "b", "a", "b"]}
        df = pd.DataFrame(data=d)
        y_train = df.loc[:, "y"]

        d_test = {"y": [np.nan, "a", np.nan, "b"]}
        df_test = pd.DataFrame(data=d_test)
        y_test = df_test.loc[:, "y"]

        preprocessing_params = {
            "target_preprocessing": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ]
        }

        ps = Preprocessing(preprocessing_params)

        _, y_train, _ = ps.fit_and_transform(None, y_train)
        _, y_test, _ = ps.transform(None, y_test)

        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(2, y_test.shape[0])
        self.assertEqual(0, y_train[0])
        self.assertEqual(1, y_train[1])
        self.assertEqual(0, y_test[0])
        self.assertEqual(1, y_test[1])

    def test_to_and_from_json_run_fill_median_convert_integer(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            },
            "target_preprocessing": [],
        }

        ps = Preprocessing(preprocessing_params)
        _, _, _ = ps.fit_and_transform(X_train, y_train)

        ps2 = Preprocessing()
        ps2.from_json(ps.to_json(), "./")
        del ps

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, np.nan, 1, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        X_test, y_test, _ = ps2.transform(X_test, y_test)

        self.assertEqual(2, y_test.shape[0])
        self.assertEqual(2, np.sum(y_test))
        self.assertEqual(1, X_test["col1"].iloc[0])
        self.assertEqual(0, X_test["col2"].iloc[0])

    def test_empty_column(self):
        # training data
        d = {
            "col1": [np.nan, np.nan, np.nan, np.nan],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {"columns_preprocessing": {"col1": ["remove_column"]}}

        ps = Preprocessing(preprocessing_params)
        X_train1, _, _ = ps.fit_and_transform(X_train, y_train)

        self.assertTrue("col1" not in X_train1.columns)
        self.assertEqual(3, len(X_train1.columns))
        X_train2, _, _ = ps.transform(X_train, y_train)
        self.assertTrue("col1" not in X_train2.columns)
        self.assertEqual(3, len(X_train2.columns))
        for col in ["col2", "col3", "col4"]:
            self.assertTrue(col in X_train2.columns)

        params_json = ps.to_json()
        ps2 = Preprocessing()
        ps2.from_json(params_json, "./")

        X_train3, _, _ = ps2.transform(X_train, y_train)
        self.assertTrue("col1" not in X_train3.columns)
        self.assertEqual(3, len(X_train3.columns))
        for col in ["col2", "col3", "col4"]:
            self.assertTrue(col in X_train3.columns)


"""
    def test_run_fill_median_convert_one_hot_validation_dataset(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "z", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, 1, np.nan, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
        )
        X_train, y_train, X_test, y_test = ps.run(
            X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
        )

        for col in ["col1", "col2_a", "col3", "col4_a", "col4_b", "col4_c"]:
            self.assertTrue(col in X_train.columns)
            self.assertTrue(col in X_test.columns)

        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(2, X_test.shape[0])
        self.assertEqual(4, np.sum(X_train["col2_a"]))
        self.assertEqual(2, np.sum(X_train["col4_a"]))
        self.assertEqual(1, np.sum(X_train["col4_b"]))
        self.assertEqual(1, np.sum(X_train["col4_c"]))
        self.assertEqual(0, X_test.loc[0, "col2_a"])
        self.assertEqual(1, X_test.loc[1, "col2_a"])

    def test_run_fill_median_convert_one_hot_big_categorical(self):

        a_lot = 250
        cs = []
        for i in range(a_lot):
            cs.append(str(uuid.uuid4().hex.upper()[0:6]))

        d = {
            "col1": cs,
            "col2": ["a", "b"] * int(a_lot / 2),
            "col3": range(a_lot),
            "col4": range(a_lot),
        }

        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        X_train_2 = copy.deepcopy(X_train)

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
        )
        X_train, _, _, _ = ps.run(X_train=X_train)

        for col in ["col1", "col2_b", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)

        self.assertTrue(
            np.max(X_train["col1"]) > 0.90 * a_lot
        )  # there can be duplicates ;)
        self.assertEqual(np.max(X_train["col2_b"]), 1)
        self.assertEqual(np.sum(X_train["col2_b"]), a_lot / 2)

        ps2 = Preprocessing()
        ps2.from_json(ps.to_json())
        del ps
        # apply preprocessing loaded from json
        _, _, X_train_2, _ = ps2.run(X_test=X_train_2)
        for col in ["col1", "col2_b", "col3", "col4"]:
            self.assertTrue(col in X_train_2.columns)

        self.assertTrue(
            np.max(X_train_2["col1"]) > 0.90 * a_lot
        )  # there can be duplicates ;)
        self.assertEqual(np.max(X_train_2["col2_b"]), 1)
        self.assertEqual(np.sum(X_train_2["col2_b"]), a_lot / 2)

    def test_convert_target(self):
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [2, 2, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
            project_task="PROJECT_BIN_CLASS",
        )
        X_train, y_train, _, _ = ps.run(X_train=X_train, y_train=y_train)

        self.assertEqual(2, len(np.unique(y_train)))
        self.assertTrue(0 in np.unique(y_train))
        self.assertTrue(1 in np.unique(y_train))

    def test_dont_convert_target(self):
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [2, 2, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
            project_task="PROJECT_REGRESSION",
        )
        X_train, y_train, _, _ = ps.run(X_train=X_train, y_train=y_train)

        self.assertEqual(2, len(np.unique(y_train)))
        self.assertTrue(1 in np.unique(y_train))
        self.assertTrue(2 in np.unique(y_train))
"""

if __name__ == "__main__":
    unittest.main()
```
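
The round-trip these tests exercise — fit the preprocessing on training data, serialize it with `to_json`, and re-apply it to new data after `from_json` — in a minimal sketch. The column name `feature` and the `"./"` results path are illustrative only; the API calls mirror `test_to_and_from_json_run_fill_median_convert_integer` above:

```python
import numpy as np
import pandas as pd

from supervised.preprocessing.preprocessing import Preprocessing
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues

params = {
    "columns_preprocessing": {
        "feature": [
            PreprocessingMissingValues.FILL_NA_MEDIAN,
            PreprocessingCategorical.CONVERT_INTEGER,
        ]
    },
    "target_preprocessing": [],
}
X = pd.DataFrame({"feature": ["a", "b", np.nan, "a"]})
y = pd.Series([0, 1, 0, 1], name="y")

# Fit once on training data: fills missing values, encodes categories as integers.
ps = Preprocessing(params)
X_t, y_t, _ = ps.fit_and_transform(X, y)

# Persist the fitted transformers and restore them later for inference.
restored = Preprocessing()
restored.from_json(ps.to_json(), "./")
X_new, y_new, _ = restored.transform(X.copy(), y.copy())
```
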
        self.total_best_sum = None  # total sum of predictions, the oof of ensemble
        self.target = None
        self.target_columns = None
        self.sample_weight = None
        self._ml_task = ml_task
        self._optimize_metric = optimize_metric
        self._is_stacked = is_stacked

        self._additional_metrics = None
        self._threshold = None
        self._name = "Ensemble_Stacked" if is_stacked else "Ensemble"
        self._scores = []
        self.oof_predictions = None
        self._oof_predictions_fname = None
        self._single_prediction_time = None  # prediction time on single sample
        self._max_single_prediction_time = max_single_prediction_time
        self.model_prediction_time = {}

        self._fairness_metric = fairness_metric
        self._fairness_threshold = fairness_threshold
        self._privileged_groups = privileged_groups
        self._underprivileged_groups = underprivileged_groups
        self._is_fair = None
        self.sensitive_features = None

    def get_train_time(self):
        return self.train_time

    def get_final_loss(self):
        return self.best_loss

    def is_valid(self):
        return len(self.selected_models) > 1

    def is_fast_enough(self, max_single_prediction_time):
        # don't need to check
        if max_single_prediction_time is None:
            return True

        # no information about prediction time
        if self._single_prediction_time is None:
            return True

        return self._single_prediction_time < max_single_prediction_time

    def get_type(self):
        prefix = ""  # "Stacked" if self._is_stacked else ""
        return prefix + self.algorithm_short_name

    def get_name(self):
        return self._name

    def involved_model_names(self):
        """Returns the list of all models involved in the current model.

        For a single model, it returns a list with the name of the model.
        For an ensemble model, it returns a list with the name of the ensemble and all internal models
        (used to build the ensemble).
        For a single model trained on stacked data, it returns a list with the name of the model only
        (names of models used in stacking are not included)."""
        if self.selected_models is None or not self.selected_models:
            return [self._name]
        l = []
        for m in self.selected_models:
            l += m["model"].involved_model_names()
        return [self._name] + l

    def get_metric_name(self):
        return self.metric.name

    def get_metric(self):
        return self.metric

    def get_out_of_folds(self):
        """Needed when ensemble is treated as model and we want to compute additional metrics for it"""
        # single prediction (in case of binary classification and regression)
        if self.oof_predictions is not None:
            return self.oof_predictions.copy(deep=True)

        if self._oof_predictions_fname is not None:
            self.oof_predictions = pd.read_csv(self._oof_predictions_fname)
            return self.oof_predictions.copy(deep=True)

        ensemble_oof = pd.DataFrame(
            data=self.total_best_sum, columns=self.total_best_sum.columns
        )
        ensemble_oof["target"] = self.target
        if self.sample_weight is not None:
            ensemble_oof["sample_weight"] = self.sample_weight

        # if self.sensitive_features is not None:
        #     for col in self.sensitive_features.columns:
        #         ensemble_oof[col] = self.sensitive_features[col]

        self.oof_predictions = ensemble_oof
        return ensemble_oof

    def _get_mean(self, oof_selected, best_sum, best_count):
        resp = copy.deepcopy(oof_selected)
        if best_count > 1:
            resp += best_sum
        resp /= float(best_count)
        return resp

    def get_oof_matrix(self, models):
        # remember models, will be needed in predictions
        self.models_map = {m.get_name(): m for m in models}

        if self._max_single_prediction_time is not None:
            self.model_prediction_time = {
                m.get_name(): m._single_prediction_time for m in models
            }

            if not [
                m for m in models if m.is_fast_enough(self._max_single_prediction_time)
            ]:
                raise NotTrainedException(
                    "Can't construct ensemble with prediction time smaller than limit."
                )

        # check if we can construct fair ensemble
        if self._fairness_metric is not None:
            if not [m for m in models if m.is_fair()]:
                raise NotTrainedException("Can't construct fair ensemble.")

        oofs = {}
        sensitive_features = None
        for m in models:
            # do not use model with RandomFeature
            if "RandomFeature" in m.get_name():
                continue

            # ensemble only the same level of stack
            # if m._is_stacked != self._is_stacked:
            #     continue
            oof = m.get_out_of_folds()
            prediction_cols = [c for c in oof.columns if "prediction" in c]
            oofs[m.get_name()] = oof[prediction_cols]  # oof["prediction"]
            if self.target is None:
                self.target_columns = [c for c in oof.columns if "target" in c]
                self.target = oof[
                    self.target_columns
                ]  # it will be needed for computing advanced model statistics

            if self.sample_weight is None and "sample_weight" in oof.columns:
                self.sample_weight = oof["sample_weight"]

            sensitive_cols = [c for c in oof.columns if "sensitive" in c]
            if sensitive_cols and sensitive_features is None:
                sensitive_features = oof[sensitive_cols]

        return oofs, self.target, self.sample_weight, sensitive_features

    def get_additional_metrics(self):
        if self._additional_metrics is None:
            logger.debug("Get additional metrics for Ensemble")
            # 'target' - the target after processing used for model training
            # 'prediction' - out of folds predictions of the model
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [c for c in oof_predictions.columns if "prediction" in c]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            oof_preds = oof_predictions[prediction_cols]
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                cols = oof_preds.columns.tolist()
                # strip the "prediction_" prefix (11 characters) to get class labels
                labels = {i: v[11:] for i, v in enumerate(cols)}

                oof_preds["label"] = np.argmax(
                    np.array(oof_preds[prediction_cols]), axis=1
                )
                oof_preds["label"] = oof_preds["label"].map(labels)

            sample_weight = None
            if "sample_weight" in oof_predictions.columns:
                sample_weight = oof_predictions["sample_weight"]

            self._additional_metrics = AdditionalMetrics.compute(
                oof_predictions[target_cols],
                oof_preds,
                sample_weight,
                self._ml_task,
                self.sensitive_features,
                self._fairness_metric
                if self._ml_task != REGRESSION
                else f"{self._fairness_metric}@{self.get_metric_name()}",
                self._fairness_threshold,
                self._privileged_groups,
                self._underprivileged_groups,
            )
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])

        return self._additional_metrics

    def get_sensitive_features_names(self):
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        return [i for i in list(fm.keys()) if i != "fairness_optimization"]

    def get_fairness_metric(self, col_name):
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        return fm.get(col_name, {}).get("fairness_metric_value")

    def get_fairness_optimization(self):
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        return fm.get("fairness_optimization", {})

    def get_worst_fairness(self):
        # We have fairness metrics per sensitive feature.
        # The worst fairness metric is:
        # - for ratio metrics, the lowest fairness value from all sensitive features,
        # - for difference metrics, the highest fairness value from all sensitive features.
        # It is needed as the bias mitigation stopping criterion.

        metrics = self.get_additional_metrics()

        fm = metrics.get("fairness_metrics", {})
        worst_value = None
        for col_name, values in fm.items():
            if col_name == "fairness_optimization":
                continue
            if "ratio" in self._fairness_metric.lower():
                if worst_value is None:
                    worst_value = values.get("fairness_metric_value", 0)
                else:
                    worst_value = min(
                        worst_value, values.get("fairness_metric_value", 0)
                    )
            else:
                if worst_value is None:
                    worst_value = values.get("fairness_metric_value", 1)
                else:
                    worst_value = max(
                        worst_value, values.get("fairness_metric_value", 1)
                    )

        return worst_value

    def get_best_fairness(self):
        # We have fairness metrics per sensitive feature.
        # The best fairness metric is:
        # - for ratio metrics, the highest fairness value from all sensitive features,
        # - for difference metrics, the lowest fairness value from all sensitive features.
        # It is needed as the bias mitigation stopping criterion.

        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        best_value = None
        for col_name, values in fm.items():
            if col_name == "fairness_optimization":
                continue
            if "ratio" in self._fairness_metric.lower():
                if best_value is None:
                    best_value = values.get("fairness_metric_value", 0)
                else:
                    best_value = max(best_value, values.get("fairness_metric_value", 0))
            else:
                if best_value is None:
                    best_value = values.get("fairness_metric_value", 1)
                else:
                    best_value = min(best_value, values.get("fairness_metric_value", 1))

        return best_value

    def is_fair(self):
        if self._is_fair is not None:
            return self._is_fair
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        for col, m in fm.items():
            if col == "fairness_optimization":
                continue
            if not m.get("is_fair", True):
                self._is_fair = False
                return False
        self._is_fair = True
        return True

    def fit(self, oofs, y, sample_weight=None, sensitive_features=None):
        logger.debug("Ensemble.fit")
        self.sensitive_features = sensitive_features
        start_time = time.time()
        selected_algs_cnt = 0  # number of selected algorithms
        self.best_algs = []  # selected algorithms indices from each loop

        total_prediction_time = 0
        best_sum = None  # sum of best algorithms
        for j in range(len(oofs)):  # iterate over all solutions
            min_score = self.metric.get_maximum()
            best_model = None
            # try to add some algorithm to the best_sum to minimize metric
            for model_name in oofs.keys():
                if (
                    self._max_single_prediction_time
                    and model_name in self.model_prediction_time
                ):
                    if (
                        total_prediction_time + self.model_prediction_time[model_name]
                        > self._max_single_prediction_time
                    ):
                        continue
                # skip unfair models
                if (
                    self._fairness_metric is not None
                    and not self.models_map[model_name].is_fair()
                ):
                    continue
                y_ens = self._get_mean(oofs[model_name], best_sum, j + 1)
                score = self.metric(y, y_ens, sample_weight)
                if self.metric.improvement(previous=min_score, current=score):
                    min_score = score
                    best_model = model_name

            if best_model is None:
                continue
            # there is improvement, save it
            # save scores for plotting learning curve
            # if we optimize negative, then we need to multiply by -1.0
            # to save correct values in the learning curve
            sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
            self._scores += [sign * min_score]

            if self.metric.improvement(previous=self.best_loss, current=min_score):
                self.best_loss = min_score
                selected_algs_cnt = j

            self.best_algs.append(best_model)  # save the best algorithm
            # update best_sum value
            best_sum = (
                oofs[best_model] if best_sum is None else best_sum + oofs[best_model]
            )
            if j == selected_algs_cnt:
                self.total_best_sum = copy.deepcopy(best_sum)

            # update prediction time estimate
            if self._max_single_prediction_time is not None:
                total_prediction_time = np.sum(
                    [
                        self.model_prediction_time[name]
                        for name in np.unique(self.best_algs)
                    ]
                )
        # end of main loop #

        if not self.best_algs:
            raise NotTrainedException("Ensemble wasn't fitted.")

        # keep oof predictions of ensemble
        self.total_best_sum /= float(selected_algs_cnt + 1)
        self.best_algs = self.best_algs[: (selected_algs_cnt + 1)]

        logger.debug("Selected models for ensemble:")
        for model_name in np.unique(self.best_algs):
            self.selected_models += [
                {
                    "model": self.models_map[model_name],
                    "repeat": float(self.best_algs.count(model_name)),
                }
            ]
            logger.debug(f"{model_name} {self.best_algs.count(model_name)}")

        self._additional_metrics = self.get_additional_metrics()

        self.train_time = time.time() - start_time

    def predict(self, X, X_stacked=None):
        logger.debug(
            "Ensemble.predict with {} models".format(len(self.selected_models))
        )
        y_predicted_ensemble = None
        total_repeat = 0.0

        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            total_repeat += repeat

            if model._is_stacked:
                y_predicted_from_model = model.predict(X_stacked)
            else:
                y_predicted_from_model = model.predict(X)

            prediction_cols = []
            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                prediction_cols = [
                    c for c in y_predicted_from_model.columns if "prediction_" in c
                ]
            else:  # REGRESSION
                prediction_cols = ["prediction"]
            y_predicted_from_model = y_predicted_from_model[prediction_cols]
            y_predicted_ensemble = (
                y_predicted_from_model * repeat
                if y_predicted_ensemble is None
                else y_predicted_ensemble + y_predicted_from_model * repeat
            )

        y_predicted_ensemble /= total_repeat

        if self._ml_task == MULTICLASS_CLASSIFICATION:
            cols = y_predicted_ensemble.columns.tolist()
            # strip the "prediction_" prefix (11 characters) to get class labels
            labels = {i: v[11:] for i, v in enumerate(cols)}

            y_predicted_ensemble["label"] = np.argmax(
                np.array(y_predicted_ensemble[prediction_cols]), axis=1
            )
            y_predicted_ensemble["label"] = y_predicted_ensemble["label"].map(labels)

        return y_predicted_ensemble

    def to_json(self):
        models_json = []
        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            models_json += [{"model": model.to_json(), "repeat": repeat}]

        json_desc = {
            "library_version": self.library_version,
            "algorithm_name": self.algorithm_name,
            "algorithm_short_name": self.algorithm_short_name,
            "uid": self.uid,
            "models": models_json,
        }
        return json_desc

    def from_json(self, json_desc):
        self.library_version = json_desc.get("library_version", self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
        self.algorithm_short_name = json_desc.get(
            "algorithm_short_name", self.algorithm_short_name
        )
        self.uid = json_desc.get("uid", self.uid)
        self.selected_models = []
        models_json = json_desc.get("models")
        for selected in models_json:
            model = selected["model"]
            repeat = selected["repeat"]

            il = ModelFramework(model.get("params"))
            il.from_json(model)
            self.selected_models += [
                # {"model": LearnerFactory.load(model), "repeat": repeat}
                {"model": il, "repeat": repeat}
            ]

    def save(self, results_path, model_subpath):
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Save the ensemble to {model_path}")

        predictions = self.get_out_of_folds()
        predictions_fname = os.path.join(model_subpath, "predictions_ensemble.csv")
        self._oof_predictions_fname = os.path.join(results_path, predictions_fname)
        predictions.to_csv(self._oof_predictions_fname, index=False)

        with open(os.path.join(model_path, "ensemble.json"), "w") as fout:
            ms = []
            for selected in self.selected_models:
                ms += [{"model": selected["model"]._name, "repeat": selected["repeat"]}]

            desc = {
                "name": self._name,
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "selected_models": ms,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }

            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4, cls=MLJSONEncoder))

        LearningCurves.plot_for_ensemble(self._scores, self.metric.name, model_path)

        # call additional metrics just to be sure they are computed
        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(
            self._additional_metrics, self._ml_task, self.model_markdown(), model_path
        )

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")

    def model_markdown(self):
        select_models_desc = []
        for selected in self.selected_models:
            select_models_desc += [
                {"model": selected["model"]._name, "repeat": selected["repeat"]}
            ]
        desc = f"# Summary of {self.get_name()}\n\n"
        desc += "[<< Go back](../README.md)\n\n"
        desc += "\n## Ensemble structure\n"
        selected = pd.DataFrame(select_models_desc)
        desc += tabulate(selected.values, ["Model", "Weight"], tablefmt="pipe")
        desc += "\n"
        return desc

    @staticmethod
    def load(results_path, model_subpath, models_map):
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Loading ensemble from {model_path}")

        with open(os.path.join(model_path, "ensemble.json")) as file:
            json_desc = json.load(file)

        ensemble = Ensemble(json_desc.get("optimize_metric"), json_desc.get("ml_task"))
        ensemble._name = json_desc.get("name", ensemble._name)
        ensemble._threshold = json_desc.get("threshold", ensemble._threshold)
        for m in json_desc.get("selected_models", []):
            ensemble.selected_models += [
                {"model": models_map[m["model"]], "repeat": m["repeat"]}
            ]

        ensemble.best_loss = json_desc.get("final_loss", ensemble.best_loss)
        ensemble.train_time = json_desc.get("train_time", ensemble.train_time)
        ensemble._is_stacked = json_desc.get("is_stacked", ensemble._is_stacked)
        predictions_fname = json_desc.get("predictions_fname")
        if predictions_fname is not None:
            ensemble._oof_predictions_fname = os.path.join(
                results_path, predictions_fname
            )

        return ensemble
```
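
For reference, `Ensemble.fit` above implements greedy forward selection with replacement over the models' out-of-fold predictions (Caruana-style ensemble selection). Below is a minimal standalone sketch of that loop, assuming a lower-is-better metric and plain prediction averaging; the prediction-time and fairness filters are omitted, and the function name is illustrative:

```python
import numpy as np

def greedy_ensemble_selection(oofs, y, loss, rounds=None):
    """Each round, add the model whose out-of-fold predictions most reduce
    the loss of the running average; finally keep the best-scoring prefix."""
    best_prefix, best_loss = [], np.inf
    chosen, running_sum = [], None
    for j in range(rounds or len(oofs)):
        round_name, round_loss = None, np.inf
        for name, preds in oofs.items():  # models may be picked repeatedly
            candidate = preds if running_sum is None else running_sum + preds
            score = loss(y, candidate / (j + 1))  # average of j + 1 members
            if score < round_loss:
                round_name, round_loss = name, score
        chosen.append(round_name)
        running_sum = (
            oofs[round_name] if running_sum is None else running_sum + oofs[round_name]
        )
        if round_loss < best_loss:  # remember the best prefix so far
            best_loss, best_prefix = round_loss, list(chosen)
    return best_prefix, best_loss

# Toy example: three models' out-of-fold predictions on a small target.
oofs = {
    "m1": np.array([0.9, 0.1, 0.8, 0.3]),
    "m2": np.array([0.6, 0.4, 0.7, 0.2]),
    "m3": np.array([0.2, 0.9, 0.1, 0.8]),
}
y = np.array([1.0, 0.0, 1.0, 0.0])
rmse = lambda t, p: float(np.sqrt(np.mean((t - p) ** 2)))
print(greedy_ensemble_selection(oofs, y, rmse))
```

The "repeat" weights stored in `selected_models` are simply the counts of how often each model was picked in the best prefix, which is why `predict` computes a repeat-weighted average.
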
--------------------------------------------------------------------------------
/supervised/fairness/metrics.py:
--------------------------------------------------------------------------------

```python
import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

from supervised.fairness.optimization import FairnessOptimization
from supervised.fairness.plots import FairnessPlots
from supervised.fairness.utils import (
    accuracy,
    false_negative_rate,
    false_positive_rate,
    selection_rate,
    true_negative_rate,
    true_positive_rate,
)
from supervised.utils.metric import pearson, spearman


class FairnessMetrics:
    @staticmethod
    def binary_classification(
        target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predicted_labels)

        fairness_metrics = {}

        for col in sensitive_features.columns:
            col_name = col[10:]  # skip the "sensitive_" prefix

            accuracies = []
            selection_rates = []
            tprs = []
            fprs = []
            tnrs = []
            fnrs = []
            samples = []
            demographic_parity_diff = None
            demographic_parity_ratio = None
            equalized_odds_diff = None
            equalized_odds_ratio = None

            # overall
            accuracies += [accuracy(target, preds)]
            selection_rates += [selection_rate(preds)]
            tprs += [true_positive_rate(target, preds)]
            fprs += [false_positive_rate(target, preds)]
            tnrs += [true_negative_rate(target, preds)]
            fnrs += [false_negative_rate(target, preds)]
            samples += [target.shape[0]]

            values = sensitive_features[col].unique()

            for value in values:
                accuracies += [
                    accuracy(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                selection_rates += [
                    selection_rate(preds[sensitive_features[col] == value])
                ]
                tprs += [
                    true_positive_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                fprs += [
                    false_positive_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                tnrs += [
                    true_negative_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                fnrs += [
                    false_negative_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                samples += [np.sum([sensitive_features[col] == value])]

            metrics = pd.DataFrame(
                {
                    "Samples": samples,
                    "Accuracy": accuracies,
                    "Selection Rate": selection_rates,
                    "True Positive Rate": tprs,
                    "False Negative Rate": fnrs,
                    "False Positive Rate": fprs,
                    "True Negative Rate": tnrs,
                },
                index=["Overall"] + list(values),
            )

            max_selection_rate = np.max(selection_rates[1:])
            min_selection_rate = np.min(selection_rates[1:])

            privileged_value, underprivileged_value = None, None
            for pg in privileged_groups:
                if col_name in pg:
                    privileged_value = pg.get(col_name)
            for upg in underprivileged_groups:
                if col_name in upg:
                    underprivileged_value = upg.get(col_name)

            if privileged_value is not None:
                for i, v in enumerate(values):
                    if v == privileged_value:
                        # starting from 1 because first selection rate is for all samples
                        max_selection_rate = selection_rates[i + 1]

            if underprivileged_value is not None:
                for i, v in enumerate(values):
                    if v == underprivileged_value:
                        # starting from 1 because first selection rate is for all samples
                        min_selection_rate = selection_rates[i + 1]

            # demographic parity difference = max selection rate - min selection rate
            # demographic parity ratio = min selection rate / max selection rate
            demographic_parity_diff = np.round(
                max_selection_rate - min_selection_rate, 4
            )
            demographic_parity_ratio = np.round(
                min_selection_rate / max_selection_rate, 4
            )

            tpr_min = np.min(tprs[1:])
            tpr_max = np.max(tprs[1:])

            fpr_min = np.min(fprs[1:])
            fpr_max = np.max(fprs[1:])

            if privileged_value is not None:
                for i, v in enumerate(values):
                    if v == privileged_value:
                        # starting from 1 because first value is for all samples
                        tpr_max = tprs[i + 1]
                        fpr_max = fprs[i + 1]

            if underprivileged_value is not None:
                for i, v in enumerate(values):
                    if v == underprivileged_value:
                        # starting from 1 because first value is for all samples
                        tpr_min = tprs[i + 1]
                        fpr_min = fprs[i + 1]

            # equalized odds difference = max(TPR spread, FPR spread)
            # equalized odds ratio = min(TPR ratio, FPR ratio)
            equalized_odds_diff = np.round(max(tpr_max - tpr_min, fpr_max - fpr_min), 4)
            equalized_odds_ratio = np.round(
                min(tpr_min / tpr_max, fpr_min / fpr_max), 4
            )

            stats = pd.DataFrame(
                {
                    "": [
                        demographic_parity_diff,
                        demographic_parity_ratio,
                        equalized_odds_diff,
                        equalized_odds_ratio,
                    ]
                },
                index=[
                    "Demographic Parity Difference",
                    "Demographic Parity Ratio",
                    "Equalized Odds Difference",
                    "Equalized Odds Ratio",
                ],
            )

            fairness_metric_name = ""
            fairness_metric_value = 0
            is_fair = False
            if fairness_metric == "demographic_parity_difference":
                fairness_metric_name = "Demographic Parity Difference"
                fairness_metric_value = demographic_parity_diff
                is_fair = demographic_parity_diff < fairness_threshold
            elif fairness_metric == "demographic_parity_ratio":
                fairness_metric_name = "Demographic Parity Ratio"
                fairness_metric_value = demographic_parity_ratio
                is_fair = demographic_parity_ratio > fairness_threshold
"equalized_odds_difference": 199 | fairness_metric_name = "Equalized Odds Difference" 200 | fairness_metric_value = equalized_odds_diff 201 | is_fair = equalized_odds_diff < fairness_threshold 202 | elif fairness_metric == "equalized_odds_ratio": 203 | fairness_metric_name = "Equalized Odds Ratio" 204 | fairness_metric_value = equalized_odds_ratio 205 | is_fair = equalized_odds_ratio > fairness_threshold 206 | 207 | if "parity" in fairness_metric: 208 | if privileged_value is None: 209 | ind = np.argmax(selection_rates[1:]) 210 | privileged_value = values[ind] 211 | if underprivileged_value is None: 212 | ind = np.argmin(selection_rates[1:]) 213 | underprivileged_value = values[ind] 214 | 215 | if "odds" in fairness_metric: 216 | if tpr_max - tpr_min > fpr_max - fpr_min: 217 | if privileged_value is None: 218 | ind = np.argmax(tprs[1:]) 219 | privileged_value = values[ind] 220 | if underprivileged_value is None: 221 | ind = np.argmin(tprs[1:]) 222 | underprivileged_value = values[ind] 223 | else: 224 | if privileged_value is None: 225 | ind = np.argmax(fprs[1:]) 226 | privileged_value = values[ind] 227 | if underprivileged_value is None: 228 | ind = np.argmin(fprs[1:]) 229 | underprivileged_value = values[ind] 230 | 231 | fairness_metrics[col_name] = { 232 | "metrics": metrics, 233 | "stats": stats, 234 | "figures": FairnessPlots.binary_classification( 235 | fairness_metric, 236 | col_name, 237 | metrics, 238 | selection_rates, 239 | max_selection_rate, 240 | fairness_threshold, 241 | ), 242 | "fairness_metric_name": fairness_metric_name, 243 | "fairness_metric_value": fairness_metric_value, 244 | "is_fair": is_fair, 245 | "privileged_value": privileged_value, 246 | "underprivileged_value": underprivileged_value, 247 | } 248 | 249 | # fairness optimization stats 250 | fairness_metrics[ 251 | "fairness_optimization" 252 | ] = FairnessOptimization.binary_classification( 253 | target, 254 | predicted_labels, 255 | sensitive_features, 256 | fairness_metric, 257 | fairness_threshold, 258 | privileged_groups, 259 | underprivileged_groups, 260 | previous_fairness_optimization, 261 | min_selection_rate, 262 | max_selection_rate, 263 | ) 264 | 265 | return fairness_metrics 266 | 267 | @staticmethod 268 | def regression( 269 | target, 270 | predictions, 271 | sensitive_features, 272 | fairness_metric, 273 | fairness_threshold, 274 | privileged_groups=[], 275 | underprivileged_groups=[], 276 | previous_fairness_optimization=None, 277 | ): 278 | metric_name = fairness_metric.split("@")[1].upper() 279 | 280 | if "ratio" in fairness_metric.lower(): 281 | fairness_metric_name = f"Group Loss Ratio @ {metric_name}" 282 | else: 283 | fairness_metric_name = f"Group Loss Difference @ {metric_name}" 284 | 285 | fairness_metrics = {} 286 | 287 | regression_metrics = { 288 | "SAMPLES": lambda t, p, sw=None: t.shape[0], 289 | "MAE": mean_absolute_error, 290 | "MSE": mean_squared_error, 291 | "RMSE": lambda t, p, sample_weight=None: np.sqrt( 292 | mean_squared_error(t, p, sample_weight=sample_weight) 293 | ), 294 | "R2": r2_score, 295 | "MAPE": mean_absolute_percentage_error, 296 | "SPEARMAN": spearman, 297 | "PEARSON": pearson, 298 | } 299 | overall = {} 300 | for k, v in regression_metrics.items(): 301 | overall[k] = v(target, predictions) 302 | 303 | for col in sensitive_features.columns: 304 | col_name = col[10:] # skip 'senstive_' 305 | 306 | values = sensitive_features[col].unique() 307 | all_metrics = [overall] 308 | 309 | for value in values: 310 | metrics = {} 311 | for k, v in regression_metrics.items(): 
                    metrics[k] = v(
                        target[sensitive_features[col] == value],
                        predictions[sensitive_features[col] == value],
                    )
                all_metrics += [metrics]

            mdf = pd.DataFrame(all_metrics, index=["Overall"] + list(values))

            privileged_value, underprivileged_value = None, None
            for pg in privileged_groups:
                if col_name in pg:
                    privileged_value = pg.get(col_name)
            for upg in underprivileged_groups:
                if col_name in upg:
                    underprivileged_value = upg.get(col_name)

            if privileged_value is None:
                if metric_name in ["R2", "SPEARMAN", "PEARSON"]:
                    # the higher the better
                    privileged_value = mdf.index[
                        mdf[metric_name][1:].argmax() + 1
                    ]  # without overall metrics
                else:
                    # the lower the better
                    privileged_value = mdf.index[
                        mdf[metric_name][1:].argmin() + 1
                    ]  # without overall metrics

            if underprivileged_value is None:
                if metric_name in ["R2", "SPEARMAN", "PEARSON"]:
                    # the higher the better
                    underprivileged_value = mdf.index[
                        mdf[metric_name][1:].argmin() + 1
                    ]  # without overall metrics
                else:
                    # the lower the better
                    underprivileged_value = mdf.index[
                        mdf[metric_name][1:].argmax() + 1
                    ]  # without overall metrics

            metric_min = mdf[metric_name].loc[privileged_value]
            metric_max = mdf[metric_name].loc[underprivileged_value]

            ratio = np.round(metric_min / metric_max, 4)
            diff = np.round(metric_max - metric_min, 4)

            # ratio = np.round(mdf[metric_name][1:].min() / mdf[metric_name][1:].max(), 4)
            # diff = np.round(mdf[metric_name][1:].max() - mdf[metric_name][1:].min(), 4)

            is_fair = False
            if "ratio" in fairness_metric.lower():
                fairness_metric_value = ratio
                if ratio > fairness_threshold:
                    is_fair = True
            else:
                fairness_metric_value = diff
                if diff < fairness_threshold:
                    is_fair = True

            fairness_metrics[col_name] = {
                "metrics": mdf,
                "figures": FairnessPlots.regression(
                    fairness_metric, col_name, mdf, fairness_metric_name
                ),
                "privileged_value": privileged_value,
                "underprivileged_value": underprivileged_value,
                "ratio": ratio,
                "diff": diff,
                "metric_name": metric_name,
                "fairness_metric_name": fairness_metric_name,
                "fairness_metric_value": fairness_metric_value,
                "is_fair": is_fair,
                "fairness_threshold": fairness_threshold,
            }

        fairness_metrics["fairness_optimization"] = FairnessOptimization.regression(
            target,
            predictions,
            sensitive_features,
            fairness_metric,
            fairness_threshold,
            privileged_groups,
            underprivileged_groups,
            previous_fairness_optimization,
            performance_metric=regression_metrics[metric_name],
            performance_metric_name=metric_name,
        )

        return fairness_metrics

    @staticmethod
    def multiclass_classification(
        original_target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
    ):
        original_target = np.array(original_target).ravel()
        predicted_labels = np.array(predicted_labels)
        target_values = list(np.unique(original_target))

        fairness_metrics = {}

        for col in sensitive_features.columns:
            col_name = col[10:]  # skip the "sensitive_" prefix
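            # One-vs-rest reduction: for each class value, binarize the target and
            # the predictions (current class -> 1, all other classes -> 0) and compute
            # the same group metrics as in the binary-classification case above.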
421 | 422 | for target_value in target_values: 423 | # we need to reset them for each target value 424 | privileged_value, underprivileged_value = None, None 425 | for pg in privileged_groups: 426 | if col_name in pg: 427 | privileged_value = pg.get(col_name) 428 | for upg in underprivileged_groups: 429 | if col_name in upg: 430 | underprivileged_value = upg.get(col_name) 431 | 432 | target = np.copy(original_target) 433 | target[original_target == target_value] = 1 434 | target[original_target != target_value] = 0 435 | 436 | preds = np.copy(predicted_labels) 437 | preds[predicted_labels == target_value] = 1 438 | preds[predicted_labels != target_value] = 0 439 | 440 | accuracies = [] 441 | selection_rates = [] 442 | tprs = [] 443 | fprs = [] 444 | tnrs = [] 445 | fnrs = [] 446 | samples = [] 447 | demographic_parity_diff = None 448 | demographic_parity_ratio = None 449 | equalized_odds_diff = None 450 | equalized_odds_ratio = None 451 | 452 | # overall 453 | accuracies += [accuracy(target, preds)] 454 | selection_rates += [selection_rate(preds)] 455 | tprs += [true_positive_rate(target, preds)] 456 | fprs += [false_positive_rate(target, preds)] 457 | tnrs += [true_negative_rate(target, preds)] 458 | fnrs += [false_negative_rate(target, preds)] 459 | samples += [target.shape[0]] 460 | 461 | values = sensitive_features[col].unique() 462 | 463 | for value in values: 464 | accuracies += [ 465 | accuracy( 466 | target[sensitive_features[col] == value], 467 | preds[sensitive_features[col] == value], 468 | ) 469 | ] 470 | selection_rates += [ 471 | selection_rate(preds[sensitive_features[col] == value]) 472 | ] 473 | tprs += [ 474 | true_positive_rate( 475 | target[sensitive_features[col] == value], 476 | preds[sensitive_features[col] == value], 477 | ) 478 | ] 479 | fprs += [ 480 | false_positive_rate( 481 | target[sensitive_features[col] == value], 482 | preds[sensitive_features[col] == value], 483 | ) 484 | ] 485 | tnrs += [ 486 | true_negative_rate( 487 | target[sensitive_features[col] == value], 488 | preds[sensitive_features[col] == value], 489 | ) 490 | ] 491 | fnrs += [ 492 | false_negative_rate( 493 | target[sensitive_features[col] == value], 494 | preds[sensitive_features[col] == value], 495 | ) 496 | ] 497 | samples += [np.sum([sensitive_features[col] == value])] 498 | 499 | metrics = pd.DataFrame( 500 | { 501 | "Samples": samples, 502 | "Accuracy": accuracies, 503 | "Selection Rate": selection_rates, 504 | "True Positive Rate": tprs, 505 | "False Negative Rate": fnrs, 506 | "False Positive Rate": fprs, 507 | "True Negative Rate": tnrs, 508 | }, 509 | index=["Overall"] + list(values), 510 | ) 511 | 512 | max_selection_rate = np.max(selection_rates[1:]) 513 | min_selection_rate = np.min(selection_rates[1:]) 514 | 515 | if privileged_value is not None: 516 | for i, v in enumerate(values): 517 | if v == privileged_value: 518 | # starting from 1 because first selection rate is for all samples 519 | max_selection_rate = selection_rates[i + 1] 520 | 521 | if underprivileged_value is not None: 522 | for i, v in enumerate(values): 523 | if v == underprivileged_value: 524 | # starting from 1 because first selection rate is for all samples 525 | min_selection_rate = selection_rates[i + 1] 526 | 527 | demographic_parity_diff = np.round( 528 | max_selection_rate - min_selection_rate, 4 529 | ) 530 | demographic_parity_ratio = np.round( 531 | min_selection_rate / max_selection_rate, 4 532 | ) 533 | 534 | tpr_min = np.min(tprs[1:]) 535 | tpr_max = np.max(tprs[1:]) 536 | 537 | fpr_min = 
np.min(fprs[1:]) 538 | fpr_max = np.max(fprs[1:]) 539 | 540 | if privileged_value is not None: 541 | for i, v in enumerate(values): 542 | if v == privileged_value: 543 | # starting from 1 because first value is for all samples 544 | tpr_max = tprs[i + 1] 545 | fpr_max = fprs[i + 1] 546 | 547 | if underprivileged_value is not None: 548 | for i, v in enumerate(values): 549 | if v == underprivileged_value: 550 | # starting from 1 because first value is for all samples 551 | tpr_min = tprs[i + 1] 552 | fpr_min = fprs[i + 1] 553 | 554 | equalized_odds_diff = np.round( 555 | max(tpr_max - tpr_min, fpr_max - fpr_min), 4 556 | ) 557 | equalized_odds_ratio = np.round( 558 | min(tpr_min / tpr_max, fpr_min / fpr_max), 4 559 | ) 560 | 561 | stats = pd.DataFrame( 562 | { 563 | "": [ 564 | demographic_parity_diff, 565 | demographic_parity_ratio, 566 | equalized_odds_diff, 567 | equalized_odds_ratio, 568 | ] 569 | }, 570 | index=[ 571 | "Demographic Parity Difference", 572 | "Demographic Parity Ratio", 573 | "Equalized Odds Difference", 574 | "Equalized Odds Ratio", 575 | ], 576 | ) 577 | 578 | fairness_metric_name = "" 579 | fairness_metric_value = 0 580 | is_fair = False 581 | if fairness_metric == "demographic_parity_difference": 582 | fairness_metric_name = "Demographic Parity Difference" 583 | fairness_metric_value = demographic_parity_diff 584 | is_fair = demographic_parity_diff < fairness_threshold 585 | elif fairness_metric == "demographic_parity_ratio": 586 | fairness_metric_name = "Demographic Parity Ratio" 587 | fairness_metric_value = demographic_parity_ratio 588 | is_fair = demographic_parity_ratio > fairness_threshold 589 | elif fairness_metric == "equalized_odds_difference": 590 | fairness_metric_name = "Equalized Odds Difference" 591 | fairness_metric_value = equalized_odds_diff 592 | is_fair = equalized_odds_diff < fairness_threshold 593 | elif fairness_metric == "equalized_odds_ratio": 594 | fairness_metric_name = "Equalized Odds Ratio" 595 | fairness_metric_value = equalized_odds_ratio 596 | is_fair = equalized_odds_ratio > fairness_threshold 597 | 598 | if "parity" in fairness_metric: 599 | if privileged_value is None: 600 | ind = np.argmax(selection_rates[1:]) 601 | privileged_value = values[ind] 602 | if underprivileged_value is None: 603 | ind = np.argmin(selection_rates[1:]) 604 | underprivileged_value = values[ind] 605 | 606 | if "odds" in fairness_metric: 607 | if tpr_max - tpr_min > fpr_max - fpr_min: 608 | if privileged_value is None: 609 | ind = np.argmax(tprs[1:]) 610 | privileged_value = values[ind] 611 | if underprivileged_value is None: 612 | ind = np.argmin(tprs[1:]) 613 | underprivileged_value = values[ind] 614 | else: 615 | if privileged_value is None: 616 | ind = np.argmax(fprs[1:]) 617 | privileged_value = values[ind] 618 | if underprivileged_value is None: 619 | ind = np.argmin(fprs[1:]) 620 | underprivileged_value = values[ind] 621 | 622 | fairness_metrics[f"{col_name}__{target_value}"] = { 623 | "metrics": metrics, 624 | "stats": stats, 625 | "figures": FairnessPlots.binary_classification( 626 | fairness_metric, 627 | f"{col_name}__{target_value}", 628 | metrics, 629 | selection_rates, 630 | max_selection_rate, 631 | fairness_threshold, 632 | ), 633 | "fairness_metric_name": fairness_metric_name, 634 | "fairness_metric_value": fairness_metric_value, 635 | "is_fair": is_fair, 636 | "privileged_value": privileged_value, 637 | "underprivileged_value": underprivileged_value, 638 | } 639 | 640 | # fairness optimization stats 641 | fairness_metrics[ 642 | 
"fairness_optimization" 643 | ] = FairnessOptimization.multiclass_classification( 644 | original_target, 645 | predicted_labels, 646 | sensitive_features, 647 | fairness_metric, 648 | fairness_threshold, 649 | privileged_groups, 650 | underprivileged_groups, 651 | previous_fairness_optimization, 652 | ) 653 | 654 | return fairness_metrics 655 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from supervised.algorithms.registry import ( 7 | BINARY_CLASSIFICATION, 8 | MULTICLASS_CLASSIFICATION, 9 | ) 10 | from supervised.exceptions import AutoMLException 11 | from supervised.preprocessing.datetime_transformer import DateTimeTransformer 12 | from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget 13 | from supervised.preprocessing.goldenfeatures_transformer import ( 14 | GoldenFeaturesTransformer, 15 | ) 16 | from supervised.preprocessing.kmeans_transformer import KMeansTransformer 17 | from supervised.preprocessing.label_binarizer import LabelBinarizer 18 | from supervised.preprocessing.label_encoder import LabelEncoder 19 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical 20 | from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues 21 | from supervised.preprocessing.scale import Scale 22 | from supervised.preprocessing.text_transformer import TextTransformer 23 | from supervised.utils.config import LOG_LEVEL 24 | 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(LOG_LEVEL) 27 | 28 | 29 | class Preprocessing(object): 30 | def __init__( 31 | self, 32 | preprocessing_params={"target_preprocessing": [], "columns_preprocessing": {}}, 33 | model_name=None, 34 | k_fold=None, 35 | repeat=None, 36 | ): 37 | self._params = preprocessing_params 38 | 39 | if "target_preprocessing" not in preprocessing_params: 40 | self._params["target_preprocessing"] = [] 41 | if "columns_preprocessing" not in preprocessing_params: 42 | self._params["columns_preprocessing"] = {} 43 | 44 | # preprocssing step attributes 45 | self._categorical_y = None 46 | self._scale_y = None 47 | self._missing_values = [] 48 | self._categorical = [] 49 | self._scale = [] 50 | self._remove_columns = [] 51 | self._datetime_transforms = [] 52 | self._text_transforms = [] 53 | self._golden_features = None 54 | self._kmeans = None 55 | self._add_random_feature = self._params.get("add_random_feature", False) 56 | self._drop_features = self._params.get("drop_features", []) 57 | self._model_name = model_name 58 | self._k_fold = k_fold 59 | self._repeat = repeat 60 | 61 | def _exclude_missing_targets(self, X=None, y=None): 62 | # check if there are missing values in target column 63 | if y is None: 64 | return X, y 65 | y_missing = pd.isnull(y) 66 | if np.sum(np.array(y_missing)) == 0: 67 | return X, y 68 | y = y.drop(y.index[y_missing]) 69 | y.index = range(y.shape[0]) 70 | if X is not None: 71 | X = X.drop(X.index[y_missing]) 72 | X.index = range(X.shape[0]) 73 | return X, y 74 | 75 | # fit and transform 76 | def fit_and_transform(self, X_train, y_train, sample_weight=None): 77 | logger.debug("Preprocessing.fit_and_transform") 78 | 79 | if y_train is not None: 80 | # target preprocessing 81 | # this must be used first, maybe we will drop some rows because of missing target values 82 | 
target_preprocessing = self._params.get("target_preprocessing") 83 | logger.debug("target_preprocessing params: {}".format(target_preprocessing)) 84 | 85 | X_train, y_train, sample_weight, _ = ExcludeRowsMissingTarget.transform( 86 | X_train, y_train, sample_weight 87 | ) 88 | 89 | if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing: 90 | logger.debug("Convert target to integer") 91 | self._categorical_y = LabelEncoder(try_to_fit_numeric=True) 92 | self._categorical_y.fit(y_train) 93 | y_train = pd.Series(self._categorical_y.transform(y_train)) 94 | 95 | if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing: 96 | logger.debug("Convert target to one-hot coding") 97 | self._categorical_y = LabelBinarizer() 98 | self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target") 99 | y_train = self._categorical_y.transform( 100 | pd.DataFrame({"target": y_train}), "target" 101 | ) 102 | 103 | if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing: 104 | logger.debug("Scale log and normal") 105 | 106 | self._scale_y = Scale( 107 | ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL 108 | ) 109 | y_train = pd.DataFrame({"target": y_train}) 110 | self._scale_y.fit(y_train) 111 | y_train = self._scale_y.transform(y_train) 112 | y_train = y_train["target"] 113 | 114 | if Scale.SCALE_NORMAL in target_preprocessing: 115 | logger.debug("Scale normal") 116 | 117 | self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL) 118 | y_train = pd.DataFrame({"target": y_train}) 119 | self._scale_y.fit(y_train) 120 | y_train = self._scale_y.transform(y_train) 121 | y_train = y_train["target"] 122 | 123 | # columns preprocessing 124 | columns_preprocessing = self._params.get("columns_preprocessing") 125 | for column in columns_preprocessing: 126 | transforms = columns_preprocessing[column] 127 | # logger.debug("Preprocess column {} with: {}".format(column, transforms)) 128 | 129 | # remove empty or constant columns 130 | cols_to_remove = list( 131 | filter( 132 | lambda k: "remove_column" in columns_preprocessing[k], 133 | columns_preprocessing, 134 | ) 135 | ) 136 | 137 | if X_train is not None: 138 | X_train.drop(cols_to_remove, axis=1, inplace=True) 139 | self._remove_columns = cols_to_remove 140 | 141 | numeric_cols = [] # get numeric cols before text transformations 142 | # needed for golden features 143 | if X_train is not None and ( 144 | "golden_features" in self._params or "kmeans_features" in self._params 145 | ): 146 | numeric_cols = X_train.select_dtypes(include="number").columns.tolist() 147 | 148 | # there can be missing values in the text data, 149 | # but we don't want to handle it by fill missing methods 150 | # zeros will be imputed by text_transform method 151 | cols_to_process = list( 152 | filter( 153 | lambda k: "text_transform" in columns_preprocessing[k], 154 | columns_preprocessing, 155 | ) 156 | ) 157 | 158 | new_text_columns = [] 159 | for col in cols_to_process: 160 | t = TextTransformer() 161 | t.fit(X_train, col) 162 | X_train = t.transform(X_train) 163 | self._text_transforms += [t] 164 | new_text_columns += t._new_columns 165 | # end of text transform 166 | 167 | for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]: 168 | cols_to_process = list( 169 | filter( 170 | lambda k: missing_method in columns_preprocessing[k], 171 | columns_preprocessing, 172 | ) 173 | ) 174 | missing = PreprocessingMissingValues(cols_to_process, missing_method) 175 | missing.fit(X_train) 176 | X_train = missing.transform(X_train) 177 | 
self._missing_values += [missing] 178 | 179 | # golden features 180 | golden_columns = [] 181 | if "golden_features" in self._params: 182 | results_path = self._params["golden_features"]["results_path"] 183 | ml_task = self._params["golden_features"]["ml_task"] 184 | features_count = self._params["golden_features"].get("features_count") 185 | n_jobs = self._params["golden_features"].get("n_jobs", -1) 186 | self._golden_features = GoldenFeaturesTransformer( 187 | results_path, ml_task, features_count, n_jobs 188 | ) 189 | self._golden_features.fit(X_train[numeric_cols], y_train) 190 | X_train = self._golden_features.transform(X_train) 191 | golden_columns = self._golden_features._new_columns 192 | 193 | kmeans_columns = [] 194 | if "kmeans_features" in self._params: 195 | results_path = self._params["kmeans_features"]["results_path"] 196 | self._kmeans = KMeansTransformer( 197 | results_path, self._model_name, self._k_fold 198 | ) 199 | self._kmeans.fit(X_train[numeric_cols], y_train) 200 | X_train = self._kmeans.transform(X_train) 201 | kmeans_columns = self._kmeans._new_features 202 | 203 | for convert_method in [ 204 | PreprocessingCategorical.CONVERT_INTEGER, 205 | PreprocessingCategorical.CONVERT_ONE_HOT 206 | ]: 207 | cols_to_process = list( 208 | filter( 209 | lambda k: convert_method in columns_preprocessing[k], 210 | columns_preprocessing, 211 | ) 212 | ) 213 | convert = PreprocessingCategorical(cols_to_process, convert_method) 214 | convert.fit(X_train, y_train) 215 | X_train = convert.transform(X_train) 216 | self._categorical += [convert] 217 | 218 | # datetime transform 219 | cols_to_process = list( 220 | filter( 221 | lambda k: "datetime_transform" in columns_preprocessing[k], 222 | columns_preprocessing, 223 | ) 224 | ) 225 | 226 | new_datetime_columns = [] 227 | for col in cols_to_process: 228 | t = DateTimeTransformer() 229 | t.fit(X_train, col) 230 | X_train = t.transform(X_train) 231 | self._datetime_transforms += [t] 232 | new_datetime_columns += t._new_columns 233 | 234 | # SCALE 235 | for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]: 236 | cols_to_process = list( 237 | filter( 238 | lambda k: scale_method in columns_preprocessing[k], 239 | columns_preprocessing, 240 | ) 241 | ) 242 | if ( 243 | len(cols_to_process) 244 | and len(new_datetime_columns) 245 | and scale_method == Scale.SCALE_NORMAL 246 | ): 247 | cols_to_process += new_datetime_columns 248 | if ( 249 | len(cols_to_process) 250 | and len(new_text_columns) 251 | and scale_method == Scale.SCALE_NORMAL 252 | ): 253 | cols_to_process += new_text_columns 254 | 255 | if ( 256 | len(cols_to_process) 257 | and len(golden_columns) 258 | and scale_method == Scale.SCALE_NORMAL 259 | ): 260 | cols_to_process += golden_columns 261 | 262 | if ( 263 | len(cols_to_process) 264 | and len(kmeans_columns) 265 | and scale_method == Scale.SCALE_NORMAL 266 | ): 267 | cols_to_process += kmeans_columns 268 | 269 | if len(cols_to_process): 270 | scale = Scale(cols_to_process) 271 | scale.fit(X_train) 272 | X_train = scale.transform(X_train) 273 | self._scale += [scale] 274 | 275 | if self._add_random_feature: 276 | # -1, 1, with 0 mean 277 | X_train["random_feature"] = np.random.rand(X_train.shape[0]) * 2.0 - 1.0 278 | 279 | if self._drop_features: 280 | available_cols = X_train.columns.tolist() 281 | drop_cols = [c for c in self._drop_features if c in available_cols] 282 | if len(drop_cols) == X_train.shape[1]: 283 | raise AutoMLException( 284 | "All features are droppped! Your data looks like random data." 
285 | ) 286 | if drop_cols: 287 | X_train.drop(drop_cols, axis=1, inplace=True) 288 | self._drop_features = drop_cols 289 | 290 | if X_train is not None: 291 | # there can be catagorical columns (in CatBoost) which cant be clipped 292 | numeric_cols = X_train.select_dtypes(include="number").columns.tolist() 293 | X_train[numeric_cols] = X_train[numeric_cols].clip( 294 | lower=np.finfo(np.float32).min + 1000, 295 | upper=np.finfo(np.float32).max - 1000, 296 | ) 297 | 298 | return X_train, y_train, sample_weight 299 | 300 | def transform(self, X_validation, y_validation, sample_weight_validation=None): 301 | logger.debug("Preprocessing.transform") 302 | 303 | # doing copy to avoid SettingWithCopyWarning 304 | if X_validation is not None: 305 | X_validation = X_validation.copy(deep=False) 306 | if y_validation is not None: 307 | y_validation = y_validation.copy(deep=False) 308 | 309 | # target preprocessing 310 | # this must be used first, maybe we will drop some rows because of missing target values 311 | if y_validation is not None: 312 | target_preprocessing = self._params.get("target_preprocessing") 313 | logger.debug("target_preprocessing -> {}".format(target_preprocessing)) 314 | 315 | ( 316 | X_validation, 317 | y_validation, 318 | sample_weight_validation, 319 | _, 320 | ) = ExcludeRowsMissingTarget.transform( 321 | X_validation, y_validation, sample_weight_validation 322 | ) 323 | 324 | if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing: 325 | if y_validation is not None and self._categorical_y is not None: 326 | y_validation = pd.Series( 327 | self._categorical_y.transform(y_validation) 328 | ) 329 | 330 | if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing: 331 | if y_validation is not None and self._categorical_y is not None: 332 | y_validation = self._categorical_y.transform( 333 | pd.DataFrame({"target": y_validation}), "target" 334 | ) 335 | 336 | if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing: 337 | if self._scale_y is not None and y_validation is not None: 338 | logger.debug("Transform log and normalize") 339 | y_validation = pd.DataFrame({"target": y_validation}) 340 | y_validation = self._scale_y.transform(y_validation) 341 | y_validation = y_validation["target"] 342 | 343 | if Scale.SCALE_NORMAL in target_preprocessing: 344 | if self._scale_y is not None and y_validation is not None: 345 | logger.debug("Transform normalize") 346 | y_validation = pd.DataFrame({"target": y_validation}) 347 | y_validation = self._scale_y.transform(y_validation) 348 | y_validation = y_validation["target"] 349 | 350 | # columns preprocessing 351 | if len(self._remove_columns) and X_validation is not None: 352 | cols_to_remove = [ 353 | col for col in X_validation.columns if col in self._remove_columns 354 | ] 355 | X_validation.drop(cols_to_remove, axis=1, inplace=True) 356 | 357 | # text transform 358 | for tt in self._text_transforms: 359 | if X_validation is not None and tt is not None: 360 | X_validation = tt.transform(X_validation) 361 | 362 | for missing in self._missing_values: 363 | if X_validation is not None and missing is not None: 364 | X_validation = missing.transform(X_validation) 365 | 366 | # to be sure that all missing are filled 367 | # in case new data there can be gaps! 368 | if ( 369 | X_validation is not None 370 | and pd.isnull(X_validation).sum().sum() > 0 371 | and len(self._params["columns_preprocessing"]) > 0 372 | ): 373 | # there is something missing, fill it 374 | # we should notice user about it! 
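# Fallback below: fit a median imputer on the incoming data itself, so that
# columns which had no missing values during training still get filled here.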
375 | # warnings should go to the separate file ... 376 | # warnings.warn( 377 | # "There are columns {} with missing values which didnt have missing values in train dataset.".format( 378 | # list( 379 | # X_validation.columns[np.where(np.sum(pd.isnull(X_validation)))] 380 | # ) 381 | # ) 382 | # ) 383 | missing = PreprocessingMissingValues( 384 | X_validation.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 385 | ) 386 | missing.fit(X_validation) 387 | X_validation = missing.transform(X_validation) 388 | 389 | # golden features 390 | if self._golden_features is not None: 391 | X_validation = self._golden_features.transform(X_validation) 392 | 393 | if self._kmeans is not None: 394 | X_validation = self._kmeans.transform(X_validation) 395 | 396 | for convert in self._categorical: 397 | if X_validation is not None and convert is not None: 398 | X_validation = convert.transform(X_validation) 399 | 400 | for dtt in self._datetime_transforms: 401 | if X_validation is not None and dtt is not None: 402 | X_validation = dtt.transform(X_validation) 403 | 404 | for scale in self._scale: 405 | if X_validation is not None and scale is not None: 406 | X_validation = scale.transform(X_validation) 407 | 408 | if self._add_random_feature: 409 | # -1, 1, with 0 mean 410 | X_validation["random_feature"] = ( 411 | np.random.rand(X_validation.shape[0]) * 2.0 - 1.0 412 | ) 413 | 414 | if self._drop_features and X_validation is not None: 415 | X_validation.drop(self._drop_features, axis=1, inplace=True) 416 | 417 | if X_validation is not None: 418 | # there can be catagorical columns (in CatBoost) which cant be clipped 419 | numeric_cols = X_validation.select_dtypes(include="number").columns.tolist() 420 | X_validation[numeric_cols] = X_validation[numeric_cols].clip( 421 | lower=np.finfo(np.float32).min + 1000, 422 | upper=np.finfo(np.float32).max - 1000, 423 | ) 424 | 425 | return X_validation, y_validation, sample_weight_validation 426 | 427 | def inverse_scale_target(self, y): 428 | if self._scale_y is not None: 429 | y = pd.DataFrame({"target": y}) 430 | y = self._scale_y.inverse_transform(y) 431 | y = y["target"] 432 | return y 433 | 434 | def inverse_categorical_target(self, y): 435 | if self._categorical_y is not None: 436 | y = self._categorical_y.inverse_transform(y) 437 | y = y.astype(str) 438 | return y 439 | 440 | def get_target_class_names(self): 441 | pos_label, neg_label = "1", "0" 442 | if self._categorical_y is not None: 443 | if self._params["ml_task"] == BINARY_CLASSIFICATION: 444 | # binary classification 445 | for label, value in self._categorical_y.to_json().items(): 446 | if value == 1: 447 | pos_label = label 448 | else: 449 | neg_label = label 450 | return [neg_label, pos_label] 451 | else: 452 | # multiclass classification 453 | # logger.debug(self._categorical_y.to_json()) 454 | if "unique_values" not in self._categorical_y.to_json(): 455 | labels = dict( 456 | (v, k) for k, v in self._categorical_y.to_json().items() 457 | ) 458 | else: 459 | labels = { 460 | i: v 461 | for i, v in enumerate( 462 | self._categorical_y.to_json()["unique_values"] 463 | ) 464 | } 465 | 466 | return list(labels.values()) 467 | 468 | else: # self._categorical_y is None 469 | if "ml_task" in self._params: 470 | if self._params["ml_task"] == BINARY_CLASSIFICATION: 471 | return ["0", "1"] 472 | return [] 473 | 474 | def prepare_target_labels(self, y): 475 | pos_label, neg_label = "1", "0" 476 | 477 | if self._categorical_y is not None: 478 | if len(y.shape) == 1: 479 | # binary classification 480 | for 
label, value in self._categorical_y.to_json().items(): 481 | if value == 1: 482 | pos_label = label 483 | else: 484 | neg_label = label 485 | # threshold is applied in AutoML class 486 | return pd.DataFrame( 487 | { 488 | "prediction_{}".format(neg_label): 1 - y, 489 | "prediction_{}".format(pos_label): y, 490 | } 491 | ) 492 | else: 493 | # multiclass classification 494 | if "unique_values" not in self._categorical_y.to_json(): 495 | labels = dict( 496 | (v, k) for k, v in self._categorical_y.to_json().items() 497 | ) 498 | else: 499 | labels = { 500 | i: v 501 | for i, v in enumerate( 502 | self._categorical_y.to_json()["unique_values"] 503 | ) 504 | } 505 | 506 | d = {} 507 | cols = [] 508 | for i in range(y.shape[1]): 509 | d["prediction_{}".format(labels[i])] = y[:, i] 510 | cols += ["prediction_{}".format(labels[i])] 511 | df = pd.DataFrame(d) 512 | df["label"] = np.argmax(np.array(df[cols]), axis=1) 513 | 514 | df["label"] = df["label"].map(labels) 515 | 516 | return df 517 | else: # self._categorical_y is None 518 | if "ml_task" in self._params: 519 | if self._params["ml_task"] == BINARY_CLASSIFICATION: 520 | return pd.DataFrame({"prediction_0": 1 - y, "prediction_1": y}) 521 | elif self._params["ml_task"] == MULTICLASS_CLASSIFICATION: 522 | return pd.DataFrame( 523 | data=y, 524 | columns=["prediction_{}".format(i) for i in range(y.shape[1])], 525 | ) 526 | 527 | return pd.DataFrame({"prediction": y}) 528 | 529 | def to_json(self): 530 | preprocessing_params = {} 531 | if self._remove_columns: 532 | preprocessing_params["remove_columns"] = self._remove_columns 533 | if self._missing_values is not None and len(self._missing_values): 534 | mvs = [] # refactor 535 | for mv in self._missing_values: 536 | if mv.to_json(): 537 | mvs += [mv.to_json()] 538 | if mvs: 539 | preprocessing_params["missing_values"] = mvs 540 | if self._categorical is not None and len(self._categorical): 541 | cats = [] # refactor 542 | for cat in self._categorical: 543 | if cat.to_json(): 544 | cats += [cat.to_json()] 545 | if cats: 546 | preprocessing_params["categorical"] = cats 547 | 548 | if self._datetime_transforms is not None and len(self._datetime_transforms): 549 | dtts = [] 550 | for dtt in self._datetime_transforms: 551 | dtts += [dtt.to_json()] 552 | if dtts: 553 | preprocessing_params["datetime_transforms"] = dtts 554 | 555 | if self._text_transforms is not None and len(self._text_transforms): 556 | tts = [] 557 | for tt in self._text_transforms: 558 | tts += [tt.to_json()] 559 | if tts: 560 | preprocessing_params["text_transforms"] = tts 561 | 562 | if self._golden_features is not None: 563 | preprocessing_params["golden_features"] = self._golden_features.to_json() 564 | 565 | if self._kmeans is not None: 566 | preprocessing_params["kmeans"] = self._kmeans.to_json() 567 | 568 | if self._scale is not None and len(self._scale): 569 | scs = [sc.to_json() for sc in self._scale if sc.to_json()] 570 | if scs: 571 | preprocessing_params["scale"] = scs 572 | if self._categorical_y is not None: 573 | cat_y = self._categorical_y.to_json() 574 | if cat_y: 575 | preprocessing_params["categorical_y"] = cat_y 576 | if self._scale_y is not None: 577 | preprocessing_params["scale_y"] = self._scale_y.to_json() 578 | 579 | if "ml_task" in self._params: 580 | preprocessing_params["ml_task"] = self._params["ml_task"] 581 | 582 | if self._add_random_feature: 583 | preprocessing_params["add_random_feature"] = True 584 | 585 | if self._drop_features: 586 | preprocessing_params["drop_features"] = self._drop_features 587 
| 588 | preprocessing_params["params"] = self._params 589 | 590 | return preprocessing_params 591 | 592 | def from_json(self, data_json, results_path): 593 | self._params = data_json.get("params", self._params) 594 | 595 | if "remove_columns" in data_json: 596 | self._remove_columns = data_json.get("remove_columns", []) 597 | if "missing_values" in data_json: 598 | self._missing_values = [] 599 | for mv_data in data_json["missing_values"]: 600 | mv = PreprocessingMissingValues() 601 | mv.from_json(mv_data) 602 | self._missing_values += [mv] 603 | if "categorical" in data_json: 604 | self._categorical = [] 605 | for cat_data in data_json["categorical"]: 606 | cat = PreprocessingCategorical() 607 | cat.from_json(cat_data) 608 | self._categorical += [cat] 609 | 610 | if "datetime_transforms" in data_json: 611 | self._datetime_transforms = [] 612 | for dtt_params in data_json["datetime_transforms"]: 613 | dtt = DateTimeTransformer() 614 | dtt.from_json(dtt_params) 615 | self._datetime_transforms += [dtt] 616 | 617 | if "text_transforms" in data_json: 618 | self._text_transforms = [] 619 | for tt_params in data_json["text_transforms"]: 620 | tt = TextTransformer() 621 | tt.from_json(tt_params) 622 | self._text_transforms += [tt] 623 | 624 | if "golden_features" in data_json: 625 | self._golden_features = GoldenFeaturesTransformer() 626 | self._golden_features.from_json(data_json["golden_features"], results_path) 627 | 628 | if "kmeans" in data_json: 629 | self._kmeans = KMeansTransformer() 630 | self._kmeans.from_json(data_json["kmeans"], results_path) 631 | 632 | if "scale" in data_json: 633 | self._scale = [] 634 | for scale_data in data_json["scale"]: 635 | sc = Scale() 636 | sc.from_json(scale_data) 637 | self._scale += [sc] 638 | if "categorical_y" in data_json: 639 | if "new_columns" in data_json["categorical_y"]: 640 | self._categorical_y = LabelBinarizer() 641 | else: 642 | self._categorical_y = LabelEncoder() 643 | 644 | self._categorical_y.from_json(data_json["categorical_y"]) 645 | if "scale_y" in data_json: 646 | self._scale_y = Scale() 647 | self._scale_y.from_json(data_json["scale_y"]) 648 | if "ml_task" in data_json: 649 | self._params["ml_task"] = data_json["ml_task"] 650 | 651 | self._add_random_feature = data_json.get("add_random_feature", False) 652 | self._drop_features = data_json.get("drop_features", []) 653 | ``` -------------------------------------------------------------------------------- /supervised/automl.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | import matplotlib 4 | 5 | import warnings 6 | 7 | warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") 8 | 9 | from collections.abc import Iterable 10 | 11 | # libraries for type hints 12 | from typing import List, Optional, Union 13 | 14 | import numpy 15 | import pandas 16 | from typing_extensions import ( 17 | Literal, 18 | ) # typing_extensions is used for using Literal from python 3.7 19 | 20 | from supervised.base_automl import BaseAutoML 21 | from supervised.utils.config import LOG_LEVEL 22 | 23 | logging.basicConfig( 24 | format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR 25 | ) 26 | logger = logging.getLogger(__name__) 27 | logger.setLevel(LOG_LEVEL) 28 | 29 | 30 | class AutoML(BaseAutoML): 31 | 32 | """ 33 | Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression). 
34 | """ 35 | 36 | def __init__( 37 | self, 38 | results_path: Optional[str] = None, 39 | total_time_limit: int = 60 * 60, 40 | mode: Literal["Explain", "Perform", "Compete", "Optuna"] = "Explain", 41 | ml_task: Literal[ 42 | "auto", "binary_classification", "multiclass_classification", "regression" 43 | ] = "auto", 44 | model_time_limit: Optional[int] = None, 45 | algorithms: Union[ 46 | Literal["auto"], 47 | List[ 48 | Literal[ 49 | "Baseline", 50 | "Linear", 51 | "Decision Tree", 52 | "Random Forest", 53 | "Extra Trees", 54 | "LightGBM", 55 | "Xgboost", 56 | "CatBoost", 57 | "Neural Network", 58 | "Nearest Neighbors", 59 | ] 60 | ], 61 | ] = "auto", 62 | train_ensemble: bool = True, 63 | stack_models: Union[Literal["auto"], bool] = "auto", 64 | eval_metric: str = "auto", 65 | validation_strategy: Union[Literal["auto"], dict] = "auto", 66 | explain_level: Union[Literal["auto"], Literal[0, 1, 2]] = "auto", 67 | golden_features: Union[Literal["auto"], bool, int] = "auto", 68 | features_selection: Union[Literal["auto"], bool] = "auto", 69 | start_random_models: Union[Literal["auto"], int] = "auto", 70 | hill_climbing_steps: Union[Literal["auto"], int] = "auto", 71 | top_models_to_improve: Union[Literal["auto"], int] = "auto", 72 | boost_on_errors: Union[Literal["auto"], bool] = "auto", 73 | kmeans_features: Union[Literal["auto"], bool] = "auto", 74 | mix_encoding: Union[Literal["auto"], bool] = "auto", 75 | max_single_prediction_time: Optional[Union[int, float]] = None, 76 | optuna_time_budget: Optional[int] = None, 77 | optuna_init_params: dict = {}, 78 | optuna_verbose: bool = True, 79 | fairness_metric: str = "auto", 80 | fairness_threshold: Union[Literal["auto"], float] = "auto", 81 | privileged_groups: Union[Literal["auto"], list] = "auto", 82 | underprivileged_groups: Union[Literal["auto"], list] = "auto", 83 | n_jobs: int = -1, 84 | verbose: int = 1, 85 | random_state: int = 1234, 86 | ): 87 | """ 88 | Initialize `AutoML` object. 89 | 90 | Arguments: 91 | results_path (str): The path with results. If None, then the name of directory will be generated with the template: AutoML_{number}, 92 | where the number can be from 1 to 1,000 - depends which direcory name will be available. 93 | If the `results_path` will point to directory with AutoML results (`params.json` must be present), 94 | then all models will be loaded. 95 | 96 | total_time_limit (int): The total time limit in seconds for AutoML training. 97 | It is not used when `model_time_limit` is not `None`. 98 | 99 | mode (str): Can be {`Explain`, `Perform`, `Compete`, `Optuna`}. This parameter defines the goal of AutoML and how intensive the AutoML search will be. 100 | 101 | - `Explain` : To to be used when the user wants to explain and understand the data. 102 | - Uses 75%/25% train/test split. 103 | - Uses the following models: `Baseline`, `Linear`, `Decision Tree`, `Random Forest`, `XGBoost`, `Neural Network`, and `Ensemble`. 104 | - Has full explanations in reports: learning curves, importance plots, and SHAP plots. 105 | - `Perform` : To be used when the user wants to train a model that will be used in real-life use cases. 106 | - Uses 5-fold CV (Cross-Validation). 107 | - Uses the following models: `Linear`, `Random Forest`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`, and `Ensemble`. 108 | - Has learning curves and importance plots in reports. 109 | - `Compete` : To be used for machine learning competitions (maximum performance). 
110 | - Uses 80/20 train/test split, or 5-fold CV, or 10-fold CV (Cross-Validation) - it depends on `total_time_limit`. If not set directly, AutoML will select validation automatically. 111 | - Uses the following models: `Decision Tree`, `Random Forest`, `Extra Trees`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`, 112 | `Nearest Neighbors`, `Ensemble`, and `Stacking`. 113 | - It has only learning curves in the reports. 114 | - `Optuna` : To be used for creating highly-tuned machine learning models. 115 | - Uses 10-fold CV (Cross-Validation). 116 | - It tunes with Optuna the following algorithms: `Random Forest`, `Extra Trees`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`. 117 | - It applies `Ensemble` and `Stacking` for trained models. 118 | - It has only learning curves in the reports. 119 | 120 | ml_task (str): Can be {"auto", "binary_classification", "multiclass_classification", "regression"}. 121 | 122 | - If left `auto` AutoML will try to guess the task based on target values. 123 | - If there are only 2 values in the target, then the task will be set to `"binary_classification"`. 124 | - If the number of values in the target is between 2 and 20 (inclusive), then the task will be set to `"multiclass_classification"`. 125 | - In all other cases, the task is set to `"regression"`. 126 | 127 | model_time_limit (int): The time limit for training a single model, in seconds. 128 | If `model_time_limit` is set, the `total_time_limit` is not respected. 129 | The single model can contain several learners. The time limit for subsequent learners is computed based on `model_time_limit`. 130 | 131 | For example, in the case of 10-fold cross-validation, one model will have 10 learners. 132 | The `model_time_limit` is the time for all 10 learners. 133 | 134 | algorithms (list of str): The list of algorithms that will be used in the training. 135 | The algorithms can be: 136 | 137 | - `Baseline`, 138 | - `Linear`, 139 | - `Decision Tree`, 140 | - `Random Forest`, 141 | - `Extra Trees`, 142 | - `LightGBM`, 143 | - `Xgboost`, 144 | - `CatBoost`, 145 | - `Neural Network`, 146 | - `Nearest Neighbors`, 147 | 148 | 149 | train_ensemble (boolean): Whether an ensemble gets created at the end of the training. 150 | 151 | stack_models (boolean): Whether a models stack gets created at the end of the training. Stack level is 1. 152 | 153 | eval_metric (str): The metric to be used in early stopping and to compare models. 154 | 155 | - for binary classification: `logloss`, `auc`, `f1`, `average_precision`, `accuracy` - default is logloss (if left "auto") 156 | - for multiclass classification: `logloss`, `f1`, `accuracy` - default is `logloss` (if left "auto") 157 | - for regression: `rmse`, `mse`, `mae`, `r2`, `mape`, `spearman`, `pearson` - default is `rmse` (if left "auto") 158 | 159 | validation_strategy (dict): Dictionary with validation type. Right now train/test split and cross-validation are supported. 160 | 161 | Example: 162 | 163 | Cross-validation example: 164 | { 165 | "validation_type": "kfold", 166 | "k_folds": 5, 167 | "shuffle": True, 168 | "stratify": True, 169 | "random_seed": 123 170 | } 171 | 172 | Train/test example: 173 | { 174 | "validation_type": "split", 175 | "train_ratio": 0.75, 176 | "shuffle": True, 177 | "stratify": True 178 | } 179 | 180 | explain_level (int): The level of explanations included for each model: 181 | 182 | - if `explain_level` is `0` no explanations are produced.
183 | - if `explain_level` is `1` the following explanations are produced: an importance plot (with the permutation method), tree plots for decision trees, and saved coefficients for linear models. 184 | - if `explain_level` is `2` the following explanations are produced: the same as `1` plus SHAP explanations. 185 | 186 | If left `auto` AutoML will produce explanations based on the selected `mode`. 187 | 188 | golden_features (boolean or int): Whether to use golden features (and how many should be added). 189 | If left `auto` AutoML will use golden features based on the selected `mode`: 190 | 191 | - If `mode` is "Explain", `golden_features` = False. 192 | - If `mode` is "Perform", `golden_features` = True. 193 | - If `mode` is "Compete", `golden_features` = True. 194 | 195 | If a `boolean` value is set, then the number of Golden Features is set automatically. 196 | It is set to min(100, max(10, 0.1*number_of_input_features)). 197 | 198 | If an `int` value is set, the number of Golden Features is set to this value. 199 | 200 | features_selection (boolean): Whether to perform features selection. 201 | If left `auto` AutoML will do feature selection based on the selected `mode`: 202 | 203 | - If `mode` is "Explain", `features_selection` = False. 204 | - If `mode` is "Perform", `features_selection` = True. 205 | - If `mode` is "Compete", `features_selection` = True. 206 | 207 | start_random_models (int): Number of starting random models to try. 208 | If left `auto` AutoML will select it based on the selected `mode`: 209 | 210 | - If `mode` is "Explain", `start_random_models` = 1. 211 | - If `mode` is "Perform", `start_random_models` = 5. 212 | - If `mode` is "Compete", `start_random_models` = 10. 213 | 214 | hill_climbing_steps (int): Number of steps to perform during hill climbing. 215 | If left `auto` AutoML will select it based on the selected `mode`: 216 | 217 | - If `mode` is "Explain", `hill_climbing_steps` = 0. 218 | - If `mode` is "Perform", `hill_climbing_steps` = 2. 219 | - If `mode` is "Compete", `hill_climbing_steps` = 2. 220 | 221 | top_models_to_improve (int): Number of best models to improve in `hill_climbing` steps. 222 | If left `auto` AutoML will select it based on the selected `mode`: 223 | 224 | - If `mode` is "Explain", `top_models_to_improve` = 0. 225 | - If `mode` is "Perform", `top_models_to_improve` = 2. 226 | - If `mode` is "Compete", `top_models_to_improve` = 3. 227 | 228 | boost_on_errors (boolean): Whether a model with boost on errors from the previous best model should be trained. By default available in the `Compete` mode. 229 | 230 | kmeans_features (boolean): Whether a model with k-means generated features should be trained. By default available in the `Compete` mode. 231 | 232 | mix_encoding (boolean): Whether a model with mixed encoding should be trained. Mixed encoding is the encoding that uses label encoding 233 | for categoricals with more than 25 categories, and one-hot binary encoding for other categoricals. It is only applied if there are 234 | categorical features with cardinality smaller than 25. By default it is available in the `Compete` mode. 235 | 236 | max_single_prediction_time (int or float): The limit on prediction time for a single sample. Use it if you want to have a model with fast predictions. 237 | Ideal for creating ML pipelines used as a REST API. Time is in seconds. By default (`max_single_prediction_time=None`) models are not optimized for fast predictions, 238 | except the mode `Perform`. For the mode `Perform` the default is `0.5` seconds.
239 | 240 | optuna_time_budget (int): The time in seconds which should be used by Optuna to tune each algorithm. It is the time for tuning a single algorithm. 241 | If you select two algorithms: Xgboost and CatBoost, and set optuna_time_budget=1000, then Xgboost will be tuned for 1000 seconds and CatBoost will be tuned for 1000 seconds. 242 | Moreover, the tuning is performed for each data type, for example for raw data and for data with Golden Features inserted. 243 | This parameter is only used when `mode="Optuna"`. If you set `mode="Optuna"` and forget to set this parameter, it will be set to 3600 seconds. 244 | 245 | optuna_init_params (dict): If you have already tuned parameters from Optuna you can reuse them by setting this parameter. 246 | This parameter is only used when `mode="Optuna"`. The dict should have the structure and params as specified in the MLJAR AutoML documentation. 247 | 248 | optuna_verbose (boolean): If True, the Optuna tuning details are displayed. Set to `True` by default. 249 | 250 | fairness_metric (string): Name of the fairness metric that will be used for assessing fairness criteria. 251 | Available metrics for binary and multiclass classification: 252 | 253 | - `demographic_parity_difference`, 254 | - `demographic_parity_ratio` - default metric, 255 | - `equalized_odds_difference`, 256 | - `equalized_odds_ratio`. 257 | 258 | Metrics for regression: 259 | 260 | - `group_loss_difference`, 261 | - `group_loss_ratio` - default metric. 262 | 263 | 264 | fairness_threshold (float): The threshold value for the fairness metric. 265 | The optimization direction (below or above the threshold) of the fairness metric is determined automatically. 266 | 267 | Default values: 268 | 269 | - for `demographic_parity_difference` the metric value should be below 0.1, 270 | - for `demographic_parity_ratio` the metric value should be above 0.8, 271 | - for `equalized_odds_difference` the metric value should be below 0.1, 272 | - for `equalized_odds_ratio` the metric value should be above 0.8. 273 | - for `group_loss_ratio` the metric value should be above 0.8. 274 | 275 | For `group_loss_difference` the default threshold value can't be set because it depends on the dataset. 276 | If the `group_loss_difference` metric is used and `fairness_threshold` is not specified manually, then an exception will be raised. 277 | 278 | privileged_groups (list): The list of privileged groups. 279 | 280 | By default, the list of privileged groups is detected automatically based on fairness metrics. 281 | For example, in a binary classification task, a privileged group is the one with the highest selection rate. 282 | 283 | Example value: `[{"sex": "Male"}]` 284 | 285 | underprivileged_groups (list): The list of underprivileged groups. 286 | 287 | By default, the list of underprivileged groups is detected automatically based on fairness metrics. 288 | For example, in a binary classification task, an underprivileged group is the one with the lowest selection rate. 289 | 290 | Example value: `[{"sex": "Female"}]` 291 | 292 | n_jobs (int): Number of CPU cores to be used. By default it is set to `-1`, which means using all processors. 293 | 294 | verbose (int): Controls the verbosity when fitting and predicting.
295 | 296 | Note: 297 | Not implemented yet, please leave it set to `1` 298 | 299 | random_state (int): Controls the randomness of the `AutoML`. 300 | 301 | 302 | Examples: 303 | 304 | Binary Classification Example: 305 | 306 | >>> import pandas as pd 307 | >>> from sklearn.model_selection import train_test_split 308 | >>> from sklearn.metrics import roc_auc_score 309 | >>> from supervised import AutoML 310 | >>> df = pd.read_csv( 311 | ... "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv", 312 | ... skipinitialspace=True 313 | ... ) 314 | >>> X_train, X_test, y_train, y_test = train_test_split( 315 | ... df[df.columns[:-1]], df["income"], test_size=0.25 316 | ... ) 317 | >>> automl = AutoML() 318 | >>> automl.fit(X_train, y_train) 319 | >>> y_pred_prob = automl.predict_proba(X_test) 320 | >>> print(f"AUROC: {roc_auc_score(y_test, y_pred_prob[:, 1]):.2f}") 321 | 322 | 323 | Multi-Class Classification Example: 324 | 325 | >>> import pandas as pd 326 | >>> from sklearn.datasets import load_digits 327 | >>> from sklearn.metrics import accuracy_score 328 | >>> from sklearn.model_selection import train_test_split 329 | >>> from supervised import AutoML 330 | >>> digits = load_digits() 331 | >>> X_train, X_test, y_train, y_test = train_test_split( 332 | ... digits.data, digits.target, stratify=digits.target, test_size=0.25, 333 | ... random_state=123 334 | ... ) 335 | >>> automl = AutoML(mode="Perform") 336 | >>> automl.fit(X_train, y_train) 337 | >>> y_pred = automl.predict(X_test) 338 | >>> print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}") 339 | 340 | Regression Example: 341 | 342 | >>> import pandas as pd 343 | >>> from sklearn.datasets import fetch_california_housing 344 | >>> from sklearn.model_selection import train_test_split 345 | >>> from sklearn.metrics import mean_squared_error 346 | >>> from supervised import AutoML 347 | >>> housing = fetch_california_housing() 348 | >>> X_train, X_test, y_train, y_test = train_test_split( 349 | ... pd.DataFrame(housing.data, columns=housing.feature_names), 350 | ... housing.target, 351 | ... test_size=0.25, 352 | ... random_state=123, 353 | ...
) 354 | >>> automl = AutoML(mode="Compete") 355 | >>> automl.fit(X_train, y_train) 356 | >>> print("Test R^2:", automl.score(X_test, y_test)) 357 | 358 | Scikit-learn Pipeline Integration Example: 359 | 360 | >>> from imblearn.over_sampling import RandomOverSampler 361 | >>> from sklearn.pipeline import make_pipeline 362 | >>> from sklearn.datasets import make_classification 363 | >>> from sklearn.model_selection import train_test_split 364 | >>> from supervised import AutoML 365 | >>> X, y = make_classification() 366 | >>> X_train, X_test, y_train, y_test = train_test_split(X,y) 367 | >>> pipeline = make_pipeline(RandomOverSampler(), AutoML()) 368 | >>> print(pipeline.fit(X_train, y_train).score(X_test, y_test)) 369 | 370 | """ 371 | super(AutoML, self).__init__() 372 | # Set user arguments 373 | self.mode = mode 374 | self.ml_task = ml_task 375 | self.results_path = results_path 376 | self.total_time_limit = total_time_limit 377 | self.model_time_limit = model_time_limit 378 | self.algorithms = algorithms 379 | self.train_ensemble = train_ensemble 380 | self.stack_models = stack_models 381 | self.eval_metric = eval_metric 382 | self.validation_strategy = validation_strategy 383 | self.verbose = verbose 384 | self.explain_level = explain_level 385 | self.golden_features = golden_features 386 | self.features_selection = features_selection 387 | self.start_random_models = start_random_models 388 | self.hill_climbing_steps = hill_climbing_steps 389 | self.top_models_to_improve = top_models_to_improve 390 | self.boost_on_errors = boost_on_errors 391 | self.kmeans_features = kmeans_features 392 | self.mix_encoding = mix_encoding 393 | self.max_single_prediction_time = max_single_prediction_time 394 | self.optuna_time_budget = optuna_time_budget 395 | self.optuna_init_params = optuna_init_params 396 | self.optuna_verbose = optuna_verbose 397 | self.fairness_metric = fairness_metric 398 | self.fairness_threshold = fairness_threshold 399 | self.privileged_groups = privileged_groups 400 | self.underprivileged_groups = underprivileged_groups 401 | self.n_jobs = n_jobs 402 | self.random_state = random_state 403 | 404 | def fit( 405 | self, 406 | X: Union[numpy.ndarray, pandas.DataFrame], 407 | y: Union[numpy.ndarray, pandas.Series], 408 | sample_weight: Optional[Union[numpy.ndarray, pandas.Series]] = None, 409 | cv: Optional[Union[Iterable, List]] = None, 410 | sensitive_features: Optional[ 411 | Union[numpy.ndarray, pandas.Series, pandas.DataFrame] 412 | ] = None, 413 | ): 414 | """Fit the AutoML model. 415 | 416 | Arguments: 417 | X (numpy.ndarray or pandas.DataFrame): Training data 418 | 419 | y (numpy.ndarray or pandas.Series): Training targets 420 | 421 | sample_weight (numpy.ndarray or pandas.Series): Training sample weights 422 | 423 | cv (iterable or list): List or iterable with (train, validation) splits representing array of indices. 424 | It is used only with custom validation (`validation_strategy={'validation_type': 'custom'}`). 
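A hypothetical sketch of passing custom splits (the `X` and `y` names are assumptions):

>>> from sklearn.model_selection import KFold
>>> automl = AutoML(validation_strategy={"validation_type": "custom"})
>>> automl.fit(X, y, cv=KFold(n_splits=5, shuffle=True).split(X))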
425 | 426 | sensitive_features (pandas.Series or pandas.DataFrame): Sensitive features to learn fair models 427 | 428 | Returns: 429 | AutoML object: Returns `self` 430 | """ 431 | original_backend = matplotlib.get_backend() 432 | try: 433 | matplotlib.use("Agg") 434 | return self._fit(X, y, sample_weight, cv, sensitive_features) 435 | except Exception as e: 436 | raise e 437 | finally: 438 | matplotlib.use(original_backend) 439 | try: 440 | if 'inline' in original_backend: 441 | import matplotlib_inline 442 | matplotlib_inline.backend_inline._enable_matplotlib_integration() 443 | except Exception: 444 | pass 445 | 446 | 447 | def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndarray: 448 | """ 449 | Computes predictions from AutoML best model. 450 | 451 | Arguments: 452 | X (list or numpy.ndarray or pandas.DataFrame): 453 | Input values to make predictions on. 454 | 455 | Returns: 456 | numpy.ndarray: 457 | 458 | - One-dimensional array of class labels for classification. 459 | - One-dimensional array of predictions for regression. 460 | 461 | Raises: 462 | AutoMLException: Model has not yet been fitted. 463 | """ 464 | return self._predict(X) 465 | 466 | def predict_proba( 467 | self, X: Union[List, numpy.ndarray, pandas.DataFrame] 468 | ) -> numpy.ndarray: 469 | """ 470 | Computes class probabilities from AutoML best model. 471 | This method can only be used for classification tasks. 472 | 473 | Arguments: 474 | X (list or numpy.ndarray or pandas.DataFrame): 475 | Input values to make predictions on. 476 | 477 | Returns: 478 | numpy.ndarray of shape (n_samples, n_classes): 479 | Matrix containing class probabilities of the input samples 480 | 481 | Raises: 482 | AutoMLException: Model has not yet been fitted. 483 | 484 | """ 485 | return self._predict_proba(X) 486 | 487 | def predict_all( 488 | self, X: Union[List, numpy.ndarray, pandas.DataFrame] 489 | ) -> pandas.DataFrame: 490 | """ 491 | Computes both class probabilities and class labels for classification tasks. 492 | Computes predictions for regression tasks. 493 | 494 | Arguments: 495 | X (list or numpy.ndarray or pandas.DataFrame): 496 | Input values to make predictions on. 497 | 498 | Returns: 499 | pandas.DataFrame: 500 | DataFrame (n_samples, n_classes + 1) containing both class probabilities and class 501 | labels of the input samples for classification tasks. 502 | DataFrame with predictions for regression tasks. 503 | 504 | Raises: 505 | AutoMLException: Model has not yet been fitted. 506 | 507 | """ 508 | return self._predict_all(X) 509 | 510 | def score( 511 | self, 512 | X: Union[numpy.ndarray, pandas.DataFrame], 513 | y: Optional[Union[numpy.ndarray, pandas.Series]] = None, 514 | sample_weight: Optional[Union[numpy.ndarray, pandas.Series]] = None, 515 | ) -> float: 516 | """Calculates a goodness of `fit` for an AutoML instance. 517 | 518 | Arguments: 519 | X (numpy.ndarray or pandas.DataFrame): 520 | Test values to make predictions on. 521 | 522 | y (numpy.ndarray or pandas.Series): 523 | True labels for X. 524 | 525 | sample_weight (numpy.ndarray or pandas.Series): 526 | Sample weights. 527 | Returns: 528 | float: Returns a goodness of fit measure (higher is better): 529 | 530 | - For classification tasks: returns the mean accuracy on the given test data and labels. 531 | - For regression tasks: returns the R^2 (coefficient of determination) on the given test data and labels.
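Example (hypothetical; assumes a fitted instance and held-out `X_test`, `y_test`):

>>> automl.score(X_test, y_test) # mean accuracy or R^2, depending on the task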
532 | """ 533 | return self._score(X, y, sample_weight) 534 | 535 | def report(self, width=900, height=1200): 536 | return self._report(width, height) 537 | 538 | def need_retrain( 539 | self, 540 | X: Union[numpy.ndarray, pandas.DataFrame], 541 | y: Union[numpy.ndarray, pandas.Series], 542 | sample_weight: Optional[Union[numpy.ndarray, pandas.Series]] = None, 543 | decrease: float = 0.1, 544 | ) -> bool: 545 | """Decides about model retraining based on new data. 546 | 547 | Arguments: 548 | X (numpy.ndarray or pandas.DataFrame): 549 | New data. 550 | 551 | y (numpy.ndarray or pandas.Series): 552 | True labels for X. 553 | 554 | sample_weight (numpy.ndarray or pandas.Series): 555 | Sample weights. 556 | 557 | decrease (float): The ratio of change in the performance used as a threshold for retraining decision. 558 | By default, it is set to `0.1` which means that if the performance of AutoML will decrease by 10% 559 | on new data then there is a need to retrain. This value should be set depending on your project needs. 560 | Sometimes, 10% is enough, but for some projects, it can be even lower than 1%. 561 | 562 | Returns: 563 | boolean: Decides if there is a need to retrain the AutoML. 564 | """ 565 | return self._need_retrain(X, y, sample_weight, decrease) 566 | ```