This is page 4 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ ├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── 
adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── test_importance.py │ ├── test_learning_curves.py │ ├── test_metric.py │ ├── test_shap.py │ └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py ``` # Files -------------------------------------------------------------------------------- /tests/tests_automl/test_explain_levels.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | from supervised.algorithms.random_forest import additional 10 | 11 | additional["max_steps"] = 3 12 | additional["trees_in_step"] = 1 13 | 14 | from supervised.algorithms.xgboost import additional 15 | 16 | additional["max_rounds"] = 1 17 | 18 | 19 | class AutoMLExplainLevelsTest(unittest.TestCase): 20 | automl_dir = "AutoMLExplainLevelsTest" 21 | 22 | def setUp(self): 23 | shutil.rmtree(self.automl_dir, ignore_errors=True) 24 | 25 | def tearDown(self): 26 | shutil.rmtree(self.automl_dir, ignore_errors=True) 27 | 28 | def 
run_explain_default(self, task, alg): 29 | shutil.rmtree(self.automl_dir, ignore_errors=True) 30 | a = AutoML( 31 | results_path=self.automl_dir, 32 | total_time_limit=10, 33 | algorithms=[alg], 34 | train_ensemble=False, 35 | validation_strategy={ 36 | "validation_type": "kfold", 37 | "k_folds": 2, 38 | "shuffle": True, 39 | "stratify": True, 40 | }, 41 | start_random_models=1, 42 | ) 43 | 44 | if task == "binary": 45 | X, y = datasets.make_classification( 46 | n_samples=100, 47 | n_features=5, 48 | n_informative=4, 49 | n_redundant=1, 50 | n_classes=2, 51 | n_clusters_per_class=3, 52 | n_repeated=0, 53 | shuffle=False, 54 | random_state=0, 55 | ) 56 | elif task == "multi": 57 | X, y = datasets.make_classification( 58 | n_samples=100, 59 | n_features=5, 60 | n_informative=4, 61 | n_redundant=1, 62 | n_classes=5, 63 | n_clusters_per_class=3, 64 | n_repeated=0, 65 | shuffle=False, 66 | random_state=0, 67 | ) 68 | else: 69 | X, y = datasets.make_regression( 70 | n_samples=100, 71 | n_features=5, 72 | n_informative=4, 73 | shuffle=False, 74 | random_state=0, 75 | ) 76 | 77 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 78 | 79 | a.fit(X, y) 80 | 81 | result_files = os.listdir( 82 | os.path.join(self.automl_dir, f'1_Default_{alg.replace(" ", "")}') 83 | ) 84 | 85 | # There should be files with: 86 | # - permutation importance 87 | # - shap importance 88 | # - shap dependence 89 | # - shap decisions 90 | 91 | # Check permutation importance 92 | produced = False 93 | for f in result_files: 94 | if "importance.csv" in f and "shap" not in f: 95 | produced = True 96 | break 97 | self.assertTrue(produced) 98 | # Check shap importance 99 | produced = False 100 | for f in result_files: 101 | if "importance.csv" in f and "shap" in f: 102 | produced = True 103 | break 104 | self.assertTrue(produced) 105 | # Check shap dependence 106 | produced = False 107 | for f in result_files: 108 | if "shap_dependence" in f: 109 | produced = True 110 | break 111 | self.assertTrue(produced) 112 | # Check shap decisions 113 | produced = False 114 | for f in result_files: 115 | if "decisions.png" in f: 116 | produced = True 117 | break 118 | self.assertTrue(produced) 119 | 120 | # def test_explain_default(self): 121 | 122 | # for task in ["binary", "multi", "regression"]: 123 | # for alg in ["Xgboost", "Random Forest", "LightGBM"]: 124 | # self.run_explain_default(task, alg) 125 | 126 | def test_no_explain_linear(self): 127 | a = AutoML( 128 | results_path=self.automl_dir, 129 | total_time_limit=1, 130 | algorithms=["Linear"], 131 | train_ensemble=False, 132 | validation_strategy={ 133 | "validation_type": "kfold", 134 | "k_folds": 2, 135 | "shuffle": True, 136 | "stratify": True, 137 | }, 138 | explain_level=0, 139 | start_random_models=1, 140 | ) 141 | 142 | X, y = datasets.make_regression( 143 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 144 | ) 145 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 146 | 147 | a.fit(X, y) 148 | 149 | result_files = os.listdir(os.path.join(self.automl_dir, "1_Linear")) 150 | 151 | # There should be no files with: 152 | # - permutation importance 153 | # - shap importance 154 | # - shap dependence 155 | # - shap decisions 156 | 157 | # Check permutation importance 158 | produced = False 159 | for f in result_files: 160 | if "importance.csv" in f and "shap" not in f: 161 | produced = True 162 | break 163 | self.assertFalse(produced) 164 | # Check shap importance 165 | produced = False 166 | for f in result_files: 
167 | if "importance.csv" in f and "shap" in f: 168 | produced = True 169 | break 170 | self.assertFalse(produced) 171 | # Check shap dependence 172 | produced = False 173 | for f in result_files: 174 | if "dependence.png" in f: 175 | produced = True 176 | break 177 | self.assertFalse(produced) 178 | # Check shap decisions 179 | produced = False 180 | for f in result_files: 181 | if "decisions.png" in f: 182 | produced = True 183 | break 184 | self.assertFalse(produced) 185 | # Check coefficients 186 | produced = False 187 | for f in result_files: 188 | if "coefs.csv" in f: 189 | produced = True 190 | break 191 | self.assertFalse(produced) 192 | 193 | def test_explain_just_permutation_importance(self): 194 | a = AutoML( 195 | results_path=self.automl_dir, 196 | total_time_limit=1, 197 | algorithms=["Xgboost"], 198 | train_ensemble=False, 199 | validation_strategy={ 200 | "validation_type": "kfold", 201 | "k_folds": 2, 202 | "shuffle": True, 203 | "stratify": True, 204 | }, 205 | explain_level=1, 206 | start_random_models=1, 207 | ) 208 | 209 | X, y = datasets.make_regression( 210 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 211 | ) 212 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 213 | 214 | a.fit(X, y) 215 | 216 | result_files = os.listdir(os.path.join(self.automl_dir, "1_Default_Xgboost")) 217 | 218 | # There should be no files with: 219 | # - permutation importance 220 | # - shap importance 221 | # - shap dependence 222 | # - shap decisions 223 | 224 | # Check permutation importance 225 | produced = False 226 | for f in result_files: 227 | if "importance.csv" in f and "shap" not in f: 228 | produced = True 229 | break 230 | self.assertTrue(produced) 231 | # Check shap importance 232 | produced = False 233 | for f in result_files: 234 | if "importance.csv" in f and "shap" in f: 235 | produced = True 236 | break 237 | self.assertFalse(produced) 238 | # Check shap dependence 239 | produced = False 240 | for f in result_files: 241 | if "dependence.png" in f: 242 | produced = True 243 | break 244 | self.assertFalse(produced) 245 | # Check shap decisions 246 | produced = False 247 | for f in result_files: 248 | if "decisions.png" in f: 249 | produced = True 250 | break 251 | self.assertFalse(produced) 252 | 253 | def test_build_decision_tree(self): 254 | a = AutoML( 255 | results_path=self.automl_dir, 256 | total_time_limit=10, 257 | algorithms=["Decision Tree"], 258 | train_ensemble=False, 259 | validation_strategy={ 260 | "validation_type": "kfold", 261 | "k_folds": 2, 262 | "shuffle": True, 263 | "stratify": True, 264 | }, 265 | explain_level=2, 266 | start_random_models=1, 267 | ) 268 | 269 | X, y = datasets.make_regression( 270 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 271 | ) 272 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 273 | 274 | a.fit(X, y) 275 | 276 | result_files = os.listdir(os.path.join(self.automl_dir, "1_DecisionTree")) 277 | 278 | # There should be files with: 279 | # - decision tree visualization 280 | # - permutation importance 281 | # - shap importance 282 | # - shap dependence 283 | # - shap decisions 284 | 285 | # Check Decision Tree visualization 286 | produced = False 287 | for f in result_files: 288 | if "tree.svg" in f: 289 | produced = True 290 | break 291 | # disable ??? 
TODO 292 | # self.assertTrue(produced) 293 | 294 | # Check permutation importance 295 | produced = False 296 | for f in result_files: 297 | if "importance.csv" in f and "shap" not in f: 298 | produced = True 299 | break 300 | self.assertTrue(produced) 301 | # Check shap importance 302 | produced = False 303 | for f in result_files: 304 | if "importance.csv" in f and "shap" in f: 305 | produced = True 306 | break 307 | self.assertTrue(produced) 308 | # Check shap dependence 309 | produced = False 310 | for f in result_files: 311 | if "dependence.png" in f: 312 | produced = True 313 | break 314 | self.assertTrue(produced) 315 | # Check shap decisions 316 | produced = False 317 | for f in result_files: 318 | if "decisions.png" in f: 319 | produced = True 320 | break 321 | self.assertTrue(produced) 322 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_targets.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | import pytest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from supervised import AutoML 9 | from supervised.algorithms.xgboost import additional 10 | from supervised.exceptions import AutoMLException 11 | 12 | additional["max_rounds"] = 1 13 | 14 | 15 | class AutoMLTargetsTest(unittest.TestCase): 16 | automl_dir = "automl_tests" 17 | rows = 50 18 | 19 | def tearDown(self): 20 | shutil.rmtree(self.automl_dir, ignore_errors=True) 21 | 22 | def test_bin_class_01(self): 23 | X = np.random.rand(self.rows, 3) 24 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 25 | y = np.random.randint(0, 2, self.rows) 26 | 27 | automl = AutoML( 28 | results_path=self.automl_dir, 29 | total_time_limit=1, 30 | algorithms=["Xgboost"], 31 | train_ensemble=False, 32 | explain_level=0, 33 | start_random_models=1, 34 | ) 35 | automl.fit(X, y) 36 | pred = automl.predict(X) 37 | 38 | u = np.unique(pred) 39 | self.assertTrue(0 in u or 1 in u) 40 | self.assertTrue(len(u) <= 2) 41 | 42 | def test_bin_class_11(self): 43 | X = np.random.rand(self.rows, 3) 44 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 45 | y = np.random.randint(0, 2, self.rows) * 2 - 1 46 | 47 | automl = AutoML( 48 | results_path=self.automl_dir, 49 | total_time_limit=1, 50 | algorithms=["Xgboost"], 51 | train_ensemble=False, 52 | explain_level=0, 53 | start_random_models=1, 54 | ) 55 | automl.fit(X, y) 56 | p = automl.predict(X) 57 | pred = automl.predict(X) 58 | 59 | u = np.unique(pred) 60 | 61 | self.assertTrue(-1 in u or 1 in u) 62 | self.assertTrue(0 not in u) 63 | self.assertTrue(len(u) <= 2) 64 | 65 | def test_bin_class_AB(self): 66 | X = np.random.rand(self.rows, 3) 67 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 68 | y = np.random.permutation(["a", "B"] * int(self.rows / 2)) 69 | 70 | automl = AutoML( 71 | results_path=self.automl_dir, 72 | total_time_limit=1, 73 | algorithms=["Xgboost"], 74 | train_ensemble=False, 75 | explain_level=0, 76 | start_random_models=1, 77 | ) 78 | automl.fit(X, y) 79 | p = automl.predict(X) 80 | pred = automl.predict(X) 81 | u = np.unique(pred) 82 | self.assertTrue("a" in u or "B" in u) 83 | self.assertTrue(len(u) <= 2) 84 | 85 | def test_bin_class_AB_missing_targets(self): 86 | X = np.random.rand(self.rows, 3) 87 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 88 | y = pd.Series( 89 | np.random.permutation(["a", "B"] * int(self.rows / 2)), name="target" 90 | ) 91 | 92 | y.iloc[1] = None 93 | y.iloc[3] = 
np.NaN 94 | y.iloc[13] = np.nan 95 | 96 | automl = AutoML( 97 | results_path=self.automl_dir, 98 | total_time_limit=1, 99 | algorithms=["Xgboost"], 100 | train_ensemble=False, 101 | explain_level=0, 102 | start_random_models=1, 103 | ) 104 | 105 | with pytest.warns( 106 | expected_warning=UserWarning, 107 | match="There are samples with missing target values in the data which will be excluded for further analysis", 108 | ) as record: 109 | automl.fit(X, y) 110 | 111 | # check that only one warning was raised 112 | self.assertEqual(len(record), 1) 113 | 114 | p = automl.predict(X) 115 | pred = automl.predict(X) 116 | 117 | u = np.unique(pred) 118 | self.assertTrue("a" in u or "B" in u) 119 | self.assertTrue(len(u) <= 2) 120 | 121 | def test_multi_class_0123_floats(self): 122 | X = np.random.rand(self.rows * 4, 3) 123 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 124 | y = np.random.randint(0, 4, self.rows * 4) 125 | y = y.astype(float) 126 | 127 | automl = AutoML( 128 | results_path=self.automl_dir, 129 | total_time_limit=1, 130 | algorithms=["Xgboost"], 131 | train_ensemble=False, 132 | explain_level=0, 133 | start_random_models=1, 134 | ) 135 | automl.fit(X, y) 136 | pred = automl.predict(X) 137 | 138 | u = np.unique(pred) 139 | 140 | self.assertTrue(0.0 in u or 1.0 in u or 2.0 in u or 3.0 in u) 141 | self.assertTrue(len(u) <= 4) 142 | 143 | def test_multi_class_0123(self): 144 | X = np.random.rand(self.rows * 4, 3) 145 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 146 | y = np.random.randint(0, 4, self.rows * 4) 147 | 148 | automl = AutoML( 149 | results_path=self.automl_dir, 150 | total_time_limit=1, 151 | algorithms=["Xgboost"], 152 | train_ensemble=False, 153 | explain_level=0, 154 | start_random_models=1, 155 | ) 156 | automl.fit(X, y) 157 | pred = automl.predict(X) 158 | 159 | u = np.unique(pred) 160 | 161 | self.assertTrue(0 in u or 1 in u or 2 in u or 3 in u) 162 | self.assertTrue(len(u) <= 4) 163 | 164 | def test_multi_class_0123_strings(self): 165 | X = np.random.rand(self.rows * 4, 3) 166 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 167 | y = np.random.randint(0, 4, self.rows * 4) 168 | y = y.astype(str) 169 | 170 | automl = AutoML( 171 | results_path=self.automl_dir, 172 | total_time_limit=1, 173 | algorithms=["Xgboost"], 174 | train_ensemble=False, 175 | explain_level=0, 176 | start_random_models=1, 177 | ) 178 | automl.fit(X, y) 179 | pred = automl.predict(X) 180 | 181 | u = np.unique(pred) 182 | 183 | self.assertTrue("0" in u or "1" in u or "2" in u or "3" in u) 184 | self.assertTrue(len(u) <= 4) 185 | 186 | def test_multi_class_abcd(self): 187 | X = np.random.rand(self.rows * 4, 3) 188 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 189 | y = pd.Series( 190 | np.random.permutation(["a", "B", "CC", "d"] * self.rows), name="target" 191 | ) 192 | 193 | automl = AutoML( 194 | results_path=self.automl_dir, 195 | total_time_limit=1, 196 | algorithms=["Xgboost"], 197 | train_ensemble=False, 198 | explain_level=0, 199 | start_random_models=1, 200 | ) 201 | automl.fit(X, y) 202 | pred = automl.predict(X) 203 | 204 | u = np.unique(pred) 205 | 206 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 207 | self.assertTrue(len(u) <= 4) 208 | 209 | def test_multi_class_abcd_np_array(self): 210 | X = np.random.rand(self.rows * 4, 3) 211 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 212 | y = np.random.permutation([None, "B", "CC", "d"] * self.rows) 213 | 214 | automl = AutoML( 215 | 
results_path=self.automl_dir, 216 | total_time_limit=1, 217 | algorithms=["Xgboost"], 218 | train_ensemble=False, 219 | explain_level=0, 220 | start_random_models=1, 221 | ) 222 | automl.fit(X, y) 223 | pred = automl.predict(X) 224 | 225 | u = np.unique(pred) 226 | 227 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 228 | self.assertTrue(len(u) <= 4) 229 | 230 | def test_multi_class_abcd_mixed_int(self): 231 | X = np.random.rand(self.rows * 4, 3) 232 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 233 | y = pd.Series( 234 | np.random.permutation([1, "B", "CC", "d"] * self.rows), name="target" 235 | ) 236 | 237 | automl = AutoML( 238 | results_path=self.automl_dir, 239 | total_time_limit=1, 240 | algorithms=["Xgboost"], 241 | train_ensemble=False, 242 | explain_level=0, 243 | start_random_models=1, 244 | ) 245 | automl.fit(X, y) 246 | pred = automl.predict(X) 247 | u = np.unique(pred) 248 | 249 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 250 | self.assertTrue(len(u) <= 4) 251 | 252 | def test_multi_class_abcd_missing_target(self): 253 | X = np.random.rand(self.rows * 4, 3) 254 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 255 | y = pd.Series( 256 | np.random.permutation(["a", "B", "CC", "d"] * self.rows), name="target" 257 | ) 258 | 259 | y.iloc[0] = None 260 | y.iloc[1] = None 261 | automl = AutoML( 262 | results_path=self.automl_dir, 263 | total_time_limit=1, 264 | algorithms=["Xgboost"], 265 | train_ensemble=False, 266 | explain_level=0, 267 | start_random_models=1, 268 | ) 269 | 270 | with pytest.warns( 271 | expected_warning=UserWarning, 272 | match="There are samples with missing target values in the data which will be excluded for further analysis", 273 | ) as record: 274 | automl.fit(X, y) 275 | 276 | # check that only one warning was raised 277 | self.assertEqual(len(record), 1) 278 | 279 | pred = automl.predict(X) 280 | 281 | u = np.unique(pred) 282 | 283 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 284 | self.assertTrue(len(u) <= 4) 285 | 286 | def test_regression(self): 287 | X = np.random.rand(self.rows, 3) 288 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 289 | y = np.random.rand(self.rows) 290 | 291 | automl = AutoML( 292 | results_path=self.automl_dir, 293 | total_time_limit=1, 294 | algorithms=["Xgboost"], 295 | train_ensemble=False, 296 | explain_level=0, 297 | start_random_models=1, 298 | ) 299 | automl.fit(X, y) 300 | pred = automl.predict(X) 301 | 302 | self.assertIsInstance(pred, np.ndarray) 303 | self.assertEqual(len(pred), X.shape[0]) 304 | 305 | def test_regression_missing_target(self): 306 | X = np.random.rand(self.rows, 3) 307 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 308 | y = pd.Series(np.random.rand(self.rows), name="target") 309 | 310 | y.iloc[1] = None 311 | 312 | automl = AutoML( 313 | results_path=self.automl_dir, 314 | total_time_limit=1, 315 | algorithms=["Xgboost"], 316 | train_ensemble=False, 317 | explain_level=0, 318 | start_random_models=1, 319 | ) 320 | 321 | with pytest.warns( 322 | match="There are samples with missing target values in the data which will be excluded for further analysis" 323 | ) as record: 324 | automl.fit(X, y) 325 | 326 | self.assertEqual(len(record), 1) 327 | 328 | pred = automl.predict(X) 329 | 330 | self.assertIsInstance(pred, np.ndarray) 331 | self.assertEqual(len(pred), X.shape[0]) 332 | 333 | def test_predict_on_empty_dataframe(self): 334 | X = np.random.rand(self.rows, 3) 335 | X = 
pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 336 | y = pd.Series(np.random.rand(self.rows), name="target") 337 | 338 | automl = AutoML( 339 | results_path=self.automl_dir, 340 | total_time_limit=1, 341 | algorithms=["Xgboost"], 342 | train_ensemble=False, 343 | explain_level=0, 344 | start_random_models=1, 345 | ) 346 | automl.fit(X, y) 347 | 348 | with self.assertRaises(AutoMLException) as context: 349 | pred = automl.predict(pd.DataFrame()) 350 | 351 | with self.assertRaises(AutoMLException) as context: 352 | pred = automl.predict(np.empty(shape=(0, 3))) 353 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/goldenfeatures_transformer.py: -------------------------------------------------------------------------------- ```python 1 | import itertools 2 | import json 3 | import os 4 | import time 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from joblib import Parallel, delayed 9 | from sklearn.metrics import log_loss, mean_squared_error 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 12 | 13 | from supervised.algorithms.registry import ( 14 | BINARY_CLASSIFICATION, 15 | MULTICLASS_CLASSIFICATION, 16 | REGRESSION, 17 | ) 18 | from supervised.exceptions import AutoMLException 19 | from supervised.utils.jsonencoder import MLJSONEncoder 20 | 21 | 22 | def get_binary_score(X_train, y_train, X_test, y_test): 23 | clf = DecisionTreeClassifier(max_depth=3) 24 | clf.fit(X_train, y_train) 25 | pred = clf.predict_proba(X_test)[:, 1] 26 | ll = log_loss(y_test, pred) 27 | return ll 28 | 29 | 30 | def get_regression_score(X_train, y_train, X_test, y_test): 31 | clf = DecisionTreeRegressor(max_depth=3) 32 | clf.fit(X_train, y_train) 33 | pred = clf.predict(X_test) 34 | ll = mean_squared_error(y_test, pred) 35 | return ll 36 | 37 | 38 | def get_multiclass_score(X_train, y_train, X_test, y_test): 39 | clf = DecisionTreeClassifier(max_depth=3) 40 | clf.fit(X_train, y_train) 41 | pred = clf.predict_proba(X_test) 42 | ll = log_loss(y_test, pred) 43 | return ll 44 | 45 | 46 | def get_score(item): 47 | col1 = item[0] 48 | col2 = item[1] 49 | X_train = item[2] 50 | y_train = item[3] 51 | X_test = item[4] 52 | y_test = item[5] 53 | scorer = item[6] 54 | 55 | try: 56 | x_train = np.array(X_train[col1] - X_train[col2]).reshape(-1, 1) 57 | x_test = np.array(X_test[col1] - X_test[col2]).reshape(-1, 1) 58 | diff_score = scorer(x_train, y_train, x_test, y_test) 59 | except Exception as e: 60 | diff_score = None 61 | print(str(e)) 62 | 63 | try: 64 | a, b = ( 65 | np.array(X_train[col1], dtype=float), 66 | np.array(X_train[col2], dtype=float), 67 | ) 68 | x_train = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 69 | a, b = np.array(X_test[col1], dtype=float), np.array(X_test[col2], dtype=float) 70 | x_test = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 71 | ratio_1_score = scorer(x_train, y_train, x_test, y_test) 72 | except Exception as e: 73 | print(str(e)) 74 | ratio_1_score = None 75 | 76 | try: 77 | b, a = ( 78 | np.array(X_train[col1], dtype=float), 79 | np.array(X_train[col2], dtype=float), 80 | ) 81 | x_train = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 82 | b, a = np.array(X_test[col1], dtype=float), np.array(X_test[col2], dtype=float) 83 | x_test = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 84 | ratio_2_score = scorer(x_train, y_train, x_test, y_test) 85 | 
except Exception as e: 86 | print(str(e)) 87 | ratio_2_score = None 88 | 89 | try: 90 | x_train = np.array(X_train[col1] + X_train[col2]).reshape(-1, 1) 91 | x_test = np.array(X_test[col1] + X_test[col2]).reshape(-1, 1) 92 | sum_score = scorer(x_train, y_train, x_test, y_test) 93 | except Exception as e: 94 | sum_score = None 95 | print(str(e)) 96 | 97 | try: 98 | x_train = np.array(X_train[col1] * X_train[col2]).reshape(-1, 1) 99 | x_test = np.array(X_test[col1] * X_test[col2]).reshape(-1, 1) 100 | multiply_score = scorer(x_train, y_train, x_test, y_test) 101 | except Exception as e: 102 | multiply_score = None 103 | print(str(e)) 104 | 105 | return (diff_score, ratio_1_score, ratio_2_score, sum_score, multiply_score) 106 | 107 | 108 | class GoldenFeaturesTransformer(object): 109 | def __init__(self, results_path=None, ml_task=None, features_count=None, n_jobs=-1): 110 | self._new_features = [] 111 | self._new_columns = [] 112 | self._ml_task = ml_task 113 | self._features_count = features_count 114 | self._n_jobs = n_jobs 115 | self._scorer = None 116 | if self._ml_task == BINARY_CLASSIFICATION: 117 | self._scorer = get_binary_score 118 | elif self._ml_task == MULTICLASS_CLASSIFICATION: 119 | self._scorer = get_multiclass_score 120 | else: 121 | self._scorer = get_regression_score 122 | 123 | self._error = None 124 | 125 | self._result_file = "golden_features.json" 126 | if results_path is not None: 127 | self._result_path = os.path.join(results_path, self._result_file) 128 | 129 | if os.path.exists(self._result_path): 130 | with open(self._result_path, "r") as file: 131 | self.from_json(json.load(file), results_path) 132 | 133 | def fit(self, X, y): 134 | if self._new_features: 135 | return 136 | if self._error is not None and self._error: 137 | raise AutoMLException( 138 | "Golden Features not created due to error (please check errors.md). " 139 | + self._error 140 | ) 141 | return 142 | if X.shape[1] == 0: 143 | self._error = f"Golden Features not created. No continous features. Input data shape: {X.shape}, {y.shape}" 144 | self.save() 145 | raise AutoMLException("Golden Features not created. No continous features.") 146 | 147 | start_time = time.time() 148 | combinations = itertools.combinations(X.columns, r=2) 149 | items = [i for i in combinations] 150 | if len(items) > 250000: 151 | si = np.random.choice(len(items), 250000, replace=False) 152 | items = [items[i] for i in si] 153 | 154 | X_train, X_test, y_train, y_test = self._subsample(X, y) 155 | 156 | for i in range(len(items)): 157 | items[i] += (X_train, y_train, X_test, y_test, self._scorer) 158 | 159 | scores = [] 160 | # parallel version 161 | scores = Parallel(n_jobs=self._n_jobs, backend="loky")( 162 | delayed(get_score)(i) for i in items 163 | ) 164 | 165 | # single process version 166 | # for item in items: 167 | # scores += [get_score(item)] 168 | 169 | if not scores: 170 | self._error = f"Golden Features not created. Empty scores. Input data shape: {X.shape}, {y.shape}" 171 | self.save() 172 | raise AutoMLException("Golden Features not created. 
Empty scores.") 173 | 174 | result = [] 175 | for i in range(len(items)): 176 | if scores[i][0] is not None: 177 | result += [(items[i][0], items[i][1], "diff", scores[i][0])] 178 | if scores[i][1] is not None: 179 | result += [(items[i][0], items[i][1], "ratio", scores[i][1])] 180 | if scores[i][2] is not None: 181 | result += [(items[i][1], items[i][0], "ratio", scores[i][2])] 182 | if scores[i][3] is not None: 183 | result += [(items[i][1], items[i][0], "sum", scores[i][3])] 184 | if scores[i][4] is not None: 185 | result += [(items[i][1], items[i][0], "multiply", scores[i][4])] 186 | 187 | df = pd.DataFrame( 188 | result, columns=["feature1", "feature2", "operation", "score"] 189 | ) 190 | df.sort_values(by="score", inplace=True) 191 | 192 | new_cols_cnt = np.min([100, np.max([10, int(0.1 * X.shape[1])])]) 193 | 194 | if ( 195 | self._features_count is not None 196 | and self._features_count > 0 197 | and self._features_count < df.shape[0] 198 | ): 199 | new_cols_cnt = self._features_count 200 | 201 | print(self._features_count, new_cols_cnt) 202 | self._new_features = json.loads(df.head(new_cols_cnt).to_json(orient="records")) 203 | 204 | for new_feature in self._new_features: 205 | new_col = "_".join( 206 | [ 207 | new_feature["feature1"], 208 | new_feature["operation"], 209 | new_feature["feature2"], 210 | ] 211 | ) 212 | self._new_columns += [new_col] 213 | print(f"Add Golden Feature: {new_col}") 214 | 215 | self.save() 216 | 217 | print( 218 | f"Created {len(self._new_features)} Golden Features in {np.round(time.time() - start_time,2)} seconds." 219 | ) 220 | 221 | def transform(self, X): 222 | for new_feature in self._new_features: 223 | new_col = "_".join( 224 | [ 225 | new_feature["feature1"], 226 | new_feature["operation"], 227 | new_feature["feature2"], 228 | ] 229 | ) 230 | if new_feature["operation"] == "diff": 231 | X[new_col] = X[new_feature["feature1"]] - X[new_feature["feature2"]] 232 | elif new_feature["operation"] == "ratio": 233 | a, b = ( 234 | np.array(X[new_feature["feature1"]], dtype=float), 235 | np.array(X[new_feature["feature2"]], dtype=float), 236 | ) 237 | X[new_col] = np.divide( 238 | a, b, out=np.zeros_like(a), where=b != 0 239 | ).reshape(-1, 1) 240 | elif new_feature["operation"] == "sum": 241 | X[new_col] = X[new_feature["feature1"]] + X[new_feature["feature2"]] 242 | elif new_feature["operation"] == "multiply": 243 | X[new_col] = X[new_feature["feature1"]] * X[new_feature["feature2"]] 244 | 245 | return X 246 | 247 | def to_json(self): 248 | data_json = { 249 | "new_features": self._new_features, 250 | "new_columns": self._new_columns, 251 | "ml_task": self._ml_task, 252 | } 253 | if self._error is not None and self._error: 254 | data_json["error"] = self._error 255 | return data_json 256 | 257 | def from_json(self, data_json, results_path): 258 | self._new_features = data_json.get("new_features", []) 259 | self._new_columns = data_json.get("new_columns", []) 260 | self._ml_task = data_json.get("ml_task") 261 | self._error = data_json.get("error") 262 | self._result_path = os.path.join(results_path, self._result_file) 263 | 264 | def save(self): 265 | with open(self._result_path, "w") as fout: 266 | fout.write(json.dumps(self.to_json(), indent=4, cls=MLJSONEncoder)) 267 | 268 | def _subsample(self, X, y): 269 | MAX_SIZE = 10000 270 | TRAIN_SIZE = 2500 271 | 272 | shuffle = True 273 | stratify = None 274 | 275 | if X.shape[0] > MAX_SIZE: 276 | if self._ml_task != REGRESSION: 277 | stratify = y 278 | X_train, _, y_train, _ = train_test_split( 279 | X, 
280 | y, 281 | train_size=MAX_SIZE, 282 | shuffle=shuffle, 283 | stratify=stratify, 284 | random_state=1, 285 | ) 286 | if self._ml_task != REGRESSION: 287 | stratify = y_train 288 | 289 | X_train, X_test, y_train, y_test = train_test_split( 290 | X_train, 291 | y_train, 292 | train_size=TRAIN_SIZE, 293 | shuffle=shuffle, 294 | stratify=stratify, 295 | random_state=1, 296 | ) 297 | else: 298 | if self._ml_task != REGRESSION: 299 | stratify = y 300 | train_size = X.shape[0] // 4 301 | X_train, X_test, y_train, y_test = train_test_split( 302 | X, 303 | y, 304 | train_size=train_size, 305 | shuffle=shuffle, 306 | stratify=stratify, 307 | random_state=1, 308 | ) 309 | 310 | return X_train, X_test, y_train, y_test 311 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/tuner.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import warnings 4 | 5 | import joblib 6 | import matplotlib 7 | import optuna 8 | from matplotlib import pyplot as plt 9 | 10 | from supervised.exceptions import AutoMLException 11 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 12 | from supervised.tuner.optuna.catboost import CatBoostObjective 13 | from supervised.tuner.optuna.extra_trees import ExtraTreesObjective 14 | from supervised.tuner.optuna.knn import KNNObjective 15 | from supervised.tuner.optuna.lightgbm import LightgbmObjective 16 | from supervised.tuner.optuna.nn import NeuralNetworkObjective 17 | from supervised.tuner.optuna.random_forest import RandomForestObjective 18 | from supervised.tuner.optuna.xgboost import XgboostObjective 19 | from supervised.utils.jsonencoder import MLJSONEncoder 20 | from supervised.utils.metric import Metric 21 | 22 | 23 | class OptunaTuner: 24 | def __init__( 25 | self, 26 | results_path, 27 | ml_task, 28 | eval_metric, 29 | time_budget=3600, 30 | init_params={}, 31 | verbose=True, 32 | n_jobs=-1, 33 | random_state=42, 34 | ): 35 | if eval_metric.name not in [ 36 | "auc", 37 | "logloss", 38 | "rmse", 39 | "mse", 40 | "mae", 41 | "mape", 42 | "r2", 43 | "spearman", 44 | "pearson", 45 | "f1", 46 | "average_precision", 47 | "accuracy", 48 | "user_defined_metric", 49 | ]: 50 | raise AutoMLException(f"Metric {eval_metric.name} is not supported") 51 | 52 | self.study_dir = os.path.join(results_path, "optuna") 53 | if not os.path.exists(self.study_dir): 54 | try: 55 | os.mkdir(self.study_dir) 56 | except Exception as e: 57 | print("Problem while creating directory for optuna studies.", str(e)) 58 | self.tuning_fname = os.path.join(self.study_dir, "optuna.json") 59 | self.tuning = init_params 60 | self.eval_metric = eval_metric 61 | 62 | self.direction = ( 63 | "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize" 64 | ) 65 | self.n_warmup_steps = ( 66 | 500 # set large enough to give small learning rates a chance 67 | ) 68 | self.time_budget = time_budget 69 | self.verbose = verbose 70 | self.ml_task = ml_task 71 | self.n_jobs = n_jobs 72 | self.random_state = random_state 73 | self.cat_features_indices = [] 74 | self.load() 75 | if not self.verbose: 76 | optuna.logging.set_verbosity(optuna.logging.CRITICAL) 77 | 78 | @staticmethod 79 | def is_optimizable(algorithm_name): 80 | return algorithm_name in [ 81 | "Extra Trees", 82 | "Random Forest", 83 | "CatBoost", 84 | "Xgboost", 85 | "LightGBM", 86 | "Nearest Neighbors", 87 | "Neural Network", 88 | ] 89 | 90 | def optimize( 91 | self, 92 | algorithm, 93 
| data_type, 94 | X_train, 95 | y_train, 96 | sample_weight, 97 | X_validation, 98 | y_validation, 99 | sample_weight_validation, 100 | learner_params, 101 | ): 102 | # only tune models with original data type 103 | if data_type != "original": 104 | return learner_params 105 | 106 | key = f"{data_type}_{algorithm}" 107 | if key in self.tuning: 108 | return self.update_learner_params(learner_params, self.tuning[key]) 109 | 110 | if self.verbose: 111 | print( 112 | f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds " 113 | f"eval_metric {self.eval_metric.name} ({self.direction})" 114 | ) 115 | 116 | self.cat_features_indices = [] 117 | for i in range(X_train.shape[1]): 118 | if PreprocessingUtils.is_categorical(X_train.iloc[:, i]): 119 | self.cat_features_indices += [i] 120 | 121 | study = optuna.create_study( 122 | direction=self.direction, 123 | sampler=optuna.samplers.TPESampler(seed=self.random_state), 124 | pruner=optuna.pruners.MedianPruner(n_warmup_steps=self.n_warmup_steps), 125 | ) 126 | obejctive = None 127 | if algorithm == "LightGBM": 128 | objective = LightgbmObjective( 129 | self.ml_task, 130 | X_train, 131 | y_train, 132 | sample_weight, 133 | X_validation, 134 | y_validation, 135 | sample_weight_validation, 136 | self.eval_metric, 137 | self.cat_features_indices, 138 | self.n_jobs, 139 | self.random_state, 140 | ) 141 | elif algorithm == "Xgboost": 142 | objective = XgboostObjective( 143 | self.ml_task, 144 | X_train, 145 | y_train, 146 | sample_weight, 147 | X_validation, 148 | y_validation, 149 | sample_weight_validation, 150 | self.eval_metric, 151 | self.n_jobs, 152 | self.random_state, 153 | ) 154 | elif algorithm == "CatBoost": 155 | objective = CatBoostObjective( 156 | self.ml_task, 157 | X_train, 158 | y_train, 159 | sample_weight, 160 | X_validation, 161 | y_validation, 162 | sample_weight_validation, 163 | self.eval_metric, 164 | self.cat_features_indices, 165 | self.n_jobs, 166 | self.random_state, 167 | ) 168 | elif algorithm == "Random Forest": 169 | objective = RandomForestObjective( 170 | self.ml_task, 171 | X_train, 172 | y_train, 173 | sample_weight, 174 | X_validation, 175 | y_validation, 176 | sample_weight_validation, 177 | self.eval_metric, 178 | self.n_jobs, 179 | self.random_state, 180 | ) 181 | elif algorithm == "Extra Trees": 182 | objective = ExtraTreesObjective( 183 | self.ml_task, 184 | X_train, 185 | y_train, 186 | sample_weight, 187 | X_validation, 188 | y_validation, 189 | sample_weight_validation, 190 | self.eval_metric, 191 | self.n_jobs, 192 | self.random_state, 193 | ) 194 | elif algorithm == "Nearest Neighbors": 195 | objective = KNNObjective( 196 | self.ml_task, 197 | X_train, 198 | y_train, 199 | sample_weight, 200 | X_validation, 201 | y_validation, 202 | sample_weight_validation, 203 | self.eval_metric, 204 | self.n_jobs, 205 | self.random_state, 206 | ) 207 | elif algorithm == "Neural Network": 208 | objective = NeuralNetworkObjective( 209 | self.ml_task, 210 | X_train, 211 | y_train, 212 | sample_weight, 213 | X_validation, 214 | y_validation, 215 | sample_weight_validation, 216 | self.eval_metric, 217 | self.n_jobs, 218 | self.random_state, 219 | ) 220 | 221 | study.optimize( 222 | objective, n_trials=5000, timeout=self.time_budget, gc_after_trial=True 223 | ) 224 | 225 | self.plot_study(algorithm, data_type, study) 226 | 227 | joblib.dump(study, os.path.join(self.study_dir, key + ".joblib")) 228 | 229 | best = study.best_params 230 | 231 | if algorithm == "LightGBM": 232 | best["metric"] = 
objective.eval_metric_name 233 | best["custom_eval_metric_name"] = objective.custom_eval_metric_name 234 | best["num_boost_round"] = objective.rounds 235 | best["early_stopping_rounds"] = objective.early_stopping_rounds 236 | # best["learning_rate"] = objective.learning_rate 237 | best["cat_feature"] = self.cat_features_indices 238 | best["feature_pre_filter"] = False 239 | best["seed"] = objective.seed 240 | elif algorithm == "CatBoost": 241 | best["eval_metric"] = objective.eval_metric_name 242 | best["num_boost_round"] = objective.rounds 243 | best["early_stopping_rounds"] = objective.early_stopping_rounds 244 | # best["bootstrap_type"] = "Bernoulli" 245 | # best["learning_rate"] = objective.learning_rate 246 | best["seed"] = objective.seed 247 | elif algorithm == "Xgboost": 248 | best["objective"] = objective.objective 249 | best["eval_metric"] = objective.eval_metric_name 250 | # best["eta"] = objective.learning_rate 251 | best["max_rounds"] = objective.rounds 252 | best["early_stopping_rounds"] = objective.early_stopping_rounds 253 | best["seed"] = objective.seed 254 | elif algorithm == "Extra Trees": 255 | # Extra Trees are not using early stopping 256 | best["max_steps"] = objective.max_steps # each step has 100 trees 257 | best["seed"] = objective.seed 258 | best["eval_metric_name"] = self.eval_metric.name 259 | elif algorithm == "Random Forest": 260 | # Random Forest is not using early stopping 261 | best["max_steps"] = objective.max_steps # each step has 100 trees 262 | best["seed"] = objective.seed 263 | best["eval_metric_name"] = self.eval_metric.name 264 | elif algorithm == "Nearest Neighbors": 265 | best["rows_limit"] = 100000 266 | elif algorithm == "Neural Network": 267 | pass 268 | 269 | self.tuning[key] = best 270 | self.save() 271 | 272 | return self.update_learner_params(learner_params, best) 273 | 274 | def update_learner_params(self, learner_params, best): 275 | for k, v in best.items(): 276 | learner_params[k] = v 277 | return learner_params 278 | 279 | def save(self): 280 | with open(self.tuning_fname, "w") as fout: 281 | fout.write(json.dumps(self.tuning, indent=4, cls=MLJSONEncoder)) 282 | 283 | def load(self): 284 | if os.path.exists(self.tuning_fname): 285 | params = json.loads(open(self.tuning_fname).read()) 286 | for k, v in params.items(): 287 | self.tuning[k] = v 288 | 289 | def plot_study(self, algorithm, data_type, study): 290 | key = f"{data_type}_{algorithm}" 291 | 292 | plots = [ 293 | ( 294 | optuna.visualization.matplotlib.plot_optimization_history, 295 | "optimization_history", 296 | ), 297 | ( 298 | optuna.visualization.matplotlib.plot_parallel_coordinate, 299 | "parallel_coordinate", 300 | ), 301 | ( 302 | optuna.visualization.matplotlib.plot_param_importances, 303 | "param_importances", 304 | ), 305 | # (optuna.visualization.matplotlib.plot_slice, "slice"), 306 | ] 307 | 308 | matplotlib_default_figsize = matplotlib.rcParams["figure.figsize"] 309 | matplotlib.rcParams["figure.figsize"] = (11, 7) 310 | 311 | md = f"# Optuna tuning for {algorithm} on {data_type} data\n\n" 312 | for plot, title in plots: 313 | try: 314 | with warnings.catch_warnings(): 315 | warnings.simplefilter("ignore") 316 | plt.figure() 317 | plt.rcParams["axes.grid"] = title != "parallel_coordinate" 318 | plot(study) 319 | plt.tight_layout(pad=2.0) 320 | fname = f"{key}_{title}.png" 321 | plt.savefig(os.path.join(self.study_dir, fname)) 322 | plt.close("all") 323 | 324 | md += f'## {algorithm} {title.replace("_", " ").title()}\n\n' 325 | md += f"\n\n" 326 | 327 | except 
Exception as e: 328 | print(str(e)) 329 | 330 | matplotlib.rcParams["figure.figsize"] = matplotlib_default_figsize 331 | plt.style.use("default") 332 | 333 | with open(os.path.join(self.study_dir, "README.md"), "a") as fout: 334 | fout.write(md) 335 | fout.write("\n\n[<< Go back](../README.md)\n") 336 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/lightgbm.py: -------------------------------------------------------------------------------- ```python 1 | import contextlib 2 | import copy 3 | import logging 4 | import os 5 | 6 | import lightgbm as lgb 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.base import ClassifierMixin, RegressorMixin 10 | 11 | from supervised.algorithms.algorithm import BaseAlgorithm 12 | from supervised.algorithms.registry import ( 13 | BINARY_CLASSIFICATION, 14 | MULTICLASS_CLASSIFICATION, 15 | REGRESSION, 16 | AlgorithmsRegistry, 17 | ) 18 | from supervised.utils.config import LOG_LEVEL 19 | from supervised.utils.metric import ( 20 | lightgbm_eval_metric_accuracy, 21 | lightgbm_eval_metric_average_precision, 22 | lightgbm_eval_metric_f1, 23 | lightgbm_eval_metric_pearson, 24 | lightgbm_eval_metric_r2, 25 | lightgbm_eval_metric_spearman, 26 | lightgbm_eval_metric_user_defined, 27 | ) 28 | 29 | logger = logging.getLogger(__name__) 30 | logger.setLevel(LOG_LEVEL) 31 | 32 | 33 | def lightgbm_objective(ml_task, automl_eval_metric): 34 | objective = "regression" 35 | if ml_task == BINARY_CLASSIFICATION: 36 | objective = "binary" 37 | elif ml_task == MULTICLASS_CLASSIFICATION: 38 | objective = "multiclass" 39 | else: # ml_task == REGRESSION 40 | objective = "regression" 41 | return objective 42 | 43 | 44 | def lightgbm_eval_metric(ml_task, automl_eval_metric): 45 | if automl_eval_metric == "user_defined_metric": 46 | return "custom", automl_eval_metric 47 | metric_name_mapping = { 48 | BINARY_CLASSIFICATION: { 49 | "auc": "auc", 50 | "logloss": "binary_logloss", 51 | "f1": "custom", 52 | "average_precision": "custom", 53 | "accuracy": "custom", 54 | }, 55 | MULTICLASS_CLASSIFICATION: { 56 | "logloss": "multi_logloss", 57 | "f1": "custom", 58 | "accuracy": "custom", 59 | }, 60 | REGRESSION: { 61 | "rmse": "rmse", 62 | "mse": "l2", 63 | "mae": "l1", 64 | "mape": "mape", 65 | "r2": "custom", 66 | "spearman": "custom", 67 | "pearson": "custom", 68 | }, 69 | } 70 | 71 | metric = metric_name_mapping[ml_task][automl_eval_metric] 72 | custom_eval_metric = None 73 | 74 | if automl_eval_metric in [ 75 | "r2", 76 | "spearman", 77 | "pearson", 78 | "f1", 79 | "average_precision", 80 | "accuracy", 81 | ]: 82 | custom_eval_metric = automl_eval_metric 83 | 84 | return metric, custom_eval_metric 85 | 86 | 87 | class LightgbmAlgorithm(BaseAlgorithm): 88 | algorithm_name = "LightGBM" 89 | algorithm_short_name = "LightGBM" 90 | 91 | def __init__(self, params): 92 | super(LightgbmAlgorithm, self).__init__(params) 93 | self.library_version = lgb.__version__ 94 | 95 | self.explain_level = params.get("explain_level", 0) 96 | self.rounds = additional.get("max_rounds", 10000) 97 | self.max_iters = 1 98 | self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) 99 | 100 | n_jobs = self.params.get("n_jobs", 0) 101 | # 0 is the default for LightGBM to use all cores 102 | if n_jobs == -1: 103 | n_jobs = 0 104 | 105 | self.learner_params = { 106 | "boosting_type": "gbdt", 107 | "objective": self.params.get("objective", "binary"), 108 | "metric": self.params.get("metric", "binary_logloss"), 109 | 
"num_leaves": self.params.get("num_leaves", 31), 110 | "learning_rate": self.params.get("learning_rate", 0.1), 111 | "feature_fraction": self.params.get("feature_fraction", 1.0), 112 | "bagging_fraction": self.params.get("bagging_fraction", 1.0), 113 | "min_data_in_leaf": self.params.get("min_data_in_leaf", 20), 114 | "num_threads": n_jobs, 115 | "verbose": -1, 116 | "seed": self.params.get("seed", 1), 117 | "extra_trees": self.params.get("extra_trees", False), 118 | } 119 | 120 | for extra_param in [ 121 | "lambda_l1", 122 | "lambda_l2", 123 | "bagging_freq", 124 | "feature_pre_filter", 125 | "cat_feature", 126 | "cat_l2", 127 | "cat_smooth", 128 | "max_bin", 129 | ]: 130 | if extra_param in self.params: 131 | self.learner_params[extra_param] = self.params[extra_param] 132 | 133 | if "num_boost_round" in self.params: 134 | self.rounds = self.params["num_boost_round"] 135 | if "early_stopping_rounds" in self.params: 136 | self.early_stopping_rounds = self.params["early_stopping_rounds"] 137 | 138 | if "num_class" in self.params: # multiclass classification 139 | self.learner_params["num_class"] = self.params.get("num_class") 140 | 141 | self.custom_eval_metric = None 142 | if self.params.get("custom_eval_metric_name") is not None: 143 | if self.params["custom_eval_metric_name"] == "r2": 144 | self.custom_eval_metric = lightgbm_eval_metric_r2 145 | elif self.params["custom_eval_metric_name"] == "spearman": 146 | self.custom_eval_metric = lightgbm_eval_metric_spearman 147 | elif self.params["custom_eval_metric_name"] == "pearson": 148 | self.custom_eval_metric = lightgbm_eval_metric_pearson 149 | elif self.params["custom_eval_metric_name"] == "f1": 150 | self.custom_eval_metric = lightgbm_eval_metric_f1 151 | elif self.params["custom_eval_metric_name"] == "average_precision": 152 | self.custom_eval_metric = lightgbm_eval_metric_average_precision 153 | elif self.params["custom_eval_metric_name"] == "accuracy": 154 | self.custom_eval_metric = lightgbm_eval_metric_accuracy 155 | elif self.params["custom_eval_metric_name"] == "user_defined_metric": 156 | self.custom_eval_metric = lightgbm_eval_metric_user_defined 157 | 158 | logger.debug("LightgbmLearner __init__") 159 | 160 | def file_extension(self): 161 | return "lightgbm" 162 | 163 | def update(self, update_params): 164 | pass 165 | 166 | """ 167 | def get_boosting_rounds(self, lgb_train, valid_sets, esr, max_time): 168 | if max_time is None: 169 | max_time = 3600.0 170 | start_time = time.time() 171 | evals_result = {} 172 | model = lgb.train( 173 | self.learner_params, 174 | lgb_train, 175 | num_boost_round=2, 176 | valid_sets=valid_sets, 177 | early_stopping_rounds=esr, 178 | evals_result=evals_result, 179 | verbose_eval=False, 180 | ) 181 | time_1_iter = (time.time() - start_time) / 2.0 182 | 183 | # 2.0 is just a scaling factor 184 | # purely heuristic 185 | iters = int(max_time / time_1_iter * 2.0) 186 | iters = max(iters, 100) 187 | iters = min(iters, 10000) 188 | return iters 189 | """ 190 | 191 | def fit( 192 | self, 193 | X, 194 | y, 195 | sample_weight=None, 196 | X_validation=None, 197 | y_validation=None, 198 | sample_weight_validation=None, 199 | log_to_file=None, 200 | max_time=None, 201 | ): 202 | lgb_train = lgb.Dataset( 203 | X.values if isinstance(X, pd.DataFrame) else X, 204 | y, 205 | weight=sample_weight, 206 | ) 207 | valid_sets = None 208 | if self.early_stopping_rounds == 0: 209 | self.model = lgb.train( 210 | self.learner_params, 211 | lgb_train, 212 | num_boost_round=self.rounds, 213 | init_model=self.model, 214 | 
) 215 | else: 216 | valid_names = None 217 | esr = None 218 | if X_validation is not None and y_validation is not None: 219 | valid_sets = [ 220 | lgb_train, 221 | lgb.Dataset( 222 | X_validation.values 223 | if isinstance(X_validation, pd.DataFrame) 224 | else X_validation, 225 | y_validation, 226 | weight=sample_weight_validation, 227 | ), 228 | ] 229 | valid_names = ["train", "validation"] 230 | esr = self.early_stopping_rounds 231 | evals_result = {} 232 | 233 | # disable for now ... 234 | # boosting_rounds = self.get_boosting_rounds(lgb_train, valid_sets, esr, max_time) 235 | 236 | self.model = lgb.train( 237 | self.learner_params, 238 | lgb_train, 239 | num_boost_round=self.rounds, 240 | valid_sets=valid_sets, 241 | valid_names=valid_names, 242 | feval=self.custom_eval_metric, 243 | callbacks=[ 244 | lgb.early_stopping(esr, verbose=False), 245 | lgb.record_evaluation(evals_result), 246 | ], 247 | ) 248 | 249 | del lgb_train 250 | if valid_sets is not None: 251 | del valid_sets[0] 252 | del valid_sets 253 | 254 | if log_to_file is not None: 255 | metric_name = list(evals_result["train"].keys())[0] 256 | result = pd.DataFrame( 257 | { 258 | "iteration": range(len(evals_result["train"][metric_name])), 259 | "train": evals_result["train"][metric_name], 260 | "validation": evals_result["validation"][metric_name], 261 | } 262 | ) 263 | result.to_csv(log_to_file, index=False, header=False) 264 | 265 | if self.params["ml_task"] != REGRESSION: 266 | self.classes_ = np.unique(y) 267 | 268 | def is_fitted(self): 269 | return self.model is not None 270 | 271 | def predict(self, X): 272 | self.reload() 273 | return self.model.predict(X.values if isinstance(X, pd.DataFrame) else X) 274 | 275 | def copy(self): 276 | with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): 277 | return copy.deepcopy(self) 278 | 279 | def save(self, model_file_path): 280 | self.model.save_model(model_file_path) 281 | self.model_file_path = model_file_path 282 | logger.debug("LightgbmAlgorithm save model to %s" % model_file_path) 283 | 284 | def load(self, model_file_path): 285 | logger.debug("LightgbmAlgorithm load model from %s" % model_file_path) 286 | self.model_file_path = model_file_path 287 | self.model = lgb.Booster(model_file=model_file_path) 288 | 289 | def get_metric_name(self): 290 | metric = self.params.get("metric") 291 | custom_metric = self.params.get("custom_eval_metric_name") 292 | 293 | if metric is None: 294 | return None 295 | if metric == "custom": 296 | return custom_metric 297 | if metric == "binary_logloss": 298 | return "logloss" 299 | elif metric == "multi_logloss": 300 | return "logloss" 301 | return metric 302 | 303 | 304 | lgbm_bin_params = { 305 | "objective": ["binary"], 306 | "num_leaves": [15, 31, 63, 95, 127], 307 | "learning_rate": [0.05, 0.1, 0.2], 308 | "feature_fraction": [0.5, 0.8, 0.9, 1.0], 309 | "bagging_fraction": [0.5, 0.8, 0.9, 1.0], 310 | "min_data_in_leaf": [5, 10, 15, 20, 30, 50], 311 | } 312 | 313 | classification_bin_default_params = { 314 | "objective": "binary", 315 | "num_leaves": 63, 316 | "learning_rate": 0.05, 317 | "feature_fraction": 0.9, 318 | "bagging_fraction": 0.9, 319 | "min_data_in_leaf": 10, 320 | } 321 | 322 | 323 | additional = { 324 | "max_rounds": 10000, 325 | "early_stopping_rounds": 50, 326 | "max_rows_limit": None, 327 | "max_cols_limit": None, 328 | } 329 | 330 | required_preprocessing = [ 331 | "missing_values_inputation", 332 | "convert_categorical", 333 | "datetime_transform", 334 | "text_transform", 335 | "target_as_integer", 336 | ] 
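# --- Editor's sketch (hedged; not part of the original module) --------------
# The dicts above wire LightGBM into the MLJAR tuner: `lgbm_bin_params`
# lists the candidate values per hyperparameter, `classification_bin_default_params`
# is the starting configuration, and `additional` caps boosting rounds and
# early stopping. A minimal illustration of drawing one random candidate
# from such a search space (`draw_candidate` is a hypothetical helper
# introduced here for illustration, not an API of this module):
#
#     import random
#
#     def draw_candidate(space):
#         # pick one value per hyperparameter from its candidate list
#         return {key: random.choice(values) for key, values in space.items()}
#
#     params = draw_candidate(lgbm_bin_params)
#     # e.g. {"objective": "binary", "num_leaves": 63, "learning_rate": 0.1,
#     #       "feature_fraction": 0.8, "bagging_fraction": 0.9,
#     #       "min_data_in_leaf": 20}
# -----------------------------------------------------------------------------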
337 | 
338 | lgbm_multi_params = copy.deepcopy(lgbm_bin_params)
339 | lgbm_multi_params["objective"] = ["multiclass"]
340 | 
341 | classification_multi_default_params = {
342 |     "objective": "multiclass",
343 |     "num_leaves": 63,
344 |     "learning_rate": 0.05,
345 |     "feature_fraction": 0.9,
346 |     "bagging_fraction": 0.9,
347 |     "min_data_in_leaf": 10,
348 | }
349 | 
350 | lgbr_params = copy.deepcopy(lgbm_bin_params)
351 | lgbr_params["objective"] = ["regression"]
352 | 
353 | 
354 | class LgbmClassifier(ClassifierMixin, LightgbmAlgorithm):
355 |     pass
356 | 
357 | 
358 | AlgorithmsRegistry.add(
359 |     BINARY_CLASSIFICATION,
360 |     LgbmClassifier,
361 |     lgbm_bin_params,
362 |     required_preprocessing,
363 |     additional,
364 |     classification_bin_default_params,
365 | )
366 | 
367 | AlgorithmsRegistry.add(
368 |     MULTICLASS_CLASSIFICATION,
369 |     LgbmClassifier,
370 |     lgbm_multi_params,
371 |     required_preprocessing,
372 |     additional,
373 |     classification_multi_default_params,
374 | )
375 | 
376 | regression_required_preprocessing = [
377 |     "missing_values_inputation",
378 |     "convert_categorical",
379 |     "datetime_transform",
380 |     "text_transform",
381 |     "target_scale",
382 | ]
383 | 
384 | 
385 | regression_default_params = {
386 |     "objective": "regression",
387 |     "num_leaves": 63,
388 |     "learning_rate": 0.05,
389 |     "feature_fraction": 0.9,
390 |     "bagging_fraction": 0.9,
391 |     "min_data_in_leaf": 10,
392 | }
393 | 
394 | 
395 | class LgbmRegressor(RegressorMixin, LightgbmAlgorithm):
396 |     pass
397 | 
398 | 
399 | AlgorithmsRegistry.add(
400 |     REGRESSION,
401 |     LgbmRegressor,
402 |     lgbr_params,
403 |     regression_required_preprocessing,
404 |     additional,
405 |     regression_default_params,
406 | )
407 | 
```

--------------------------------------------------------------------------------
/supervised/utils/shap.py:
--------------------------------------------------------------------------------

```python
1 | import logging
2 | import os
3 | 
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | shap_package_available = False
8 | try:
9 |     # I'm tired of all shap dependency hell
10 |     # ugh
11 |     import shap
12 |     shap_package_available = True
13 | except Exception:
14 |     pass
15 | 
16 | from sklearn.preprocessing import OneHotEncoder
17 | 
18 | from supervised.algorithms.registry import (
19 |     BINARY_CLASSIFICATION,
20 |     MULTICLASS_CLASSIFICATION,
21 |     REGRESSION,
22 | )
23 | 
24 | logger = logging.getLogger(__name__)
25 | from supervised.utils.config import LOG_LEVEL
26 | 
27 | logger.setLevel(LOG_LEVEL)
28 | import warnings
29 | 
30 | 
31 | class PlotSHAP:
32 |     @staticmethod
33 |     def is_available(algorithm, X_train, y_train, ml_task):
34 |         if not shap_package_available:
35 |             return False
36 |         # https://github.com/mljar/mljar-supervised/issues/112 disable for NN
37 |         # https://github.com/mljar/mljar-supervised/issues/114 disable for CatBoost
38 |         if algorithm.algorithm_short_name in ["Baseline", "Neural Network", "CatBoost"]:
39 |             return False
40 |         if (
41 |             algorithm.algorithm_short_name == "Xgboost"
42 |             and algorithm.learner_params["booster"] == "gblinear"
43 |         ):
44 |             # Xgboost gblinear is not supported by SHAP
45 |             return False
46 |         # disable for large number of columns
47 |         if X_train.shape[1] > 500:
48 |             warnings.warn(
49 |                 "Disable SHAP explanations because of number of columns > 500."
50 |             )
51 |             return False
52 |         if ml_task == MULTICLASS_CLASSIFICATION and len(np.unique(y_train)) > 100:
53 |             warnings.warn(
54 |                 "Disable SHAP explanations because of large number of classes (> 100)."
55 | ) 56 | return False 57 | if X_train.shape[0] < 20: 58 | warnings.warn( 59 | "Disable SHAP explanations because of small number of samples (< 20)." 60 | ) 61 | return False 62 | return True 63 | 64 | @staticmethod 65 | def get_explainer(algorithm, X_train): 66 | explainer = None 67 | if algorithm.algorithm_short_name in [ 68 | "Xgboost", 69 | "Decision Tree", 70 | "Random Forest", 71 | "LightGBM", 72 | "Extra Trees", 73 | "CatBoost", 74 | ]: 75 | explainer = shap.TreeExplainer(algorithm.model) 76 | elif algorithm.algorithm_short_name in ["Linear"]: 77 | explainer = shap.LinearExplainer(algorithm.model, X_train) 78 | # elif algorithm.algorithm_short_name in ["Neural Network"]: 79 | # explainer = shap.KernelExplainer(algorithm.model.predict, X_train) # slow 80 | 81 | return explainer 82 | 83 | @staticmethod 84 | def get_sample(X_validation, y_validation): 85 | # too many samples in the data, down-sample it 86 | SAMPLES_LIMIT = 1000 87 | if X_validation.shape[0] > SAMPLES_LIMIT: 88 | X_validation.reset_index(inplace=True, drop=True) 89 | y_validation.reset_index(inplace=True, drop=True) 90 | X_vald = X_validation.sample(SAMPLES_LIMIT) 91 | y_vald = y_validation[X_vald.index] 92 | else: 93 | X_vald = X_validation 94 | y_vald = y_validation 95 | return X_vald, y_vald 96 | 97 | def get_predictions(algorithm, X_vald, y_vald, ml_task): 98 | # compute predictions on down-sampled data 99 | predictions = algorithm.predict(X_vald) 100 | 101 | if ml_task == MULTICLASS_CLASSIFICATION: 102 | oh = OneHotEncoder(sparse_output=False) 103 | y_encoded = oh.fit_transform(np.array(y_vald).reshape(-1, 1)) 104 | residua = np.sum(np.abs(np.array(y_encoded) - predictions), axis=1) 105 | else: 106 | residua = np.abs(np.array(y_vald) - predictions) 107 | 108 | df_preds = pd.DataFrame( 109 | {"res": residua, "lp": range(residua.shape[0]), "target": np.array(y_vald)}, 110 | index=X_vald.index, 111 | ) 112 | df_preds = df_preds.sort_values(by="res", ascending=False) 113 | 114 | return df_preds 115 | 116 | @staticmethod 117 | def summary(shap_values, X_vald, model_file_path, learner_name, class_names): 118 | fig = plt.gcf() 119 | classes = None 120 | if class_names is not None and len(class_names): 121 | classes = class_names 122 | with warnings.catch_warnings(): 123 | warnings.simplefilter("ignore") 124 | shap.summary_plot( 125 | shap_values, X_vald, plot_type="bar", show=False, class_names=classes 126 | ) 127 | fig.tight_layout(pad=2.0) 128 | fig.savefig(os.path.join(model_file_path, f"{learner_name}_shap_summary.png")) 129 | plt.close("all") 130 | 131 | vals = None 132 | if isinstance(shap_values, list): 133 | for sh in shap_values: 134 | v = np.abs(sh).mean(0) 135 | vals = v if vals is None else vals + v 136 | else: 137 | vals = np.abs(shap_values).mean(0) 138 | feature_importance = pd.DataFrame( 139 | list(zip(X_vald.columns, vals)), columns=["feature", "shap_importance"] 140 | ) 141 | feature_importance.sort_values( 142 | by=["shap_importance"], ascending=False, inplace=True 143 | ) 144 | feature_importance.to_csv( 145 | os.path.join(model_file_path, f"{learner_name}_shap_importance.csv"), 146 | index=False, 147 | ) 148 | 149 | @staticmethod 150 | def dependence(shap_values, X_vald, model_file_path, learner_name, file_postfix=""): 151 | with warnings.catch_warnings(): 152 | warnings.simplefilter("ignore") 153 | fig = plt.figure(figsize=(14, 7)) 154 | plots_cnt = np.min([9, X_vald.shape[1]]) 155 | cols_cnt = 3 156 | rows_cnt = 3 157 | if plots_cnt < 4: 158 | rows_cnt = 1 159 | elif plots_cnt < 7: 160 | rows_cnt 
= 2 161 | for i in range(plots_cnt): 162 | ax = fig.add_subplot(rows_cnt, cols_cnt, i + 1) 163 | shap.dependence_plot( 164 | f"rank({i})", 165 | shap_values, 166 | X_vald, 167 | show=False, 168 | title=f"Importance #{i+1}", 169 | ax=ax, 170 | ) 171 | 172 | fig.tight_layout(pad=2.0) 173 | fig.savefig( 174 | os.path.join( 175 | model_file_path, f"{learner_name}_shap_dependence{file_postfix}.png" 176 | ) 177 | ) 178 | plt.close("all") 179 | 180 | @staticmethod 181 | def compute( 182 | algorithm, 183 | X_train, 184 | y_train, 185 | X_validation, 186 | y_validation, 187 | model_file_path, 188 | learner_name, 189 | class_names, 190 | ml_task, 191 | ): 192 | if not PlotSHAP.is_available(algorithm, X_train, y_train, ml_task): 193 | return 194 | try: 195 | with warnings.catch_warnings(): 196 | warnings.simplefilter("ignore") 197 | explainer = PlotSHAP.get_explainer(algorithm, X_train) 198 | X_vald, y_vald = PlotSHAP.get_sample(X_validation, y_validation) 199 | shap_values = explainer.shap_values(X_vald) 200 | 201 | # fix problem with 1 or 2 dimensions for binary classification 202 | expected_value = explainer.expected_value 203 | if ml_task == BINARY_CLASSIFICATION and isinstance(shap_values, list): 204 | shap_values = shap_values[1] 205 | expected_value = explainer.expected_value[1] 206 | 207 | # Summary SHAP plot 208 | PlotSHAP.summary( 209 | shap_values, X_vald, model_file_path, learner_name, class_names 210 | ) 211 | # Dependence SHAP plots 212 | if ml_task == MULTICLASS_CLASSIFICATION: 213 | for t in np.unique(y_vald): 214 | PlotSHAP.dependence( 215 | shap_values[t], 216 | X_vald, 217 | model_file_path, 218 | learner_name, 219 | f"_class_{class_names[t]}", 220 | ) 221 | else: 222 | PlotSHAP.dependence(shap_values, X_vald, model_file_path, learner_name) 223 | 224 | # Decision SHAP plots 225 | df_preds = PlotSHAP.get_predictions(algorithm, X_vald, y_vald, ml_task) 226 | 227 | if ml_task == REGRESSION: 228 | PlotSHAP.decisions_regression( 229 | df_preds, 230 | shap_values, 231 | expected_value, 232 | X_vald, 233 | y_vald, 234 | model_file_path, 235 | learner_name, 236 | ) 237 | elif ml_task == BINARY_CLASSIFICATION: 238 | PlotSHAP.decisions_binary( 239 | df_preds, 240 | shap_values, 241 | expected_value, 242 | X_vald, 243 | y_vald, 244 | model_file_path, 245 | learner_name, 246 | ) 247 | else: 248 | PlotSHAP.decisions_multiclass( 249 | df_preds, 250 | shap_values, 251 | expected_value, 252 | X_vald, 253 | y_vald, 254 | model_file_path, 255 | learner_name, 256 | class_names, 257 | ) 258 | except Exception as e: 259 | pass 260 | # print( 261 | # f"Exception while producing SHAP explanations. {str(e)}\nContinuing ..." 
262 | # ) 263 | 264 | @staticmethod 265 | def decisions_regression( 266 | df_preds, 267 | shap_values, 268 | expected_value, 269 | X_vald, 270 | y_vald, 271 | model_file_path, 272 | learner_name, 273 | ): 274 | fig = plt.gcf() 275 | shap.decision_plot( 276 | expected_value, 277 | shap_values[df_preds.lp[:10], :], 278 | X_vald.loc[df_preds.index[:10]], 279 | show=False, 280 | ) 281 | fig.tight_layout(pad=2.0) 282 | fig.savefig( 283 | os.path.join(model_file_path, f"{learner_name}_shap_worst_decisions.png") 284 | ) 285 | plt.close("all") 286 | 287 | fig = plt.gcf() 288 | shap.decision_plot( 289 | expected_value, 290 | shap_values[df_preds.lp[-10:], :], 291 | X_vald.loc[df_preds.index[-10:]], 292 | show=False, 293 | ) 294 | fig.tight_layout(pad=2.0) 295 | fig.savefig( 296 | os.path.join(model_file_path, f"{learner_name}_shap_best_decisions.png") 297 | ) 298 | plt.close("all") 299 | 300 | @staticmethod 301 | def decisions_binary( 302 | df_preds, 303 | shap_values, 304 | expected_value, 305 | X_vald, 306 | y_vald, 307 | model_file_path, 308 | learner_name, 309 | ): 310 | # classes are from 0 ... 311 | for t in np.unique(y_vald): 312 | fig = plt.gcf() 313 | shap.decision_plot( 314 | expected_value, 315 | shap_values[df_preds[df_preds.target == t].lp[:10], :], 316 | X_vald.loc[df_preds[df_preds.target == t].index[:10]], 317 | show=False, 318 | ) 319 | fig.tight_layout(pad=2.0) 320 | fig.savefig( 321 | os.path.join( 322 | model_file_path, 323 | f"{learner_name}_shap_class_{t}_worst_decisions.png", 324 | ) 325 | ) 326 | plt.close("all") 327 | 328 | fig = plt.gcf() 329 | shap.decision_plot( 330 | expected_value, 331 | shap_values[df_preds[df_preds.target == t].lp[-10:], :], 332 | X_vald.loc[df_preds[df_preds.target == t].index[-10:]], 333 | show=False, 334 | ) 335 | fig.tight_layout(pad=2.0) 336 | fig.savefig( 337 | os.path.join( 338 | model_file_path, f"{learner_name}_shap_class_{t}_best_decisions.png" 339 | ) 340 | ) 341 | plt.close("all") 342 | 343 | @staticmethod 344 | def decisions_multiclass( 345 | df_preds, 346 | shap_values, 347 | expected_value, 348 | X_vald, 349 | y_vald, 350 | model_file_path, 351 | learner_name, 352 | class_names, 353 | ): 354 | for decision_type in ["worst", "best"]: 355 | m = 1 if decision_type == "worst" else -1 356 | for i in range(4): 357 | fig = plt.gcf() 358 | shap.multioutput_decision_plot( 359 | list(expected_value), 360 | shap_values, 361 | row_index=df_preds.lp.iloc[m * i], 362 | show=False, 363 | legend_labels=class_names, 364 | title=f"It should be {class_names[df_preds.target.iloc[m*i]]}", 365 | ) 366 | fig.tight_layout(pad=2.0) 367 | fig.savefig( 368 | os.path.join( 369 | model_file_path, 370 | f"{learner_name}_sample_{i}_{decision_type}_decisions.png", 371 | ) 372 | ) 373 | plt.close("all") 374 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/xgboost.py: -------------------------------------------------------------------------------- ```python 1 | import copy 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import xgboost as xgb 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | 9 | from supervised.algorithms.algorithm import BaseAlgorithm 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.utils.config import LOG_LEVEL 17 | from supervised.utils.metric import ( 18 | xgboost_eval_metric_accuracy, 19 | 
xgboost_eval_metric_average_precision, 20 | xgboost_eval_metric_f1, 21 | xgboost_eval_metric_mse, 22 | xgboost_eval_metric_pearson, 23 | xgboost_eval_metric_r2, 24 | xgboost_eval_metric_spearman, 25 | xgboost_eval_metric_user_defined, 26 | ) 27 | 28 | logger = logging.getLogger(__name__) 29 | logger.setLevel(LOG_LEVEL) 30 | 31 | 32 | class XgbAlgorithmException(Exception): 33 | def __init__(self, message): 34 | super(XgbAlgorithmException, self).__init__(message) 35 | logger.error(message) 36 | 37 | 38 | def time_constraint(env): 39 | # print("time constraint") 40 | pass 41 | 42 | 43 | def xgboost_eval_metric(ml_task, automl_eval_metric): 44 | # the mapping is almost the same 45 | eval_metric_name = automl_eval_metric 46 | if ml_task == MULTICLASS_CLASSIFICATION: 47 | if automl_eval_metric == "logloss": 48 | eval_metric_name = "mlogloss" 49 | return eval_metric_name 50 | 51 | 52 | def xgboost_objective(ml_task, automl_eval_metric): 53 | objective = "reg:squarederror" 54 | if ml_task == BINARY_CLASSIFICATION: 55 | objective = "binary:logistic" 56 | elif ml_task == MULTICLASS_CLASSIFICATION: 57 | objective = "multi:softprob" 58 | else: # ml_task == REGRESSION 59 | objective = "reg:squarederror" 60 | return objective 61 | 62 | 63 | class XgbAlgorithm(BaseAlgorithm): 64 | """ 65 | This is a wrapper over xgboost algorithm. 66 | """ 67 | 68 | algorithm_name = "Extreme Gradient Boosting" 69 | algorithm_short_name = "Xgboost" 70 | 71 | def __init__(self, params): 72 | super(XgbAlgorithm, self).__init__(params) 73 | self.library_version = xgb.__version__ 74 | 75 | self.explain_level = params.get("explain_level", 0) 76 | self.boosting_rounds = additional.get("max_rounds", 10000) 77 | self.max_iters = 1 78 | self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) 79 | self.learner_params = { 80 | "tree_method": "hist", 81 | "booster": "gbtree", 82 | "objective": self.params.get("objective"), 83 | "eval_metric": self.params.get("eval_metric"), 84 | "eta": self.params.get("eta", 0.01), 85 | "max_depth": self.params.get("max_depth", 1), 86 | "min_child_weight": self.params.get("min_child_weight", 1), 87 | "subsample": self.params.get("subsample", 0.8), 88 | "colsample_bytree": self.params.get("colsample_bytree", 0.8), 89 | "n_jobs": self.params.get("n_jobs", -1), 90 | # "silent": self.params.get("silent", 1), 91 | "seed": self.params.get("seed", 1), 92 | "verbosity": 0, 93 | } 94 | 95 | if "lambda" in self.params: 96 | self.learner_params["lambda"] = self.params["lambda"] 97 | if "alpha" in self.params: 98 | self.learner_params["alpha"] = self.params["alpha"] 99 | 100 | # check https://github.com/dmlc/xgboost/issues/5637 101 | if self.learner_params["seed"] > 2147483647: 102 | self.learner_params["seed"] = self.learner_params["seed"] % 2147483647 103 | if "num_class" in self.params: # multiclass classification 104 | self.learner_params["num_class"] = self.params.get("num_class") 105 | 106 | if "max_rounds" in self.params: 107 | self.boosting_rounds = self.params["max_rounds"] 108 | 109 | self.custom_eval_metric = None 110 | if self.params.get("eval_metric", "") == "r2": 111 | self.custom_eval_metric = xgboost_eval_metric_r2 112 | elif self.params.get("eval_metric", "") == "spearman": 113 | self.custom_eval_metric = xgboost_eval_metric_spearman 114 | elif self.params.get("eval_metric", "") == "pearson": 115 | self.custom_eval_metric = xgboost_eval_metric_pearson 116 | elif self.params.get("eval_metric", "") == "f1": 117 | self.custom_eval_metric = xgboost_eval_metric_f1 118 | elif 
self.params.get("eval_metric", "") == "average_precision": 119 | self.custom_eval_metric = xgboost_eval_metric_average_precision 120 | elif self.params.get("eval_metric", "") == "accuracy": 121 | self.custom_eval_metric = xgboost_eval_metric_accuracy 122 | elif self.params.get("eval_metric", "") == "mse": 123 | self.custom_eval_metric = xgboost_eval_metric_mse 124 | elif self.params.get("eval_metric", "") == "user_defined_metric": 125 | self.custom_eval_metric = xgboost_eval_metric_user_defined 126 | 127 | logger.debug("XgbLearner __init__") 128 | 129 | """ 130 | def get_boosting_rounds(self, dtrain, evals, esr, max_time): 131 | if max_time is None: 132 | return self.boosting_rounds 133 | 134 | start_time = time.time() 135 | evals_result = {} 136 | model = xgb.train( 137 | self.learner_params, 138 | dtrain, 139 | 2, 140 | evals=evals, 141 | early_stopping_rounds=esr, 142 | evals_result=evals_result, 143 | verbose_eval=False, 144 | ) 145 | time_1_iter = (time.time() - start_time) / 2.0 146 | 147 | # 2.0 is just a scaling factor 148 | # purely heuristic 149 | iters = int(max_time / time_1_iter * 2.0) 150 | iters = max(iters, 100) 151 | iters = min(iters, 10000) 152 | return iters 153 | """ 154 | 155 | def fit( 156 | self, 157 | X, 158 | y, 159 | sample_weight=None, 160 | X_validation=None, 161 | y_validation=None, 162 | sample_weight_validation=None, 163 | log_to_file=None, 164 | max_time=None, 165 | ): 166 | dtrain = xgb.DMatrix( 167 | X.values if isinstance(X, pd.DataFrame) else X, 168 | label=y, 169 | missing=np.NaN, 170 | weight=sample_weight, 171 | ) 172 | 173 | if X_validation is not None and y_validation is not None: 174 | dvalidation = xgb.DMatrix( 175 | X_validation.values 176 | if isinstance(X_validation, pd.DataFrame) 177 | else X_validation, 178 | label=y_validation, 179 | missing=np.NaN, 180 | weight=sample_weight_validation, 181 | ) 182 | else: 183 | dvalidation = None 184 | 185 | evals_result = {} 186 | 187 | evals = [] 188 | esr = None 189 | if X_validation is not None and y_validation is not None: 190 | evals = [(dtrain, "train"), (dvalidation, "validation")] 191 | esr = self.early_stopping_rounds 192 | 193 | # disable for now, dont have better idea how to handle time limit ... 194 | # looks like there is better not to limit the algorithm 195 | # just wait till they converge ... 
196 |         # boosting_rounds = self.get_boosting_rounds(dtrain, evals, esr, max_time)
197 | 
198 |         if self.custom_eval_metric is not None:
199 |             del self.learner_params["eval_metric"]
200 | 
201 |         self.model = xgb.train(
202 |             self.learner_params,
203 |             dtrain,
204 |             self.boosting_rounds,
205 |             evals=evals,
206 |             early_stopping_rounds=esr,
207 |             evals_result=evals_result,
208 |             verbose_eval=False,
209 |             custom_metric=self.custom_eval_metric
210 |             # callbacks=[time_constraint] # callback slows down by factor ~8
211 |         )
212 | 
213 |         del dtrain
214 |         del dvalidation
215 | 
216 |         if log_to_file is not None:
217 |             metric_name = list(evals_result["train"].keys())[-1]
218 | 
219 |             result = pd.DataFrame(
220 |                 {
221 |                     "iteration": range(len(evals_result["train"][metric_name])),
222 |                     "train": evals_result["train"][metric_name],
223 |                     "validation": evals_result["validation"][metric_name],
224 |                 }
225 |             )
226 |             # if it is a custom metric
227 |             # that is always minimized,
228 |             # we need to revert it
229 |             if metric_name in [
230 |                 "r2",
231 |                 "spearman",
232 |                 "pearson",
233 |                 "f1",
234 |                 "average_precision",
235 |                 "accuracy",
236 |             ]:
237 |                 result["train"] *= -1.0
238 |                 result["validation"] *= -1.0
239 | 
240 |             result.to_csv(log_to_file, index=False, header=False)
241 | 
242 |         if self.params["ml_task"] != REGRESSION:
243 |             self.classes_ = np.unique(y)
244 | 
245 |         # fix high memory consumption in xgboost,
246 |         # waiting for release with fix
247 |         # https://github.com/dmlc/xgboost/issues/5474
248 |         """
249 |         # disable, for now all learners are saved to hard disk and then deleted from RAM
250 |         with tempfile.NamedTemporaryFile() as tmp:
251 |             self.model.save_model(tmp.name)
252 |             del self.model
253 |             self.model = xgb.Booster()
254 |             self.model.load_model(tmp.name)
255 |         """
256 | 
257 |     def is_fitted(self):
258 |         return self.model is not None
259 | 
260 |     def predict(self, X):
261 |         self.reload()
262 | 
263 |         if self.model is None:
264 |             raise XgbAlgorithmException("Xgboost model is None")
265 | 
266 |         dtrain = xgb.DMatrix(
267 |             X.values if isinstance(X, pd.DataFrame) else X, missing=np.NaN
268 |         )
269 |         # xgboost > 2.0.0 version
270 |         if hasattr(self.model, "best_iteration"):
271 |             a = self.model.predict(
272 |                 dtrain, iteration_range=(0, self.model.best_iteration + 1)
273 |             )
274 |         else:
275 |             a = self.model.predict(dtrain)
276 | 
277 |         return a
278 | 
279 |     def copy(self):
280 |         return copy.deepcopy(self)
281 | 
282 |     def save(self, model_file_path):
283 |         self.model.save_model(model_file_path)
284 |         self.model_file_path = model_file_path
285 |         logger.debug("XgbAlgorithm save model to %s" % model_file_path)
286 | 
287 |     def load(self, model_file_path):
288 |         logger.debug("XgbLearner load model from %s" % model_file_path)
289 |         self.model = xgb.Booster()  # init model
290 |         self.model.load_model(model_file_path)
291 |         self.model_file_path = model_file_path
292 | 
293 |     def file_extension(self):
294 |         # we need to keep models as json files
295 |         # to keep information about best_iteration
296 |         return "xgboost.json"
297 | 
298 |     def get_metric_name(self):
299 |         metric = self.params.get("eval_metric")
300 |         if metric is None:
301 |             return None
302 |         if metric == "mlogloss":
303 |             return "logloss"
304 |         return metric
305 | 
306 | 
307 | # For binary classification target should be 0, 1. There should be no NaNs in target.
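# Illustrative sketch (hypothetical, not part of the upstream file): encoding
# an arbitrary binary target into the required 0/1 form before fit():
#
#     import numpy as np
#     y_raw = np.array(["no", "yes", "yes", "no"])
#     y = (y_raw == "yes").astype(int)  # -> array([0, 1, 1, 0]), no NaNs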
308 | xgb_bin_class_params = { 309 | "objective": ["binary:logistic"], 310 | "eta": [0.05, 0.075, 0.1, 0.15], 311 | "max_depth": [4, 5, 6, 7, 8, 9], 312 | "min_child_weight": [1, 5, 10, 25, 50], 313 | "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 314 | "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 315 | } 316 | 317 | classification_bin_default_params = { 318 | "objective": "binary:logistic", 319 | "eta": 0.075, 320 | "max_depth": 6, 321 | "min_child_weight": 1, 322 | "subsample": 1.0, 323 | "colsample_bytree": 1.0, 324 | } 325 | 326 | xgb_regression_params = dict(xgb_bin_class_params) 327 | xgb_regression_params["objective"] = ["reg:squarederror"] 328 | # xgb_regression_params["eval_metric"] = ["rmse", "mae", "mape"] 329 | xgb_regression_params["max_depth"] = [4, 5, 6, 7, 8, 9] 330 | 331 | 332 | xgb_multi_class_params = dict(xgb_bin_class_params) 333 | xgb_multi_class_params["objective"] = ["multi:softprob"] 334 | # xgb_multi_class_params["eval_metric"] = ["mlogloss"] 335 | 336 | classification_multi_default_params = { 337 | "objective": "multi:softprob", 338 | "eta": 0.075, 339 | "max_depth": 6, 340 | "min_child_weight": 1, 341 | "subsample": 1.0, 342 | "colsample_bytree": 1.0, 343 | } 344 | 345 | 346 | regression_default_params = { 347 | "objective": "reg:squarederror", 348 | "eta": 0.075, 349 | "max_depth": 6, 350 | "min_child_weight": 1, 351 | "subsample": 1.0, 352 | "colsample_bytree": 1.0, 353 | } 354 | 355 | additional = { 356 | "max_rounds": 10000, 357 | "early_stopping_rounds": 50, 358 | "max_rows_limit": None, 359 | "max_cols_limit": None, 360 | } 361 | required_preprocessing = [ 362 | "missing_values_inputation", 363 | "convert_categorical", 364 | "datetime_transform", 365 | "text_transform", 366 | "target_as_integer", 367 | ] 368 | 369 | 370 | class XgbClassifier(ClassifierMixin, XgbAlgorithm): 371 | pass 372 | 373 | 374 | AlgorithmsRegistry.add( 375 | BINARY_CLASSIFICATION, 376 | XgbClassifier, 377 | xgb_bin_class_params, 378 | required_preprocessing, 379 | additional, 380 | classification_bin_default_params, 381 | ) 382 | 383 | AlgorithmsRegistry.add( 384 | MULTICLASS_CLASSIFICATION, 385 | XgbClassifier, 386 | xgb_multi_class_params, 387 | required_preprocessing, 388 | additional, 389 | classification_multi_default_params, 390 | ) 391 | 392 | regression_required_preprocessing = [ 393 | "missing_values_inputation", 394 | "convert_categorical", 395 | "datetime_transform", 396 | "text_transform", 397 | "target_scale", 398 | ] 399 | 400 | 401 | class XgbRegressor(RegressorMixin, XgbAlgorithm): 402 | pass 403 | 404 | 405 | AlgorithmsRegistry.add( 406 | REGRESSION, 407 | XgbRegressor, 408 | xgb_regression_params, 409 | regression_required_preprocessing, 410 | additional, 411 | regression_default_params, 412 | ) 413 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from sklearn import datasets 10 | from sklearn.decomposition import PCA 11 | from sklearn.pipeline import make_pipeline 12 | 13 | from supervised import AutoML 14 | from supervised.exceptions import AutoMLException 15 | 16 | iris = datasets.load_iris() 17 | housing = datasets.fetch_california_housing() 18 | # limit data size for faster tests 19 | housing.data = housing.data[:500] 20 | housing.target = 
housing.target[:500] 21 | breast_cancer = datasets.load_breast_cancer() 22 | 23 | 24 | @pytest.mark.usefixtures("data_folder") 25 | class AutoMLTest(unittest.TestCase): 26 | automl_dir = "AutoMLTest" 27 | data_folder: Path 28 | 29 | def tearDown(self): 30 | shutil.rmtree(self.automl_dir, ignore_errors=True) 31 | 32 | def setUp(self): 33 | shutil.rmtree(self.automl_dir, ignore_errors=True) 34 | 35 | def test_new_directory(self): 36 | """Directory does not exist, create it""" 37 | # Assert directory does not exist 38 | self.assertTrue(not os.path.exists(self.automl_dir)) 39 | # Create model with dir 40 | model = AutoML(results_path=self.automl_dir) 41 | # Generate data 42 | X, y = datasets.make_classification(n_samples=30) 43 | # Fit data 44 | model.fit(X, y) # AutoML only validates constructor params on `fit()` call 45 | # Assert directory was created 46 | self.assertTrue(os.path.exists(self.automl_dir)) 47 | 48 | def test_empty_directory(self): 49 | """Directory exists and is empty, use it""" 50 | # Assert directory does not exist 51 | self.assertTrue(not os.path.exists(self.automl_dir)) 52 | # Make dir 53 | os.mkdir(self.automl_dir) 54 | # Assert dir exists 55 | self.assertTrue(os.path.exists(self.automl_dir)) 56 | # Create automl with dir 57 | model = AutoML(results_path=self.automl_dir) 58 | # Generate data 59 | X, y = datasets.make_classification(n_samples=30) 60 | # Fit data 61 | model.fit(X, y) # AutoML only validates constructor params on `fit()` call 62 | self.assertTrue(os.path.exists(self.automl_dir)) 63 | 64 | def test_not_empty_directory(self): 65 | """ 66 | Directory exists and is not empty, 67 | there is no params.json file in it, dont use it, raise exception 68 | """ 69 | # Assert directory does not exist 70 | self.assertTrue(not os.path.exists(self.automl_dir)) 71 | # Create directory 72 | os.mkdir(self.automl_dir) 73 | # Write some content to directory 74 | open(os.path.join(self.automl_dir, "test.file"), "w").close() 75 | # Assert directory exists 76 | self.assertTrue(os.path.exists(self.automl_dir)) 77 | # Generate data 78 | X, y = datasets.make_classification(n_samples=30) 79 | # Assert than an Exception is raised 80 | with self.assertRaises(AutoMLException) as context: 81 | a = AutoML(results_path=self.automl_dir) 82 | a.fit(X, y) # AutoML only validates constructor params on `fit()` call 83 | 84 | self.assertTrue("not empty" in str(context.exception)) 85 | 86 | def test_use_directory_if_non_empty_exists_with_params_json(self): 87 | """ 88 | Directory exists and is not empty, 89 | there is params.json in it, try to load it, 90 | raise exception because of fake params.json 91 | """ 92 | # Assert directory does not exist 93 | self.assertTrue(not os.path.exists(self.automl_dir)) 94 | # Create dir 95 | os.mkdir(self.automl_dir) 96 | # Write `params.json` to directory 97 | open(os.path.join(self.automl_dir, "params.json"), "w").close() 98 | # Assert directory exists 99 | self.assertTrue(os.path.exists(self.automl_dir)) 100 | # Generate data 101 | X, y = datasets.make_classification(n_samples=30) 102 | with self.assertRaises(AutoMLException) as context: 103 | a = AutoML(results_path=self.automl_dir) 104 | a.predict(X) # AutoML tries to load on predict call 105 | self.assertTrue("Cannot load" in str(context.exception)) 106 | 107 | def test_get_params(self): 108 | """ 109 | Passes params in AutoML constructor and uses `get_params()` after fitting. 110 | Initial params must be equal to the ones returned by `get_params()`. 
111 |         """
112 |         # Create model
113 |         model = AutoML(
114 |             hill_climbing_steps=3, start_random_models=1, results_path=self.automl_dir
115 |         )
116 |         # Get params before fit
117 |         params_before_fit = model.get_params()
118 |         # Generate data
119 |         X, y = datasets.make_classification(n_samples=30)
120 |         # Fit data
121 |         model.fit(X, y)
122 |         # Get params after fit
123 |         params_after_fit = model.get_params()
124 |         # Assert before and after params are equal
125 |         self.assertEqual(params_before_fit, params_after_fit)
126 | 
127 |     def test_scikit_learn_pipeline_integration(self):
128 |         """
129 |         Tests if AutoML is working on a scikit-learn's pipeline
130 |         """
131 |         # Create dataset
132 |         X, y = datasets.make_classification(n_samples=30)
133 |         # apply PCA to X
134 |         new_X = PCA(random_state=0).fit_transform(X)
135 |         # Create default model
136 |         default_model = AutoML(
137 |             algorithms=["Linear"], random_state=0, results_path=self.automl_dir
138 |         )
139 |         # Fit default model with transformed X and y, and predict transformed X
140 |         y_pred_default = default_model.fit(new_X, y).predict(new_X)
141 | 
142 |         # Create pipeline with PCA and AutoML
143 |         pipeline = make_pipeline(
144 |             PCA(random_state=0), AutoML(algorithms=["Linear"], random_state=0)
145 |         )
146 |         # Fit with original X and y and predict X
147 |         y_pred_pipe = pipeline.fit(X, y).predict(X)
148 |         # y_pred_default must be equal to y_pred_pipe
149 |         self.assertTrue((y_pred_pipe == y_pred_default).all())
150 | 
151 |     def test_predict_proba_in_regression(self):
152 |         model = AutoML(
153 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
154 |         )
155 |         model.fit(housing.data, housing.target)
156 |         with self.assertRaises(AutoMLException) as context:
157 |             # Try to call predict_proba in regression task
158 |             model.predict_proba(housing.data)
159 | 
160 |     def test_iris_dataset(self):
161 |         """Tests AutoML in the iris dataset (Multiclass classification)"""
162 |         model = AutoML(
163 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
164 |         )
165 |         score = model.fit(iris.data, iris.target).score(iris.data, iris.target)
166 |         self.assertGreater(score, 0.5)
167 | 
168 |     def test_housing_dataset(self):
169 |         """Tests AutoML in the housing dataset (Regression)"""
170 |         model = AutoML(
171 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
172 |         )
173 |         score = model.fit(housing.data, housing.target).score(
174 |             housing.data, housing.target
175 |         )
176 |         self.assertGreater(score, 0.5)
177 | 
178 |     def test_breast_cancer_dataset(self):
179 |         """Tests AutoML in the breast cancer (binary classification)"""
180 |         model = AutoML(
181 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
182 |         )
183 |         score = model.fit(breast_cancer.data, breast_cancer.target).score(
184 |             breast_cancer.data, breast_cancer.target
185 |         )
186 |         self.assertGreater(score, 0.5)
187 | 
188 |     def test_titatic_dataset(self):
189 |         """Tests AutoML in the titanic dataset (binary classification) with categorical features"""
190 |         data_folder = self.data_folder
191 |         automl = AutoML(
192 |             algorithms=["Xgboost"], mode="Explain", results_path=self.automl_dir
193 |         )
194 | 
195 |         df = pd.read_csv((data_folder / "Titanic/train.csv"))
196 | 
197 |         X = df[df.columns[2:]]
198 |         y = df["Survived"]
199 | 
200 |         automl.fit(X, y)
201 | 
202 |         test = pd.read_csv(data_folder / "Titanic/test_with_Survived.csv")
203 |         test_cols = [
204 |             "Parch",
205 |             "Ticket",
206 |             "Fare",
207 |             "Pclass",
208 |             "Name",
209 |             "Sex",
210 | 
"Age", 211 | "SibSp", 212 | "Cabin", 213 | "Embarked", 214 | ] 215 | score = automl.score(test[test_cols], test["Survived"]) 216 | self.assertGreater(score, 0.5) 217 | 218 | def test_score_without_y(self): 219 | """Tests the use of `score()` without passing y. Should raise AutoMLException""" 220 | model = AutoML( 221 | explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir 222 | ) 223 | # Assert than an Exception is raised 224 | with self.assertRaises(AutoMLException) as context: 225 | # Try to score without passing 'y' 226 | score = model.fit(breast_cancer.data, breast_cancer.target).score( 227 | breast_cancer.data 228 | ) 229 | 230 | self.assertTrue("y must be specified" in str(context.exception)) 231 | 232 | def test_no_constructor_args(self): 233 | """Tests the use of AutoML without passing any args. Should work without any arguments""" 234 | # Create model with no arguments 235 | model = AutoML() 236 | model.results_path = self.automl_dir 237 | # Assert than an Exception is raised 238 | score = model.fit(iris.data, iris.target).score(iris.data, iris.target) 239 | self.assertGreater(score, 0.5) 240 | 241 | def test_fit_returns_self(self): 242 | """Tests if the `fit()` method returns `self`. This allows to quickly implement one-liners with AutoML""" 243 | model = AutoML() 244 | model.results_path = self.automl_dir 245 | self.assertTrue( 246 | isinstance(model.fit(iris.data, iris.target), AutoML), 247 | "`fit()` method must return 'self'", 248 | ) 249 | 250 | def test_invalid_mode(self): 251 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 252 | param = {"mode": "invalid_mode"} 253 | model.set_params(**param) 254 | with self.assertRaises(ValueError) as context: 255 | model.fit(iris.data, iris.target) 256 | 257 | def test_invalid_ml_task(self): 258 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 259 | param = {"ml_task": "invalid_task"} 260 | model.set_params(**param) 261 | with self.assertRaises(ValueError) as context: 262 | model.fit(iris.data, iris.target) 263 | 264 | def test_invalid_results_path(self): 265 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 266 | param = {"results_path": 2} 267 | model.set_params(**param) 268 | with self.assertRaises(ValueError) as context: 269 | model.fit(iris.data, iris.target) 270 | 271 | def test_invalid_total_time_limit(self): 272 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 273 | param = {"total_time_limit": -1} 274 | model.set_params(**param) 275 | with self.assertRaises(ValueError) as context: 276 | model.fit(iris.data, iris.target) 277 | 278 | def test_invalid_model_time_limit(self): 279 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 280 | param = {"model_time_limit": -1} 281 | model.set_params(**param) 282 | with self.assertRaises(ValueError) as context: 283 | model.fit(iris.data, iris.target) 284 | 285 | def test_invalid_algorithm_name(self): 286 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 287 | param = {"algorithms": ["Baseline", "Neural Netrk"]} 288 | model.set_params(**param) 289 | with self.assertRaises(ValueError) as context: 290 | model.fit(iris.data, iris.target) 291 | 292 | def test_invalid_train_ensemble(self): 293 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 294 | param = {"train_ensemble": "not bool"} 295 | model.set_params(**param) 296 | with self.assertRaises(ValueError) as context: 297 | model.fit(iris.data, 
iris.target) 298 | 299 | def test_invalid_stack_models(self): 300 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 301 | param = {"stack_models": "not bool"} 302 | model.set_params(**param) 303 | with self.assertRaises(ValueError) as context: 304 | model.fit(iris.data, iris.target) 305 | 306 | def test_invalid_eval_metric(self): 307 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 308 | param = {"eval_metric": "not_real_metric"} 309 | model.set_params(**param) 310 | with self.assertRaises(ValueError) as context: 311 | model.fit(iris.data, iris.target) 312 | 313 | def test_invalid_validation_strategy(self): 314 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 315 | param = {"validation_strategy": "test"} 316 | model.set_params(**param) 317 | with self.assertRaises(ValueError) as context: 318 | model.fit(iris.data, iris.target) 319 | 320 | def test_invalid_verbose(self): 321 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 322 | param = {"verbose": -1} 323 | model.set_params(**param) 324 | with self.assertRaises(ValueError) as context: 325 | model.fit(iris.data, iris.target) 326 | 327 | def test_too_small_time_limit(self): 328 | rows = 1000000 329 | X = np.random.uniform(size=(rows, 100)) 330 | y = np.random.randint(0, 2, size=(rows,)) 331 | 332 | automl = AutoML( 333 | results_path=self.automl_dir, total_time_limit=1, train_ensemble=False 334 | ) 335 | with self.assertRaises(AutoMLException) as context: 336 | automl.fit(X, y) 337 | ``` -------------------------------------------------------------------------------- /supervised/utils/metric.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy as sp 8 | from sklearn.metrics import ( 9 | accuracy_score, 10 | average_precision_score, 11 | f1_score, 12 | log_loss, 13 | mean_absolute_error, 14 | mean_absolute_percentage_error, 15 | mean_squared_error, 16 | mean_squared_log_error, 17 | r2_score, 18 | roc_auc_score, 19 | ) 20 | 21 | 22 | def logloss(y_true, y_predicted, sample_weight=None): 23 | # convert predicted values to float32 to avoid warnings 24 | ll = log_loss(y_true, y_predicted.astype(np.float32), sample_weight=sample_weight) 25 | return ll 26 | 27 | 28 | def rmse(y_true, y_predicted, sample_weight=None): 29 | val = mean_squared_error(y_true, y_predicted, sample_weight=sample_weight) 30 | return np.sqrt(val) if val > 0 else -np.Inf 31 | 32 | 33 | def rmsle(y_true, y_predicted, sample_weight=None): 34 | val = mean_squared_log_error(y_true, y_predicted, sample_weight=sample_weight) 35 | return np.sqrt(val) if val > 0 else -np.Inf 36 | 37 | 38 | def negative_auc(y_true, y_predicted, sample_weight=None): 39 | val = roc_auc_score(y_true, y_predicted, sample_weight=sample_weight) 40 | return -1.0 * val 41 | 42 | 43 | def negative_r2(y_true, y_predicted, sample_weight=None): 44 | val = r2_score(y_true, y_predicted, sample_weight=sample_weight) 45 | return -1.0 * val 46 | 47 | 48 | def negative_f1(y_true, y_predicted, sample_weight=None): 49 | if isinstance(y_true, pd.DataFrame): 50 | y_true = np.array(y_true) 51 | if isinstance(y_predicted, pd.DataFrame): 52 | y_predicted = np.array(y_predicted) 53 | 54 | if len(y_predicted.shape) == 2 and y_predicted.shape[1] == 1: 55 | y_predicted = y_predicted.ravel() 56 | 57 | average = None 58 | if len(y_predicted.shape) == 1: 59 | 
y_predicted = (y_predicted > 0.5).astype(int) 60 | average = "binary" 61 | else: 62 | y_predicted = np.argmax(y_predicted, axis=1) 63 | average = "micro" 64 | 65 | val = f1_score(y_true, y_predicted, sample_weight=sample_weight, average=average) 66 | 67 | return -val 68 | 69 | 70 | def negative_accuracy(y_true, y_predicted, sample_weight=None): 71 | if isinstance(y_true, pd.DataFrame): 72 | y_true = np.array(y_true) 73 | if isinstance(y_predicted, pd.DataFrame): 74 | y_predicted = np.array(y_predicted) 75 | 76 | if len(y_predicted.shape) == 2 and y_predicted.shape[1] == 1: 77 | y_predicted = y_predicted.ravel() 78 | 79 | if len(y_predicted.shape) == 1: 80 | y_predicted = (y_predicted > 0.5).astype(int) 81 | else: 82 | y_predicted = np.argmax(y_predicted, axis=1) 83 | 84 | val = accuracy_score(y_true, y_predicted, sample_weight=sample_weight) 85 | 86 | return -val 87 | 88 | 89 | def negative_average_precision(y_true, y_predicted, sample_weight=None): 90 | if isinstance(y_true, pd.DataFrame): 91 | y_true = np.array(y_true) 92 | if isinstance(y_predicted, pd.DataFrame): 93 | y_predicted = np.array(y_predicted) 94 | 95 | val = average_precision_score(y_true, y_predicted, sample_weight=sample_weight) 96 | 97 | return -val 98 | 99 | 100 | def negative_spearman(y_true, y_predicted, sample_weight=None): 101 | # sample weight is ignored 102 | c, _ = sp.stats.spearmanr(y_true, y_predicted) 103 | return -c 104 | 105 | 106 | def spearman(y_true, y_predicted, sample_weight=None): 107 | # sample weight is ignored 108 | c, _ = sp.stats.spearmanr(y_true, y_predicted) 109 | return c 110 | 111 | 112 | def negative_pearson(y_true, y_predicted, sample_weight=None): 113 | # sample weight is ignored 114 | if isinstance(y_true, pd.DataFrame): 115 | y_true = np.array(y_true).ravel() 116 | if isinstance(y_predicted, pd.DataFrame): 117 | y_predicted = np.array(y_predicted).ravel() 118 | return -np.corrcoef(y_true, y_predicted)[0, 1] 119 | 120 | 121 | def pearson(y_true, y_predicted, sample_weight=None): 122 | return -negative_pearson(y_true, y_predicted, sample_weight) 123 | 124 | 125 | class MetricException(Exception): 126 | def __init__(self, message): 127 | Exception.__init__(self, message) 128 | log.error(message) 129 | 130 | 131 | def xgboost_eval_metric_r2(preds, dtrain): 132 | # Xgboost needs to minimize eval_metric 133 | target = dtrain.get_label() 134 | weight = dtrain.get_weight() 135 | if len(weight) == 0: 136 | weight = None 137 | return "r2", -r2_score(target, preds, sample_weight=weight) 138 | 139 | 140 | def xgboost_eval_metric_spearman(preds, dtrain): 141 | # Xgboost needs to minimize eval_metric 142 | target = dtrain.get_label() 143 | return "spearman", negative_spearman(target, preds) 144 | 145 | 146 | def xgboost_eval_metric_pearson(preds, dtrain): 147 | # Xgboost needs to minimize eval_metric 148 | target = dtrain.get_label() 149 | return "pearson", negative_pearson(target, preds) 150 | 151 | 152 | def xgboost_eval_metric_f1(preds, dtrain): 153 | # Xgboost needs to minimize eval_metric 154 | target = dtrain.get_label() 155 | weight = dtrain.get_weight() 156 | if len(weight) == 0: 157 | weight = None 158 | return "f1", negative_f1(target, preds, weight) 159 | 160 | 161 | def xgboost_eval_metric_average_precision(preds, dtrain): 162 | # Xgboost needs to minimize eval_metric 163 | target = dtrain.get_label() 164 | weight = dtrain.get_weight() 165 | if len(weight) == 0: 166 | weight = None 167 | return "average_precision", negative_average_precision(target, preds, weight) 168 | 169 | 170 | def 
xgboost_eval_metric_accuracy(preds, dtrain): 171 | # Xgboost needs to minimize eval_metric 172 | target = dtrain.get_label() 173 | weight = dtrain.get_weight() 174 | if len(weight) == 0: 175 | weight = None 176 | return "accuracy", negative_accuracy(target, preds, weight) 177 | 178 | 179 | def xgboost_eval_metric_mse(preds, dtrain): 180 | # Xgboost needs to minimize eval_metric 181 | target = dtrain.get_label() 182 | weight = dtrain.get_weight() 183 | if len(weight) == 0: 184 | weight = None 185 | return "mse", mean_squared_error(target, preds, sample_weight=weight) 186 | 187 | 188 | def lightgbm_eval_metric_r2(preds, dtrain): 189 | target = dtrain.get_label() 190 | weight = dtrain.get_weight() 191 | return "r2", r2_score(target, preds, sample_weight=weight), True 192 | 193 | 194 | def lightgbm_eval_metric_spearman(preds, dtrain): 195 | target = dtrain.get_label() 196 | return "spearman", -negative_spearman(target, preds), True 197 | 198 | 199 | def lightgbm_eval_metric_pearson(preds, dtrain): 200 | target = dtrain.get_label() 201 | return "pearson", -negative_pearson(target, preds), True 202 | 203 | 204 | def lightgbm_eval_metric_f1(preds, dtrain): 205 | target = dtrain.get_label() 206 | weight = dtrain.get_weight() 207 | 208 | unique_targets = np.unique(target) 209 | if len(unique_targets) > 2: 210 | cols = len(unique_targets) 211 | rows = int(preds.shape[0] / len(unique_targets)) 212 | preds = np.reshape(preds, (rows, cols), order="F") 213 | 214 | return "f1", -negative_f1(target, preds, weight), True 215 | 216 | 217 | def lightgbm_eval_metric_average_precision(preds, dtrain): 218 | target = dtrain.get_label() 219 | weight = dtrain.get_weight() 220 | 221 | return "average_precision", -negative_average_precision(target, preds, weight), True 222 | 223 | 224 | def lightgbm_eval_metric_accuracy(preds, dtrain): 225 | target = dtrain.get_label() 226 | weight = dtrain.get_weight() 227 | 228 | return "accuracy", -negative_accuracy(target, preds, weight), True 229 | 230 | 231 | class CatBoostEvalMetricSpearman(object): 232 | def get_final_error(self, error, weight): 233 | return error 234 | 235 | def is_max_optimal(self): 236 | return True 237 | 238 | def evaluate(self, approxes, target, weight): 239 | assert len(approxes) == 1 240 | assert len(target) == len(approxes[0]) 241 | 242 | preds = np.array(approxes[0]) 243 | target = np.array(target) 244 | 245 | return -negative_spearman(target, preds), 0 246 | 247 | 248 | class CatBoostEvalMetricPearson(object): 249 | def get_final_error(self, error, weight): 250 | return error 251 | 252 | def is_max_optimal(self): 253 | return True 254 | 255 | def evaluate(self, approxes, target, weight): 256 | assert len(approxes) == 1 257 | assert len(target) == len(approxes[0]) 258 | 259 | preds = np.array(approxes[0]) 260 | target = np.array(target) 261 | 262 | return -negative_pearson(target, preds), 0 263 | 264 | 265 | class CatBoostEvalMetricAveragePrecision(object): 266 | def get_final_error(self, error, weight): 267 | return error 268 | 269 | def is_max_optimal(self): 270 | return True 271 | 272 | def evaluate(self, approxes, target, weight): 273 | assert len(approxes) == 1 274 | assert len(target) == len(approxes[0]) 275 | 276 | preds = np.array(approxes[0]) 277 | target = np.array(target) 278 | if weight is not None: 279 | weight = np.array(weight) 280 | 281 | return -negative_average_precision(target, preds, weight), 0 282 | 283 | 284 | class CatBoostEvalMetricMSE(object): 285 | def get_final_error(self, error, weight): 286 | return error 287 | 288 | def 
is_max_optimal(self): 289 | return False 290 | 291 | def evaluate(self, approxes, target, weight): 292 | assert len(approxes) == 1 293 | assert len(target) == len(approxes[0]) 294 | 295 | preds = np.array(approxes[0]) 296 | target = np.array(target) 297 | if weight is not None: 298 | weight = np.array(weight) 299 | 300 | return mean_squared_error(target, preds, sample_weight=weight), 0 301 | 302 | 303 | class UserDefinedEvalMetric: 304 | # should always minimize 305 | eval_metric = mean_squared_error # set the default 306 | 307 | def set_metric(self, feval): 308 | UserDefinedEvalMetric.eval_metric = feval 309 | 310 | def __call__(self, y_true, y_predicted, sample_weight=None): 311 | return UserDefinedEvalMetric.eval_metric(y_true, y_predicted, sample_weight) 312 | 313 | 314 | def xgboost_eval_metric_user_defined(preds, dtrain): 315 | target = dtrain.get_label() 316 | weight = dtrain.get_weight() 317 | if len(weight) == 0: 318 | weight = None 319 | metric = UserDefinedEvalMetric() 320 | return "user_defined_metric", metric(target, preds, sample_weight=weight) 321 | 322 | 323 | def lightgbm_eval_metric_user_defined(preds, dtrain): 324 | target = dtrain.get_label() 325 | weight = dtrain.get_weight() 326 | metric = UserDefinedEvalMetric() 327 | return "user_defined_metric", metric(target, preds, sample_weight=weight), False 328 | 329 | 330 | class CatBoostEvalMetricUserDefined(object): 331 | def get_final_error(self, error, weight): 332 | return error 333 | 334 | def is_max_optimal(self): 335 | return False 336 | 337 | def evaluate(self, approxes, target, weight): 338 | assert len(approxes) == 1 339 | assert len(target) == len(approxes[0]) 340 | 341 | preds = np.array(approxes[0]) 342 | target = np.array(target) 343 | if weight is not None: 344 | weight = np.array(weight) 345 | 346 | metric = UserDefinedEvalMetric() 347 | return metric(target, preds, sample_weight=weight), 0 348 | 349 | 350 | class Metric(object): 351 | def __init__(self, params): 352 | if params is None: 353 | raise MetricException("Metric params not defined") 354 | self.params = params 355 | self.name = self.params.get("name") 356 | if self.name is None: 357 | raise MetricException("Metric name not defined") 358 | 359 | self.minimize_direction = self.name in [ 360 | "logloss", 361 | "auc", # negative auc 362 | "rmse", 363 | "mae", 364 | "mse", 365 | "r2", # negative r2 366 | "mape", 367 | "spearman", # negative 368 | "pearson", # negative 369 | "f1", # negative 370 | "average_precision", # negative 371 | "accuracy", # negative 372 | "user_defined_metric", 373 | ] 374 | if self.name == "logloss": 375 | self.metric = logloss 376 | elif self.name == "auc": 377 | self.metric = negative_auc 378 | elif self.name == "acc": 379 | self.metric = accuracy_score 380 | elif self.name == "rmse": 381 | self.metric = rmse 382 | elif self.name == "mse": 383 | self.metric = mean_squared_error 384 | elif self.name == "mae": 385 | self.metric = mean_absolute_error 386 | elif self.name == "r2": 387 | self.metric = negative_r2 388 | elif self.name == "mape": 389 | self.metric = mean_absolute_percentage_error 390 | elif self.name == "spearman": 391 | self.metric = negative_spearman 392 | elif self.name == "pearson": 393 | self.metric = negative_pearson 394 | elif self.name == "f1": 395 | self.metric = negative_f1 396 | elif self.name == "average_precision": 397 | self.metric = negative_average_precision 398 | elif self.name == "accuracy": 399 | self.metric = negative_accuracy 400 | elif self.name == "user_defined_metric": 401 | self.metric = 
UserDefinedEvalMetric.eval_metric 402 | # elif self.name == "rmsle": # need to update target preprocessing 403 | # self.metric = rmsle # to assure that target is not negative ... 404 | else: 405 | raise MetricException(f"Unknown metric '{self.name}'") 406 | 407 | def __call__(self, y_true, y_predicted, sample_weight=None): 408 | return self.metric(y_true, y_predicted, sample_weight=sample_weight) 409 | 410 | def improvement(self, previous, current): 411 | if self.minimize_direction: 412 | return current < previous 413 | return current > previous 414 | 415 | def get_maximum(self): 416 | if self.minimize_direction: 417 | return 10e12 418 | else: 419 | return -10e12 420 | 421 | def worst_value(self): 422 | if self.minimize_direction: 423 | return np.Inf 424 | return -np.Inf 425 | 426 | def get_minimize_direction(self): 427 | return self.minimize_direction 428 | 429 | def is_negative(self): 430 | return self.name in [ 431 | "auc", 432 | "r2", 433 | "spearman", 434 | "pearson", 435 | "f1", 436 | "average_precision", 437 | "accuracy", 438 | ] 439 | 440 | @staticmethod 441 | def optimize_negative(metric_name): 442 | return metric_name in [ 443 | "auc", 444 | "r2", 445 | "spearman", 446 | "pearson", 447 | "f1", 448 | "average_precision", 449 | "accuracy", 450 | ] 451 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/catboost.py: -------------------------------------------------------------------------------- ```python 1 | import copy 2 | import logging 3 | import time 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | 9 | from supervised.algorithms.algorithm import BaseAlgorithm 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 17 | from supervised.utils.config import LOG_LEVEL 18 | from supervised.utils.metric import ( 19 | CatBoostEvalMetricAveragePrecision, 20 | CatBoostEvalMetricMSE, 21 | CatBoostEvalMetricPearson, 22 | CatBoostEvalMetricSpearman, 23 | CatBoostEvalMetricUserDefined, 24 | ) 25 | 26 | logger = logging.getLogger(__name__) 27 | logger.setLevel(LOG_LEVEL) 28 | 29 | import catboost 30 | from catboost import CatBoostClassifier, CatBoostRegressor, Pool 31 | 32 | 33 | def catboost_eval_metric(ml_task, eval_metric): 34 | if eval_metric == "user_defined_metric": 35 | return eval_metric 36 | metric_name_mapping = { 37 | BINARY_CLASSIFICATION: { 38 | "auc": "AUC", 39 | "logloss": "Logloss", 40 | "f1": "F1", 41 | "average_precision": "average_precision", 42 | "accuracy": "Accuracy", 43 | }, 44 | MULTICLASS_CLASSIFICATION: { 45 | "logloss": "MultiClass", 46 | "f1": "TotalF1:average=Micro", 47 | "accuracy": "Accuracy", 48 | }, 49 | REGRESSION: { 50 | "rmse": "RMSE", 51 | "mse": "mse", 52 | "mae": "MAE", 53 | "mape": "MAPE", 54 | "r2": "R2", 55 | "spearman": "spearman", 56 | "pearson": "pearson", 57 | }, 58 | } 59 | return metric_name_mapping[ml_task][eval_metric] 60 | 61 | 62 | def catboost_objective(ml_task, eval_metric): 63 | objective = "RMSE" 64 | if ml_task == BINARY_CLASSIFICATION: 65 | objective = "Logloss" 66 | elif ml_task == MULTICLASS_CLASSIFICATION: 67 | objective = "MultiClass" 68 | else: # ml_task == REGRESSION 69 | objective = catboost_eval_metric(REGRESSION, eval_metric) 70 | if objective in [ 71 | "mse", 72 | "R2", 73 | "spearman", 74 | "pearson", 75 | 
"user_defined_metric", 76 | ]: # cant optimize them directly 77 | objective = "RMSE" 78 | return objective 79 | 80 | 81 | class CatBoostAlgorithm(BaseAlgorithm): 82 | algorithm_name = "CatBoost" 83 | algorithm_short_name = "CatBoost" 84 | warmup_iterations = 20 85 | 86 | def __init__(self, params): 87 | super(CatBoostAlgorithm, self).__init__(params) 88 | self.library_version = catboost.__version__ 89 | self.snapshot_file_path = "training_snapshot" 90 | 91 | self.explain_level = params.get("explain_level", 0) 92 | self.rounds = additional.get("max_rounds", 10000) 93 | self.max_iters = 1 94 | self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) 95 | 96 | Algo = CatBoostClassifier 97 | loss_function = "Logloss" 98 | if self.params["ml_task"] == BINARY_CLASSIFICATION: 99 | loss_function = self.params.get("loss_function", "Logloss") 100 | elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: 101 | loss_function = self.params.get("loss_function", "MultiClass") 102 | elif self.params["ml_task"] == REGRESSION: 103 | loss_function = self.params.get("loss_function", "RMSE") 104 | Algo = CatBoostRegressor 105 | 106 | cat_params = { 107 | "iterations": self.params.get("num_boost_round", self.rounds), 108 | "learning_rate": self.params.get("learning_rate", 0.1), 109 | "depth": self.params.get("depth", 3), 110 | "rsm": self.params.get("rsm", 1.0), 111 | "l2_leaf_reg": self.params.get("l2_leaf_reg", 3.0), 112 | "random_strength": self.params.get("random_strength", 1.0), 113 | "loss_function": loss_function, 114 | "eval_metric": self.params.get("eval_metric", loss_function), 115 | # "custom_metric": self.params.get("eval_metric", loss_function), 116 | "thread_count": self.params.get("n_jobs", -1), 117 | "verbose": False, 118 | "allow_writing_files": False, 119 | "random_seed": self.params.get("seed", 1), 120 | } 121 | 122 | for extra_param in [ 123 | "min_data_in_leaf", 124 | "bootstrap_type", 125 | "bagging_temperature", 126 | "subsample", 127 | "border_count", 128 | ]: 129 | if extra_param in self.params: 130 | cat_params[extra_param] = self.params[extra_param] 131 | 132 | self.log_metric_name = cat_params["eval_metric"] 133 | if cat_params["eval_metric"] == "spearman": 134 | cat_params["eval_metric"] = CatBoostEvalMetricSpearman() 135 | self.log_metric_name = "CatBoostEvalMetricSpearman" 136 | elif cat_params["eval_metric"] == "pearson": 137 | cat_params["eval_metric"] = CatBoostEvalMetricPearson() 138 | self.log_metric_name = "CatBoostEvalMetricPearson" 139 | elif cat_params["eval_metric"] == "average_precision": 140 | cat_params["eval_metric"] = CatBoostEvalMetricAveragePrecision() 141 | self.log_metric_name = "CatBoostEvalMetricAveragePrecision" 142 | elif cat_params["eval_metric"] == "mse": 143 | cat_params["eval_metric"] = CatBoostEvalMetricMSE() 144 | self.log_metric_name = "CatBoostEvalMetricMSE" 145 | elif cat_params["eval_metric"] == "user_defined_metric": 146 | cat_params["eval_metric"] = CatBoostEvalMetricUserDefined() 147 | self.log_metric_name = "CatBoostEvalMetricUserDefined" 148 | 149 | self.model = Algo(**cat_params) 150 | self.cat_features = None 151 | self.best_ntree_limit = 0 152 | 153 | logger.debug("CatBoostAlgorithm.__init__") 154 | 155 | def _assess_iterations(self, X, y, sample_weight, eval_set, max_time=None): 156 | if max_time is None: 157 | max_time = 3600 158 | try: 159 | model = copy.deepcopy(self.model) 160 | model.set_params(iterations=self.warmup_iterations) 161 | start_time = time.time() 162 | model.fit( 163 | X, 164 | y, 165 | 
sample_weight=sample_weight,
166 |                 cat_features=self.cat_features,
167 |                 init_model=None if self.model.tree_count_ is None else self.model,
168 |                 eval_set=eval_set,
169 |                 early_stopping_rounds=self.early_stopping_rounds,
170 |                 verbose_eval=False,
171 |             )
172 |             elapsed_time = (time.time() - start_time) / float(self.warmup_iterations)
173 |             # print(max_time, elapsed_time, max_time / elapsed_time, np.round(time.time() - start_time, 2))
174 |             new_rounds = int(min(10000, max_time / elapsed_time))
175 |             new_rounds = max(new_rounds, 10)
176 |             return model, new_rounds
177 |         except Exception as e:
178 |             # print(str(e))
179 |             return None, 1000
180 | 
181 |     def fit(
182 |         self,
183 |         X,
184 |         y,
185 |         sample_weight=None,
186 |         X_validation=None,
187 |         y_validation=None,
188 |         sample_weight_validation=None,
189 |         log_to_file=None,
190 |         max_time=None,
191 |     ):
192 |         if self.is_fitted():
193 |             print("CatBoost model already fitted. Skip fit().")
194 |             return
195 | 
196 |         if self.cat_features is None:
197 |             self.cat_features = []
198 |             for i in range(X.shape[1]):
199 |                 if PreprocessingUtils.is_categorical(X.iloc[:, i]):
200 |                     self.cat_features += [i]
201 |                     col_name = X.columns[i]
202 |                     X[col_name] = X[col_name].astype(str)
203 |                     if X_validation is not None:
204 |                         X_validation[col_name] = X_validation[col_name].astype(str)
205 | 
206 |         eval_set = None
207 |         if X_validation is not None and y_validation is not None:
208 |             eval_set = Pool(
209 |                 data=X_validation,
210 |                 label=y_validation,
211 |                 cat_features=self.cat_features,
212 |                 weight=sample_weight_validation,
213 |             )
214 | 
215 |         if self.params.get("num_boost_round") is None:
216 |             model_init, new_iterations = self._assess_iterations(
217 |                 X, y, sample_weight, eval_set, max_time
218 |             )
219 |             self.model.set_params(iterations=new_iterations)
220 |         else:
221 |             model_init = None
222 |             self.model.set_params(iterations=self.params.get("num_boost_round"))
223 |             self.early_stopping_rounds = self.params.get("early_stopping_rounds", 50)
224 | 
225 |         self.model.fit(
226 |             X,
227 |             y,
228 |             sample_weight=sample_weight,
229 |             cat_features=self.cat_features,
230 |             init_model=model_init,
231 |             eval_set=eval_set,
232 |             early_stopping_rounds=self.early_stopping_rounds,
233 |             verbose_eval=False,
234 |         )
235 | 
236 |         if self.model.best_iteration_ is not None:
237 |             if model_init is not None:
238 |                 self.best_ntree_limit = (
239 |                     self.model.best_iteration_ + model_init.tree_count_ + 1
240 |                 )
241 |             else:
242 |                 self.best_ntree_limit = self.model.best_iteration_ + 1
243 | 
244 |         else:
245 |             # just take all the trees
246 |             # the warm-up trees are already included
247 |             # dont need to add +1
248 |             self.best_ntree_limit = self.model.tree_count_
249 | 
250 |         if log_to_file is not None:
251 |             train_scores = self.model.evals_result_["learn"].get(self.log_metric_name)
252 |             validation_scores = self.model.evals_result_["validation"].get(
253 |                 self.log_metric_name
254 |             )
255 |             if model_init is not None:
256 |                 if train_scores is not None:
257 |                     train_scores = (
258 |                         model_init.evals_result_["learn"].get(self.log_metric_name)
259 |                         + train_scores
260 |                     )
261 |                 if validation_scores is not None:
262 |                     validation_scores = (
263 |                         model_init.evals_result_["validation"].get(self.log_metric_name)
264 |                         + validation_scores
265 |                     )
266 |             iteration = None
267 |             if train_scores is not None:
268 |                 iteration = range(len(train_scores))
269 |             elif validation_scores is not None:
270 |                 iteration = range(len(validation_scores))
271 | 
272 |             result = pd.DataFrame(
273 |                 {
274 |                     "iteration": iteration,
275 | "train": train_scores, 276 | "validation": validation_scores, 277 | } 278 | ) 279 | result.to_csv(log_to_file, index=False, header=False) 280 | 281 | if self.params["ml_task"] != REGRESSION: 282 | self.classes_ = np.unique(y) 283 | 284 | def is_fitted(self): 285 | return self.model is not None and self.model.tree_count_ is not None 286 | 287 | def predict(self, X): 288 | self.reload() 289 | if self.params["ml_task"] == BINARY_CLASSIFICATION: 290 | return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)[:, 1] 291 | elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: 292 | return self.model.predict_proba(X, ntree_end=self.best_ntree_limit) 293 | 294 | return self.model.predict(X, ntree_end=self.best_ntree_limit) 295 | 296 | def copy(self): 297 | return copy.deepcopy(self) 298 | 299 | def save(self, model_file_path): 300 | self.model.save_model(model_file_path) 301 | self.model_file_path = model_file_path 302 | logger.debug("CatBoostAlgorithm save model to %s" % model_file_path) 303 | 304 | def load(self, model_file_path): 305 | logger.debug("CatBoostLearner load model from %s" % model_file_path) 306 | 307 | # waiting for fix https://github.com/catboost/catboost/issues/696 308 | Algo = CatBoostClassifier 309 | if self.params["ml_task"] == REGRESSION: 310 | Algo = CatBoostRegressor 311 | 312 | # loading might throw warnings in the case of custom eval_metric 313 | # check https://github.com/catboost/catboost/issues/1169 314 | self.model = Algo().load_model(model_file_path) 315 | self.model_file_path = model_file_path 316 | 317 | def file_extension(self): 318 | return "catboost" 319 | 320 | def get_metric_name(self): 321 | metric = self.params.get("eval_metric") 322 | if metric is None: 323 | return None 324 | if metric == "Logloss": 325 | return "logloss" 326 | elif metric == "AUC": 327 | return "auc" 328 | elif metric == "MultiClass": 329 | return "logloss" 330 | elif metric == "RMSE": 331 | return "rmse" 332 | elif metric == "MSE": 333 | return "mse" 334 | elif metric == "MAE": 335 | return "mae" 336 | elif metric == "MAPE": 337 | return "mape" 338 | elif metric in ["F1", "TotalF1:average=Micro"]: 339 | return "f1" 340 | elif metric == "Accuracy": 341 | return "accuracy" 342 | return metric 343 | 344 | 345 | classification_params = { 346 | "learning_rate": [0.025, 0.05, 0.1, 0.2], 347 | "depth": [4, 5, 6, 7, 8, 9], 348 | "rsm": [0.7, 0.8, 0.9, 1], # random subspace method 349 | "loss_function": ["Logloss"], 350 | } 351 | 352 | classification_default_params = { 353 | "learning_rate": 0.1, 354 | "depth": 6, 355 | "rsm": 1, 356 | "loss_function": "Logloss", 357 | } 358 | 359 | additional = { 360 | "max_rounds": 10000, 361 | "early_stopping_rounds": 50, 362 | "max_rows_limit": None, 363 | "max_cols_limit": None, 364 | } 365 | required_preprocessing = [ 366 | "missing_values_inputation", 367 | "datetime_transform", 368 | "text_transform", 369 | "target_as_integer", 370 | ] 371 | 372 | 373 | class CBClassifier(ClassifierMixin, CatBoostAlgorithm): 374 | pass 375 | 376 | 377 | AlgorithmsRegistry.add( 378 | BINARY_CLASSIFICATION, 379 | CBClassifier, 380 | classification_params, 381 | required_preprocessing, 382 | additional, 383 | classification_default_params, 384 | ) 385 | 386 | multiclass_classification_params = copy.deepcopy(classification_params) 387 | multiclass_classification_params["loss_function"] = ["MultiClass"] 388 | multiclass_classification_params["depth"] = [3, 4, 5, 6] 389 | multiclass_classification_params["learning_rate"] = [0.1, 0.15, 0.2] 390 | 391 | 
multiclass_classification_default_params = copy.deepcopy(classification_default_params) 392 | multiclass_classification_default_params["loss_function"] = "MultiClass" 393 | multiclass_classification_default_params["depth"] = 5 394 | multiclass_classification_default_params["learning_rate"] = 0.15 395 | 396 | 397 | AlgorithmsRegistry.add( 398 | MULTICLASS_CLASSIFICATION, 399 | CBClassifier, 400 | multiclass_classification_params, 401 | required_preprocessing, 402 | additional, 403 | multiclass_classification_default_params, 404 | ) 405 | 406 | regression_params = copy.deepcopy(classification_params) 407 | regression_params["loss_function"] = ["RMSE", "MAE", "MAPE"] 408 | 409 | regression_required_preprocessing = [ 410 | "missing_values_inputation", 411 | "datetime_transform", 412 | "text_transform", 413 | "target_scale", 414 | ] 415 | 416 | 417 | regression_default_params = { 418 | "learning_rate": 0.1, 419 | "depth": 6, 420 | "rsm": 1, 421 | "loss_function": "RMSE", 422 | } 423 | 424 | 425 | class CBRegressor(RegressorMixin, CatBoostAlgorithm): 426 | pass 427 | 428 | 429 | AlgorithmsRegistry.add( 430 | REGRESSION, 431 | CBRegressor, 432 | regression_params, 433 | regression_required_preprocessing, 434 | additional, 435 | regression_default_params, 436 | ) 437 | ``` -------------------------------------------------------------------------------- /supervised/fairness/optimization.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | 4 | class FairnessOptimization: 5 | @staticmethod 6 | def binary_classification( 7 | target, 8 | predicted_labels, 9 | sensitive_features, 10 | fairness_metric, 11 | fairness_threshold, 12 | privileged_groups=[], 13 | underprivileged_groups=[], 14 | previous_fairness_optimization=None, 15 | min_selection_rate=None, 16 | max_selection_rate=None, 17 | ): 18 | target = np.array(target).ravel() 19 | preds = np.array(predicted_labels) 20 | 21 | # fairness optimization stats 22 | sensitive_values = {} 23 | for col in sensitive_features.columns: 24 | col_name = col[10:] # skip 'senstive_' 25 | values = list(sensitive_features[col].unique()) 26 | sensitive_values[col] = values 27 | 28 | for v in values: 29 | ii = sensitive_features[col] == v 30 | 31 | new_sensitive_values = {} 32 | for k, prev_values in sensitive_values.items(): 33 | if k == col: 34 | continue 35 | new_sensitive_values[f"{k}@{col}"] = [] 36 | for v in values: 37 | for pv in prev_values: 38 | if isinstance(pv, tuple): 39 | new_sensitive_values[f"{k}@{col}"] += [(*pv, v)] 40 | else: 41 | new_sensitive_values[f"{k}@{col}"] += [(pv, v)] 42 | 43 | sensitive_values = {**sensitive_values, **new_sensitive_values} 44 | 45 | # print(sensitive_values) 46 | 47 | sensitive_indices = {} 48 | for k, values_list in sensitive_values.items(): 49 | if k.count("@") == sensitive_features.shape[1] - 1: 50 | # print(k) 51 | # print("values_list",values_list) 52 | cols = k.split("@") 53 | for values in values_list: 54 | if not isinstance(values, tuple): 55 | values = (values,) 56 | # print("values", values) 57 | 58 | ii = None 59 | for i, c in enumerate(cols): 60 | if ii is None: 61 | ii = sensitive_features[c] == values[i] 62 | else: 63 | ii &= sensitive_features[c] == values[i] 64 | 65 | key = "@".join([str(s) for s in values]) 66 | # print(key, np.sum(ii)) 67 | sensitive_indices[key] = ii 68 | 69 | total_dp_ratio = min_selection_rate / max_selection_rate 70 | # print("total dp ratio", total_dp_ratio) 71 | 72 | c0 = np.sum(target == 0) 73 | c1 = 
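The `_assess_iterations` helper above sizes the boosting budget by timing a short warm-up fit and projecting how many rounds fit into `max_time`. A minimal standalone sketch of that heuristic (the function and its arguments are illustrative, not part of the library):

```python
import time


def estimate_rounds(train_one_round, max_time, warmup_iterations=20):
    """Project how many boosting rounds fit into max_time seconds."""
    start = time.time()
    for _ in range(warmup_iterations):
        train_one_round()  # stand-in for fitting warmup_iterations trees
    per_round = (time.time() - start) / float(warmup_iterations)
    # Same clamping as above: at most 10000 rounds, at least 10.
    return max(int(min(10000, max_time / per_round)), 10)


# If one round takes ~0.05 s and the budget is 60 s, this prints roughly 1200.
print(estimate_rounds(lambda: time.sleep(0.05), max_time=60))
```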

--------------------------------------------------------------------------------
/supervised/fairness/optimization.py:
--------------------------------------------------------------------------------

```python
import numpy as np


class FairnessOptimization:
    @staticmethod
    def binary_classification(
        target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
        min_selection_rate=None,
        max_selection_rate=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predicted_labels)

        # fairness optimization stats
        sensitive_values = {}
        for col in sensitive_features.columns:
            col_name = col[10:]  # skip 'sensitive_'
            values = list(sensitive_features[col].unique())
            sensitive_values[col] = values

            for v in values:
                ii = sensitive_features[col] == v

            new_sensitive_values = {}
            for k, prev_values in sensitive_values.items():
                if k == col:
                    continue
                new_sensitive_values[f"{k}@{col}"] = []
                for v in values:
                    for pv in prev_values:
                        if isinstance(pv, tuple):
                            new_sensitive_values[f"{k}@{col}"] += [(*pv, v)]
                        else:
                            new_sensitive_values[f"{k}@{col}"] += [(pv, v)]

            sensitive_values = {**sensitive_values, **new_sensitive_values}

        # print(sensitive_values)

        sensitive_indices = {}
        for k, values_list in sensitive_values.items():
            if k.count("@") == sensitive_features.shape[1] - 1:
                # print(k)
                # print("values_list", values_list)
                cols = k.split("@")
                for values in values_list:
                    if not isinstance(values, tuple):
                        values = (values,)
                    # print("values", values)

                    ii = None
                    for i, c in enumerate(cols):
                        if ii is None:
                            ii = sensitive_features[c] == values[i]
                        else:
                            ii &= sensitive_features[c] == values[i]

                    key = "@".join([str(s) for s in values])
                    # print(key, np.sum(ii))
                    sensitive_indices[key] = ii

        total_dp_ratio = min_selection_rate / max_selection_rate
        # print("total dp ratio", total_dp_ratio)

        c0 = np.sum(target == 0)
        c1 = np.sum(target == 1)

        selection_rates = {}
        weights = {}

        for key, indices in sensitive_indices.items():
            selection_rates[key] = np.sum((preds == 1) & indices) / np.sum(indices)
            # print(key, np.sum(indices), selection_rates[key])

            t = np.sum(indices)
            t0 = np.sum(indices & (target == 0))
            t1 = np.sum(indices & (target == 1))

            w0 = t / target.shape[0] * c0 / t0
            w1 = t / target.shape[0] * c1 / t1

            # print("----", key, w0, w1, t, t0, t1)
            weights[key] = [w0, w1]

        max_selection_rate = np.max(list(selection_rates.values()))
        min_selection_rate = np.min(list(selection_rates.values()))

        for k, v in selection_rates.items():
            selection_rates[k] = v / max_selection_rate

        # print("previous fairness optimization")
        # print(previous_fairness_optimization)
        # print("********")

        previous_weights = {}
        if previous_fairness_optimization is not None:
            weights = previous_fairness_optimization.get("weights")
            for key, indices in sensitive_indices.items():
                # print("Previous")
                # print(previous_fairness_optimization["selection_rates"][key], selection_rates[key])

                direction = 0.0
                if (
                    previous_fairness_optimization["selection_rates"][key]
                    < selection_rates[key]
                ):
                    # print("Improvement")
                    direction = 1.0
                elif selection_rates[key] > 0.8:
                    # print("GOOD")
                    direction = 0.0
                else:
                    # print("Decrease")
                    direction = -0.5

                # use the previous weights instead of the default 1.0
                prev_weights = previous_fairness_optimization.get(
                    "previous_weights", {}
                ).get(key, [1, 1])
                # print("prev_weights", prev_weights)
                delta0 = weights[key][0] - prev_weights[0]
                delta1 = weights[key][1] - prev_weights[1]

                previous_weights[key] = [weights[key][0], weights[key][1]]

                # print("BEFORE")
                # print(weights[key])
                weights[key][0] += direction * delta0
                weights[key][1] += direction * delta1
                # print("AFTER")
                # print(weights[key])
                # print(previous_fairness_optimization["weights"][key])

        step = None
        if previous_fairness_optimization is not None:
            step = previous_fairness_optimization.get("step")

        if step is None:
            step = 0
        else:
            step += 1

        return {
            "selection_rates": selection_rates,
            "previous_weights": previous_weights,
            "weights": weights,
            "total_dp_ratio": total_dp_ratio,
            "step": step,
            "fairness_threshold": fairness_threshold,
        }

    @staticmethod
    def regression(
        target,
        predictions,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
        performance_metric=None,
        performance_metric_name=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predictions)

        # fairness optimization stats
        sensitive_values = {}
        for col in sensitive_features.columns:
            col_name = col[10:]  # skip 'sensitive_'
            values = list(sensitive_features[col].unique())
            sensitive_values[col] = values

            for v in values:
                ii = sensitive_features[col] == v

            new_sensitive_values = {}
            for k, prev_values in sensitive_values.items():
                if k == col:
                    continue
                new_sensitive_values[f"{k}@{col}"] = []
                for v in values:
                    for pv in prev_values:
                        if isinstance(pv, tuple):
                            new_sensitive_values[f"{k}@{col}"] += [(*pv, v)]
                        else:
                            new_sensitive_values[f"{k}@{col}"] += [(pv, v)]

            sensitive_values = {**sensitive_values, **new_sensitive_values}

        sensitive_indices = {}
        least_frequent_key = None
        least_frequency = sensitive_features.shape[0]
        for k, values_list in sensitive_values.items():
            if k.count("@") == sensitive_features.shape[1] - 1:
                cols = k.split("@")
                for values in values_list:
                    if not isinstance(values, tuple):
                        values = (values,)

                    ii = None
                    for i, c in enumerate(cols):
                        if ii is None:
                            ii = sensitive_features[c] == values[i]
                        else:
                            ii &= sensitive_features[c] == values[i]

                    key = "@".join([str(s) for s in values])
                    if np.sum(ii) > 0:
                        sensitive_indices[key] = ii
                        if np.sum(ii) < least_frequency:
                            least_frequency = np.sum(ii)
                            least_frequent_key = key

        weights = {}
        performance = {}

        for key, indices in sensitive_indices.items():
            w = target.shape[0] / len(sensitive_indices) / np.sum(indices)
            weights[key] = w
            performance[key] = performance_metric(target[indices], predictions[indices])

        # upweight the least frequent group a bit more
        weights[least_frequent_key] *= 1.5

        denominator = np.max(list(performance.values()))
        new_performance = {}
        for k, v in performance.items():
            new_performance[k] = np.round(v / denominator, 4)
        performance = new_performance

        previous_weights = {}
        if previous_fairness_optimization is not None:
            weights = previous_fairness_optimization.get("weights")
            for key, indices in sensitive_indices.items():
                direction = 0.0
                if (
                    previous_fairness_optimization["performance"][key]
                    < performance[key]
                ):
                    direction = 1.0
                elif performance[key] > fairness_threshold:
                    direction = 0.0
                else:
                    direction = -0.5

                # use the previous weights instead of the default 1.0
                prev_weights = previous_fairness_optimization.get(
                    "previous_weights", {}
                ).get(key, 1)
                delta0 = weights[key] - prev_weights
                previous_weights[key] = weights[key]
                weights[key] = max(weights[key] + direction * delta0, 0.01)

        no_weights_change = False
        if str(previous_weights) == str(weights):
            no_weights_change = True

        step = None
        if previous_fairness_optimization is not None:
            step = previous_fairness_optimization.get("step")

        if step is None:
            step = 0
        else:
            if not no_weights_change:
                step += 1

        return {
            "performance": performance,
            "previous_weights": previous_weights,
            "weights": weights,
            "step": step,
            "fairness_threshold": fairness_threshold,
        }
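
    # The weight-update rule shared by the methods in this class: with w_prev
    # the group's weight from the previous step and w its current weight,
    #
    #     w_next = w + direction * (w - w_prev)
    #
    # where direction is +1.0 when the group's metric improved since the last
    # step, 0.0 when the group is already above the acceptance level, and
    # -0.5 to back off otherwise (the regression variant also floors the
    # weight at 0.01).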

    @staticmethod
    def multiclass_classification(
        target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predicted_labels)
        target_values = list(np.unique(target))

        # fairness optimization stats
        sensitive_values = {}
        for col in sensitive_features.columns:
            col_name = col[10:]  # skip 'sensitive_'
            values = list(sensitive_features[col].unique())
            sensitive_values[col] = values
            for v in values:
                ii = sensitive_features[col] == v
            new_sensitive_values = {}
            for k, prev_values in sensitive_values.items():
                if k == col:
                    continue
                new_sensitive_values[f"{k}@{col}"] = []
                for v in values:
                    for pv in prev_values:
                        if isinstance(pv, tuple):
                            new_sensitive_values[f"{k}@{col}"] += [(*pv, v)]
                        else:
                            new_sensitive_values[f"{k}@{col}"] += [(pv, v)]

            sensitive_values = {**sensitive_values, **new_sensitive_values}

        sensitive_indices = {}
        for k, values_list in sensitive_values.items():
            if k.count("@") == sensitive_features.shape[1] - 1:
                cols = k.split("@")
                for values in values_list:
                    if not isinstance(values, tuple):
                        values = (values,)

                    ii = None
                    for i, c in enumerate(cols):
                        if ii is None:
                            ii = sensitive_features[c] == values[i]
                        else:
                            ii &= sensitive_features[c] == values[i]

                    key = "@".join([str(s) for s in values])
                    sensitive_indices[key] = ii

        cs = {}
        for t in target_values:
            cs[t] = np.sum(target == t)
        selection_rates = {}
        weights = {}

        for key, indices in sensitive_indices.items():
            weights[key] = []
            sv = np.sum(indices)
            selection_rates[key] = {}
            for t in target_values:
                selection_rates[key][t] = np.sum((preds == t) & indices) / np.sum(
                    indices
                )

                t_k = np.sum(indices & (target == t))
                w_k = sv / target.shape[0] * cs[t] / t_k
                weights[key] += [w_k]

        for t in target_values:
            values = []
            for k, v in selection_rates.items():
                values += [v[t]]
            max_selection_rate = np.max(values)
            for k, v in selection_rates.items():
                v[t] /= max_selection_rate

        previous_weights = {}
        if previous_fairness_optimization is not None:
            weights = previous_fairness_optimization.get("weights")
            for key, indices in sensitive_indices.items():
                previous_weights[key] = [1] * len(target_values)
                for i, t in enumerate(target_values):
                    direction = 0.0
                    if (
                        previous_fairness_optimization["selection_rates"][key][t]
                        < selection_rates[key][t]
                    ):
                        direction = 1.0
                    elif selection_rates[key][t] > 0.8:
                        direction = 0.0
                    else:
                        direction = -0.5

                    # use the previous weights instead of the default 1.0
                    prev_weights = previous_fairness_optimization.get(
                        "previous_weights", {}
                    ).get(key, [1] * len(target_values))

                    delta_i = weights[key][i] - prev_weights[i]

                    previous_weights[key][i] = weights[key][i]

                    weights[key][i] += direction * delta_i

        step = None
        if previous_fairness_optimization is not None:
            step = previous_fairness_optimization.get("step")

        if step is None:
            step = 0
        else:
            step += 1

        return {
            "selection_rates": selection_rates,
            "previous_weights": previous_weights,
            "weights": weights,
            "step": step,
            "fairness_threshold": fairness_threshold,
            "target_values": target_values,
        }
```
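A quick way to inspect what the first optimization step returns, on toy data; the `sensitive_` column-name prefix matches the convention the code above strips with `col[10:]`, and the selection-rate bounds would normally be pre-computed elsewhere in the pipeline:

```python
import numpy as np
import pandas as pd

from supervised.fairness.optimization import FairnessOptimization

target = np.array([0, 1, 0, 1, 1, 0, 1, 0])
preds = np.array([0, 1, 0, 0, 1, 0, 1, 1])
sensitive = pd.DataFrame({"sensitive_sex": ["F", "M", "F", "F", "M", "M", "M", "F"]})

result = FairnessOptimization.binary_classification(
    target,
    preds,
    sensitive,
    fairness_metric="demographic_parity_ratio",
    fairness_threshold=0.8,
    min_selection_rate=0.4,  # illustrative, pre-computed values
    max_selection_rate=0.6,
)
# Selection rates are normalized by the largest group's rate; on the first
# call there is no previous optimization state, so "step" is 0.
print(result["selection_rates"])  # e.g. {'F': 0.333..., 'M': 1.0}
print(result["step"])  # 0
```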

--------------------------------------------------------------------------------
/supervised/utils/automl_plots.py:
--------------------------------------------------------------------------------

```python
import logging
import os
import traceback  # For exception details

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.preprocessing import MinMaxScaler

logger = logging.getLogger(__name__)
from supervised.utils.config import LOG_LEVEL

logger.setLevel(LOG_LEVEL)
# Add a handler if running standalone for testing
if not logger.hasHandlers():
    handler = logging.StreamHandler()
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)


import warnings

import matplotlib.pyplot as plt

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)


class AutoMLPlots:
    features_heatmap_fname = "features_heatmap.png"
    correlation_heatmap_fname = "correlation_heatmap.png"
    features_heatmap_scaled_fname = "features_heatmap_scaled.png"

    @staticmethod
    def _plot_feature_heatmap(
        data_df, title, plot_path, cmap="Blues", vmin=None, vmax=None, cbar_label="Importance"
    ):
        """Generate and save a feature importance heatmap."""
        try:
            logger.info(f"Generating heatmap: '{title}'")
            # Adjust height dynamically based on the number of features
            plot_height = max(7, len(data_df.index) * 0.35)
            fig, ax = plt.subplots(1, 1, figsize=(10, plot_height))

            image = ax.imshow(
                data_df,
                interpolation="nearest",
                cmap=plt.cm.get_cmap(cmap),
                aspect="auto",
                vmin=vmin,  # Use provided vmin
                vmax=vmax,  # Use provided vmax
            )
            cbar = plt.colorbar(mappable=image)
            cbar.set_label(cbar_label)  # Use provided label

            x_tick_marks = np.arange(len(data_df.columns))
            y_tick_marks = np.arange(len(data_df.index))
            ax.set_xticks(x_tick_marks)
            ax.set_xticklabels(data_df.columns, rotation=90)
            ax.set_yticks(y_tick_marks)
            ax.set_yticklabels(data_df.index)
            ax.set_title(title)

            plt.tight_layout(pad=2.0)
            plt.savefig(plot_path)
            logger.info(f"Saved heatmap to: {plot_path}")
            plt.close(fig)  # Close the specific figure

        except Exception as e:
            logger.error(f"Failed to generate heatmap '{title}': {e}")
            logger.error(traceback.format_exc())
            plt.close("all")  # Close any potentially open plots on error

    @staticmethod
    def add(results_path, models, fout):
        """
        Adds plots to the report file stream, covering both the original and
        the MinMax-scaled feature importance heatmaps plus the model
        correlation heatmap.

        Args:
            results_path (str): Path to results directory.
            models (list): List of model objects.
            fout (file object): Writable file object for the report.
        """
        # Generate both feature importance plots
        AutoMLPlots.models_feature_importance(results_path, models)

        # --- Unscaled Feature Importance Section ---
        features_plot_path = os.path.join(
            results_path, AutoMLPlots.features_heatmap_fname
        )
        if os.path.exists(features_plot_path):
            fout.write("\n\n### Features Importance (Original Scale)\n")
            fout.write(
                f"![features importance across models]({AutoMLPlots.features_heatmap_fname})\n\n"
            )
        else:
            logger.warning(f"Original feature importance plot not found at: {features_plot_path}")

        # --- Scaled Feature Importance Section ---
        features_scaled_plot_path = os.path.join(
            results_path, AutoMLPlots.features_heatmap_scaled_fname
        )
        if os.path.exists(features_scaled_plot_path):
            fout.write("\n\n### Scaled Features Importance (MinMax per Model)\n")
            fout.write(
                f"![scaled features importance across models]({AutoMLPlots.features_heatmap_scaled_fname})\n\n"
            )
        else:
            logger.warning(f"Scaled feature importance plot not found at: {features_scaled_plot_path}")

        # --- Correlation Section ---
        AutoMLPlots.models_correlation(results_path, models)

        correlation_plot_path = os.path.join(
            results_path, AutoMLPlots.correlation_heatmap_fname
        )
        if os.path.exists(correlation_plot_path):
            fout.write("\n\n### Spearman Correlation of Models\n")
            fout.write(
                f"![models spearman correlation]({AutoMLPlots.correlation_heatmap_fname})\n\n"
            )
        else:
            logger.warning(f"Model correlation plot not found at: {correlation_plot_path}")

    @staticmethod
    def models_feature_importance(results_path, models):
        """Generates and saves BOTH original and scaled feature importance heatmaps."""
        logger.info("Starting feature importance generation (original and scaled).")
        try:
            # --- Data aggregation (common part) ---
            model_feature_imp = {}
            for m in models:
                model_name = m.get_name()
                model_path = os.path.join(results_path, model_name)
                logger.debug(f"Processing model '{model_name}' in '{model_path}'")
                if not os.path.isdir(model_path):
                    logger.warning(f"Directory not found for model '{model_name}'. Skipping.")
                    continue
                try:
                    all_files = os.listdir(model_path)
                except OSError as e:
                    logger.error(f"Cannot list directory {model_path}: {e}. Skipping model '{model_name}'.")
                    continue
                imp_data = [f for f in all_files if "_importance.csv" in f and "shap" not in f]
                if not imp_data:
                    logger.warning(f"No suitable importance files found for model '{model_name}'. Skipping.")
                    continue
                df_all = []
                for fname in imp_data:
                    file_path = os.path.join(model_path, fname)
                    try:
                        df = pd.read_csv(file_path, index_col=0)
                        numeric_df = df.select_dtypes(include=np.number)
                        if numeric_df.empty or numeric_df.isnull().all().all():
                            logger.warning(f"File {fname} (model '{model_name}') contains no valid numeric data. Skipping.")
                            continue
                        df_all.append(df)
                    except Exception as read_e:
                        logger.error(f"Error reading/processing file {fname} (model '{model_name}'): {read_e}. Skipping.")
                        continue
                if not df_all:
                    logger.warning(f"No valid importance dataframes read for model '{model_name}'. Skipping.")
                    continue
                try:
                    df_concat = pd.concat(df_all, axis=1, join="outer")
                    numeric_df_concat = df_concat.select_dtypes(include=np.number)
                    if not numeric_df_concat.empty:
                        model_feature_imp[model_name] = numeric_df_concat.mean(axis=1).fillna(0)
                    else:
                        logger.warning(f"No numeric data after concat for model '{model_name}'. Skipping.")
                except Exception as concat_e:
                    logger.error(f"Error aggregating importance for model '{model_name}': {concat_e}")
                    continue

            logger.info(f"Collected feature importance for {len(model_feature_imp)} models.")
            if len(model_feature_imp) < 2:
                logger.warning("Feature importance heatmaps require at least 2 models with data. Skipping plot generation.")
                return

            mfi = pd.concat(model_feature_imp, axis=1, join="outer").fillna(0)
            logger.debug(f"Combined importance DataFrame shape: {mfi.shape}")

            # --- Sorting & top-N selection (common part) ---
            mfi["m"] = mfi.mean(axis=1)
            mfi_sorted = mfi.sort_values(by="m", ascending=False)
            mfi_sorted = mfi_sorted.drop("m", axis=1)

            num_features_original = mfi_sorted.shape[0]
            mfi_plot_data = mfi_sorted  # Default to using all sorted features
            title_suffix = "Feature Importance"
            scaled_title_suffix = "Scaled Feature Importance (MinMax per model)"

            if num_features_original > 25:
                mfi_plot_data = mfi_sorted.head(25)
                title_suffix = f"Top-25 ({num_features_original} total) Feature Importance"
                scaled_title_suffix = f"Top-25 ({num_features_original} total) Scaled Feature Importance (MinMax per model)"
                logger.info(f"Selecting top 25 features out of {num_features_original} for plotting.")
            else:
                logger.info(f"Using all {num_features_original} features for plotting.")

            # --- Plotting unscaled version ---
            unscaled_plot_path = os.path.join(results_path, AutoMLPlots.features_heatmap_fname)
            AutoMLPlots._plot_feature_heatmap(
                data_df=mfi_plot_data,
                title=title_suffix + " (Original Scale)",
                plot_path=unscaled_plot_path,
                cbar_label="Importance",
                # vmin/vmax are auto-detected by default
            )

            # --- Scaling data ---
            logger.debug("Applying Min-Max scaling for the second plot.")
            scaler = MinMaxScaler()
            mfi_scaled_array = scaler.fit_transform(mfi_plot_data)  # Scale the potentially filtered data
            mfi_scaled = pd.DataFrame(mfi_scaled_array, index=mfi_plot_data.index, columns=mfi_plot_data.columns)

            # --- Plotting scaled version ---
            scaled_plot_path = os.path.join(results_path, AutoMLPlots.features_heatmap_scaled_fname)
            AutoMLPlots._plot_feature_heatmap(
                data_df=mfi_scaled,
                title=scaled_title_suffix,
                plot_path=scaled_plot_path,
                vmin=0,  # Explicit range for scaled data
                vmax=1,
                cbar_label="Scaled Importance (MinMax per model)",
            )

            logger.info("Finished generating feature importance plots.")

        except Exception as e:
            logger.error(f"An error occurred during feature importance processing: {e}")
            logger.error(traceback.format_exc())
            plt.close("all")  # Ensure plots are closed on unexpected error

    @staticmethod
    def correlation(oof1, oof2):
        """Calculates the mean Spearman correlation between prediction columns."""
        cols = [c for c in oof1.columns if "prediction" in c]
        # Check if prediction columns exist
        if not cols or not all(c in oof2.columns for c in cols):
            logger.warning("Prediction columns mismatch or not found for correlation calculation.")
            return np.nan  # Return NaN if predictions can't be compared

        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore")
            v = []
            for c in cols:
                try:
                    # Calculate Spearman correlation, ignore the p-value
                    corr_val, _ = sp.stats.spearmanr(oof1[c], oof2[c])
                    # Handle potential NaN result from spearmanr if input variance is zero
                    if not np.isnan(corr_val):
                        v.append(corr_val)
                    else:
                        logger.debug(f"NaN result from spearmanr for column {c}. Skipping.")
                except Exception as corr_e:
                    logger.warning(f"Could not calculate Spearman correlation for column {c}: {corr_e}")

        # Return mean correlation, or NaN if no valid correlations were calculated
        return np.mean(v) if v else np.nan

    @staticmethod
    def models_correlation(results_path, models):
        """Generates and saves a heatmap of model prediction correlations."""
        logger.info("Starting model correlation heatmap generation.")
        try:
            if len(models) < 2:
                logger.warning("Model correlation heatmap requires at least 2 models. Skipping.")
                return

            names = []
            oofs = []
            valid_models_indices = []  # Keep track of models with valid OOF data

            for i, m in enumerate(models):
                try:
                    oof_data = m.get_out_of_folds()
                    # Basic validation of OOF data
                    if oof_data is None or oof_data.empty or not any("prediction" in c for c in oof_data.columns):
                        logger.warning(f"Model '{m.get_name()}' has invalid or missing out-of-folds prediction data. Excluding from correlation.")
                        continue

                    names.append(m.get_name())
                    oofs.append(oof_data)
                    valid_models_indices.append(i)  # Store original index if valid
                    logger.debug(f"Got valid OOF data for model '{m.get_name()}'.")

                except AttributeError:
                    logger.warning(f"Model '{m.get_name()}' seems to be missing 'get_out_of_folds' method or it failed. Excluding from correlation.")
                    continue
                except Exception as oof_e:
                    logger.warning(f"Failed to get OOF data for model '{m.get_name()}': {oof_e}. Excluding from correlation.")
                    continue

            num_valid_models = len(names)
            if num_valid_models < 2:
                logger.warning(f"Fewer than 2 models ({num_valid_models}) have valid OOF data for correlation. Skipping plot generation.")
                return

            logger.info(f"Calculating correlations for {num_valid_models} models.")
            corrs = np.ones((num_valid_models, num_valid_models))
            for i in range(num_valid_models):
                for j in range(i + 1, num_valid_models):
                    correlation_value = AutoMLPlots.correlation(oofs[i], oofs[j])
                    # Fill with NaN if correlation calculation failed
                    corrs[i, j] = corrs[j, i] = correlation_value if not np.isnan(correlation_value) else np.nan

            # Check if all correlations are NaN
            if np.isnan(corrs[np.triu_indices(num_valid_models, k=1)]).all():
                logger.warning("All pairwise model correlations resulted in NaN. Cannot generate heatmap.")
                return

            logger.info("Generating model correlation heatmap.")
            figsize = (15, 15) if num_valid_models > 15 else (10, 10)
            fig, ax = plt.subplots(1, 1, figsize=figsize)

            image = ax.imshow(
                corrs,
                interpolation="nearest",
                cmap=plt.cm.get_cmap("Blues"),
                aspect="auto",
                vmin=np.nanmin(corrs),  # Use nanmin/nanmax to handle potential NaNs
                vmax=np.nanmax(corrs),
            )
            plt.colorbar(mappable=image)

            x_tick_marks = np.arange(num_valid_models)
            y_tick_marks = np.arange(num_valid_models)
            ax.set_xticks(x_tick_marks)
            ax.set_xticklabels(names, rotation=90)
            ax.set_yticks(y_tick_marks)
            ax.set_yticklabels(names)
            ax.set_title("Spearman Correlation of Models' OOF Predictions")

            plt.tight_layout(pad=2.0)

            # --- Saving the plot ---
            os.makedirs(results_path, exist_ok=True)  # Ensure directory exists
            plot_path = os.path.join(
                results_path, AutoMLPlots.correlation_heatmap_fname
            )
            plt.savefig(plot_path)
            logger.info(f"Saved model correlation heatmap to: {plot_path}")
            plt.close("all")  # Close plot to free memory

        except Exception as e:
            # Log the exception with traceback
            logger.error(f"An error occurred during model correlation plotting: {e}")
            logger.error(traceback.format_exc())
            # Ensure plot is closed if error occurred during saving/closing
            plt.close("all")
```