This is page 4 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ ├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── 
adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── test_importance.py │ ├── test_learning_curves.py │ ├── test_metric.py │ ├── test_shap.py │ └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py ``` # Files -------------------------------------------------------------------------------- /tests/tests_automl/test_explain_levels.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | from supervised.algorithms.random_forest import additional 10 | 11 | additional["max_steps"] = 3 12 | additional["trees_in_step"] = 1 13 | 14 | from supervised.algorithms.xgboost import additional 15 | 16 | additional["max_rounds"] = 1 17 | 18 | 19 | class AutoMLExplainLevelsTest(unittest.TestCase): 20 | automl_dir = "AutoMLExplainLevelsTest" 21 | 22 | def setUp(self): 23 | shutil.rmtree(self.automl_dir, ignore_errors=True) 24 | 25 | def tearDown(self): 26 | shutil.rmtree(self.automl_dir, ignore_errors=True) 27 | 28 | def 
run_explain_default(self, task, alg): 29 | shutil.rmtree(self.automl_dir, ignore_errors=True) 30 | a = AutoML( 31 | results_path=self.automl_dir, 32 | total_time_limit=10, 33 | algorithms=[alg], 34 | train_ensemble=False, 35 | validation_strategy={ 36 | "validation_type": "kfold", 37 | "k_folds": 2, 38 | "shuffle": True, 39 | "stratify": True, 40 | }, 41 | start_random_models=1, 42 | ) 43 | 44 | if task == "binary": 45 | X, y = datasets.make_classification( 46 | n_samples=100, 47 | n_features=5, 48 | n_informative=4, 49 | n_redundant=1, 50 | n_classes=2, 51 | n_clusters_per_class=3, 52 | n_repeated=0, 53 | shuffle=False, 54 | random_state=0, 55 | ) 56 | elif task == "multi": 57 | X, y = datasets.make_classification( 58 | n_samples=100, 59 | n_features=5, 60 | n_informative=4, 61 | n_redundant=1, 62 | n_classes=5, 63 | n_clusters_per_class=3, 64 | n_repeated=0, 65 | shuffle=False, 66 | random_state=0, 67 | ) 68 | else: 69 | X, y = datasets.make_regression( 70 | n_samples=100, 71 | n_features=5, 72 | n_informative=4, 73 | shuffle=False, 74 | random_state=0, 75 | ) 76 | 77 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 78 | 79 | a.fit(X, y) 80 | 81 | result_files = os.listdir( 82 | os.path.join(self.automl_dir, f'1_Default_{alg.replace(" ", "")}') 83 | ) 84 | 85 | # There should be files with: 86 | # - permutation importance 87 | # - shap importance 88 | # - shap dependence 89 | # - shap decisions 90 | 91 | # Check permutation importance 92 | produced = False 93 | for f in result_files: 94 | if "importance.csv" in f and "shap" not in f: 95 | produced = True 96 | break 97 | self.assertTrue(produced) 98 | # Check shap importance 99 | produced = False 100 | for f in result_files: 101 | if "importance.csv" in f and "shap" in f: 102 | produced = True 103 | break 104 | self.assertTrue(produced) 105 | # Check shap dependence 106 | produced = False 107 | for f in result_files: 108 | if "shap_dependence" in f: 109 | produced = True 110 | break 111 | self.assertTrue(produced) 112 | # Check shap decisions 113 | produced = False 114 | for f in result_files: 115 | if "decisions.png" in f: 116 | produced = True 117 | break 118 | self.assertTrue(produced) 119 | 120 | # def test_explain_default(self): 121 | 122 | # for task in ["binary", "multi", "regression"]: 123 | # for alg in ["Xgboost", "Random Forest", "LightGBM"]: 124 | # self.run_explain_default(task, alg) 125 | 126 | def test_no_explain_linear(self): 127 | a = AutoML( 128 | results_path=self.automl_dir, 129 | total_time_limit=1, 130 | algorithms=["Linear"], 131 | train_ensemble=False, 132 | validation_strategy={ 133 | "validation_type": "kfold", 134 | "k_folds": 2, 135 | "shuffle": True, 136 | "stratify": True, 137 | }, 138 | explain_level=0, 139 | start_random_models=1, 140 | ) 141 | 142 | X, y = datasets.make_regression( 143 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 144 | ) 145 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 146 | 147 | a.fit(X, y) 148 | 149 | result_files = os.listdir(os.path.join(self.automl_dir, "1_Linear")) 150 | 151 | # There should be no files with: 152 | # - permutation importance 153 | # - shap importance 154 | # - shap dependence 155 | # - shap decisions 156 | 157 | # Check permutation importance 158 | produced = False 159 | for f in result_files: 160 | if "importance.csv" in f and "shap" not in f: 161 | produced = True 162 | break 163 | self.assertFalse(produced) 164 | # Check shap importance 165 | produced = False 166 | for f in result_files: 
167 | if "importance.csv" in f and "shap" in f: 168 | produced = True 169 | break 170 | self.assertFalse(produced) 171 | # Check shap dependence 172 | produced = False 173 | for f in result_files: 174 | if "dependence.png" in f: 175 | produced = True 176 | break 177 | self.assertFalse(produced) 178 | # Check shap decisions 179 | produced = False 180 | for f in result_files: 181 | if "decisions.png" in f: 182 | produced = True 183 | break 184 | self.assertFalse(produced) 185 | # Check coefficients 186 | produced = False 187 | for f in result_files: 188 | if "coefs.csv" in f: 189 | produced = True 190 | break 191 | self.assertFalse(produced) 192 | 193 | def test_explain_just_permutation_importance(self): 194 | a = AutoML( 195 | results_path=self.automl_dir, 196 | total_time_limit=1, 197 | algorithms=["Xgboost"], 198 | train_ensemble=False, 199 | validation_strategy={ 200 | "validation_type": "kfold", 201 | "k_folds": 2, 202 | "shuffle": True, 203 | "stratify": True, 204 | }, 205 | explain_level=1, 206 | start_random_models=1, 207 | ) 208 | 209 | X, y = datasets.make_regression( 210 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 211 | ) 212 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 213 | 214 | a.fit(X, y) 215 | 216 | result_files = os.listdir(os.path.join(self.automl_dir, "1_Default_Xgboost")) 217 | 218 | # There should be no files with: 219 | # - permutation importance 220 | # - shap importance 221 | # - shap dependence 222 | # - shap decisions 223 | 224 | # Check permutation importance 225 | produced = False 226 | for f in result_files: 227 | if "importance.csv" in f and "shap" not in f: 228 | produced = True 229 | break 230 | self.assertTrue(produced) 231 | # Check shap importance 232 | produced = False 233 | for f in result_files: 234 | if "importance.csv" in f and "shap" in f: 235 | produced = True 236 | break 237 | self.assertFalse(produced) 238 | # Check shap dependence 239 | produced = False 240 | for f in result_files: 241 | if "dependence.png" in f: 242 | produced = True 243 | break 244 | self.assertFalse(produced) 245 | # Check shap decisions 246 | produced = False 247 | for f in result_files: 248 | if "decisions.png" in f: 249 | produced = True 250 | break 251 | self.assertFalse(produced) 252 | 253 | def test_build_decision_tree(self): 254 | a = AutoML( 255 | results_path=self.automl_dir, 256 | total_time_limit=10, 257 | algorithms=["Decision Tree"], 258 | train_ensemble=False, 259 | validation_strategy={ 260 | "validation_type": "kfold", 261 | "k_folds": 2, 262 | "shuffle": True, 263 | "stratify": True, 264 | }, 265 | explain_level=2, 266 | start_random_models=1, 267 | ) 268 | 269 | X, y = datasets.make_regression( 270 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 271 | ) 272 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 273 | 274 | a.fit(X, y) 275 | 276 | result_files = os.listdir(os.path.join(self.automl_dir, "1_DecisionTree")) 277 | 278 | # There should be files with: 279 | # - decision tree visualization 280 | # - permutation importance 281 | # - shap importance 282 | # - shap dependence 283 | # - shap decisions 284 | 285 | # Check Decision Tree visualization 286 | produced = False 287 | for f in result_files: 288 | if "tree.svg" in f: 289 | produced = True 290 | break 291 | # disable ??? 
TODO 292 | # self.assertTrue(produced) 293 | 294 | # Check permutation importance 295 | produced = False 296 | for f in result_files: 297 | if "importance.csv" in f and "shap" not in f: 298 | produced = True 299 | break 300 | self.assertTrue(produced) 301 | # Check shap importance 302 | produced = False 303 | for f in result_files: 304 | if "importance.csv" in f and "shap" in f: 305 | produced = True 306 | break 307 | self.assertTrue(produced) 308 | # Check shap dependence 309 | produced = False 310 | for f in result_files: 311 | if "dependence.png" in f: 312 | produced = True 313 | break 314 | self.assertTrue(produced) 315 | # Check shap decisions 316 | produced = False 317 | for f in result_files: 318 | if "decisions.png" in f: 319 | produced = True 320 | break 321 | self.assertTrue(produced) 322 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_targets.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | import pytest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from supervised import AutoML 9 | from supervised.algorithms.xgboost import additional 10 | from supervised.exceptions import AutoMLException 11 | 12 | additional["max_rounds"] = 1 13 | 14 | 15 | class AutoMLTargetsTest(unittest.TestCase): 16 | automl_dir = "automl_tests" 17 | rows = 50 18 | 19 | def tearDown(self): 20 | shutil.rmtree(self.automl_dir, ignore_errors=True) 21 | 22 | def test_bin_class_01(self): 23 | X = np.random.rand(self.rows, 3) 24 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 25 | y = np.random.randint(0, 2, self.rows) 26 | 27 | automl = AutoML( 28 | results_path=self.automl_dir, 29 | total_time_limit=1, 30 | algorithms=["Xgboost"], 31 | train_ensemble=False, 32 | explain_level=0, 33 | start_random_models=1, 34 | ) 35 | automl.fit(X, y) 36 | pred = automl.predict(X) 37 | 38 | u = np.unique(pred) 39 | self.assertTrue(0 in u or 1 in u) 40 | self.assertTrue(len(u) <= 2) 41 | 42 | def test_bin_class_11(self): 43 | X = np.random.rand(self.rows, 3) 44 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 45 | y = np.random.randint(0, 2, self.rows) * 2 - 1 46 | 47 | automl = AutoML( 48 | results_path=self.automl_dir, 49 | total_time_limit=1, 50 | algorithms=["Xgboost"], 51 | train_ensemble=False, 52 | explain_level=0, 53 | start_random_models=1, 54 | ) 55 | automl.fit(X, y) 56 | p = automl.predict(X) 57 | pred = automl.predict(X) 58 | 59 | u = np.unique(pred) 60 | 61 | self.assertTrue(-1 in u or 1 in u) 62 | self.assertTrue(0 not in u) 63 | self.assertTrue(len(u) <= 2) 64 | 65 | def test_bin_class_AB(self): 66 | X = np.random.rand(self.rows, 3) 67 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 68 | y = np.random.permutation(["a", "B"] * int(self.rows / 2)) 69 | 70 | automl = AutoML( 71 | results_path=self.automl_dir, 72 | total_time_limit=1, 73 | algorithms=["Xgboost"], 74 | train_ensemble=False, 75 | explain_level=0, 76 | start_random_models=1, 77 | ) 78 | automl.fit(X, y) 79 | p = automl.predict(X) 80 | pred = automl.predict(X) 81 | u = np.unique(pred) 82 | self.assertTrue("a" in u or "B" in u) 83 | self.assertTrue(len(u) <= 2) 84 | 85 | def test_bin_class_AB_missing_targets(self): 86 | X = np.random.rand(self.rows, 3) 87 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 88 | y = pd.Series( 89 | np.random.permutation(["a", "B"] * int(self.rows / 2)), name="target" 90 | ) 91 | 92 | y.iloc[1] = None 93 | y.iloc[3] = 
np.NaN 94 | y.iloc[13] = np.nan 95 | 96 | automl = AutoML( 97 | results_path=self.automl_dir, 98 | total_time_limit=1, 99 | algorithms=["Xgboost"], 100 | train_ensemble=False, 101 | explain_level=0, 102 | start_random_models=1, 103 | ) 104 | 105 | with pytest.warns( 106 | expected_warning=UserWarning, 107 | match="There are samples with missing target values in the data which will be excluded for further analysis", 108 | ) as record: 109 | automl.fit(X, y) 110 | 111 | # check that only one warning was raised 112 | self.assertEqual(len(record), 1) 113 | 114 | p = automl.predict(X) 115 | pred = automl.predict(X) 116 | 117 | u = np.unique(pred) 118 | self.assertTrue("a" in u or "B" in u) 119 | self.assertTrue(len(u) <= 2) 120 | 121 | def test_multi_class_0123_floats(self): 122 | X = np.random.rand(self.rows * 4, 3) 123 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 124 | y = np.random.randint(0, 4, self.rows * 4) 125 | y = y.astype(float) 126 | 127 | automl = AutoML( 128 | results_path=self.automl_dir, 129 | total_time_limit=1, 130 | algorithms=["Xgboost"], 131 | train_ensemble=False, 132 | explain_level=0, 133 | start_random_models=1, 134 | ) 135 | automl.fit(X, y) 136 | pred = automl.predict(X) 137 | 138 | u = np.unique(pred) 139 | 140 | self.assertTrue(0.0 in u or 1.0 in u or 2.0 in u or 3.0 in u) 141 | self.assertTrue(len(u) <= 4) 142 | 143 | def test_multi_class_0123(self): 144 | X = np.random.rand(self.rows * 4, 3) 145 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 146 | y = np.random.randint(0, 4, self.rows * 4) 147 | 148 | automl = AutoML( 149 | results_path=self.automl_dir, 150 | total_time_limit=1, 151 | algorithms=["Xgboost"], 152 | train_ensemble=False, 153 | explain_level=0, 154 | start_random_models=1, 155 | ) 156 | automl.fit(X, y) 157 | pred = automl.predict(X) 158 | 159 | u = np.unique(pred) 160 | 161 | self.assertTrue(0 in u or 1 in u or 2 in u or 3 in u) 162 | self.assertTrue(len(u) <= 4) 163 | 164 | def test_multi_class_0123_strings(self): 165 | X = np.random.rand(self.rows * 4, 3) 166 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 167 | y = np.random.randint(0, 4, self.rows * 4) 168 | y = y.astype(str) 169 | 170 | automl = AutoML( 171 | results_path=self.automl_dir, 172 | total_time_limit=1, 173 | algorithms=["Xgboost"], 174 | train_ensemble=False, 175 | explain_level=0, 176 | start_random_models=1, 177 | ) 178 | automl.fit(X, y) 179 | pred = automl.predict(X) 180 | 181 | u = np.unique(pred) 182 | 183 | self.assertTrue("0" in u or "1" in u or "2" in u or "3" in u) 184 | self.assertTrue(len(u) <= 4) 185 | 186 | def test_multi_class_abcd(self): 187 | X = np.random.rand(self.rows * 4, 3) 188 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 189 | y = pd.Series( 190 | np.random.permutation(["a", "B", "CC", "d"] * self.rows), name="target" 191 | ) 192 | 193 | automl = AutoML( 194 | results_path=self.automl_dir, 195 | total_time_limit=1, 196 | algorithms=["Xgboost"], 197 | train_ensemble=False, 198 | explain_level=0, 199 | start_random_models=1, 200 | ) 201 | automl.fit(X, y) 202 | pred = automl.predict(X) 203 | 204 | u = np.unique(pred) 205 | 206 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 207 | self.assertTrue(len(u) <= 4) 208 | 209 | def test_multi_class_abcd_np_array(self): 210 | X = np.random.rand(self.rows * 4, 3) 211 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 212 | y = np.random.permutation([None, "B", "CC", "d"] * self.rows) 213 | 214 | automl = AutoML( 215 | 
results_path=self.automl_dir, 216 | total_time_limit=1, 217 | algorithms=["Xgboost"], 218 | train_ensemble=False, 219 | explain_level=0, 220 | start_random_models=1, 221 | ) 222 | automl.fit(X, y) 223 | pred = automl.predict(X) 224 | 225 | u = np.unique(pred) 226 | 227 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 228 | self.assertTrue(len(u) <= 4) 229 | 230 | def test_multi_class_abcd_mixed_int(self): 231 | X = np.random.rand(self.rows * 4, 3) 232 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 233 | y = pd.Series( 234 | np.random.permutation([1, "B", "CC", "d"] * self.rows), name="target" 235 | ) 236 | 237 | automl = AutoML( 238 | results_path=self.automl_dir, 239 | total_time_limit=1, 240 | algorithms=["Xgboost"], 241 | train_ensemble=False, 242 | explain_level=0, 243 | start_random_models=1, 244 | ) 245 | automl.fit(X, y) 246 | pred = automl.predict(X) 247 | u = np.unique(pred) 248 | 249 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 250 | self.assertTrue(len(u) <= 4) 251 | 252 | def test_multi_class_abcd_missing_target(self): 253 | X = np.random.rand(self.rows * 4, 3) 254 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 255 | y = pd.Series( 256 | np.random.permutation(["a", "B", "CC", "d"] * self.rows), name="target" 257 | ) 258 | 259 | y.iloc[0] = None 260 | y.iloc[1] = None 261 | automl = AutoML( 262 | results_path=self.automl_dir, 263 | total_time_limit=1, 264 | algorithms=["Xgboost"], 265 | train_ensemble=False, 266 | explain_level=0, 267 | start_random_models=1, 268 | ) 269 | 270 | with pytest.warns( 271 | expected_warning=UserWarning, 272 | match="There are samples with missing target values in the data which will be excluded for further analysis", 273 | ) as record: 274 | automl.fit(X, y) 275 | 276 | # check that only one warning was raised 277 | self.assertEqual(len(record), 1) 278 | 279 | pred = automl.predict(X) 280 | 281 | u = np.unique(pred) 282 | 283 | self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) 284 | self.assertTrue(len(u) <= 4) 285 | 286 | def test_regression(self): 287 | X = np.random.rand(self.rows, 3) 288 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 289 | y = np.random.rand(self.rows) 290 | 291 | automl = AutoML( 292 | results_path=self.automl_dir, 293 | total_time_limit=1, 294 | algorithms=["Xgboost"], 295 | train_ensemble=False, 296 | explain_level=0, 297 | start_random_models=1, 298 | ) 299 | automl.fit(X, y) 300 | pred = automl.predict(X) 301 | 302 | self.assertIsInstance(pred, np.ndarray) 303 | self.assertEqual(len(pred), X.shape[0]) 304 | 305 | def test_regression_missing_target(self): 306 | X = np.random.rand(self.rows, 3) 307 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 308 | y = pd.Series(np.random.rand(self.rows), name="target") 309 | 310 | y.iloc[1] = None 311 | 312 | automl = AutoML( 313 | results_path=self.automl_dir, 314 | total_time_limit=1, 315 | algorithms=["Xgboost"], 316 | train_ensemble=False, 317 | explain_level=0, 318 | start_random_models=1, 319 | ) 320 | 321 | with pytest.warns( 322 | match="There are samples with missing target values in the data which will be excluded for further analysis" 323 | ) as record: 324 | automl.fit(X, y) 325 | 326 | self.assertEqual(len(record), 1) 327 | 328 | pred = automl.predict(X) 329 | 330 | self.assertIsInstance(pred, np.ndarray) 331 | self.assertEqual(len(pred), X.shape[0]) 332 | 333 | def test_predict_on_empty_dataframe(self): 334 | X = np.random.rand(self.rows, 3) 335 | X = 
pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 336 | y = pd.Series(np.random.rand(self.rows), name="target") 337 | 338 | automl = AutoML( 339 | results_path=self.automl_dir, 340 | total_time_limit=1, 341 | algorithms=["Xgboost"], 342 | train_ensemble=False, 343 | explain_level=0, 344 | start_random_models=1, 345 | ) 346 | automl.fit(X, y) 347 | 348 | with self.assertRaises(AutoMLException) as context: 349 | pred = automl.predict(pd.DataFrame()) 350 | 351 | with self.assertRaises(AutoMLException) as context: 352 | pred = automl.predict(np.empty(shape=(0, 3))) 353 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/goldenfeatures_transformer.py: -------------------------------------------------------------------------------- ```python 1 | import itertools 2 | import json 3 | import os 4 | import time 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from joblib import Parallel, delayed 9 | from sklearn.metrics import log_loss, mean_squared_error 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 12 | 13 | from supervised.algorithms.registry import ( 14 | BINARY_CLASSIFICATION, 15 | MULTICLASS_CLASSIFICATION, 16 | REGRESSION, 17 | ) 18 | from supervised.exceptions import AutoMLException 19 | from supervised.utils.jsonencoder import MLJSONEncoder 20 | 21 | 22 | def get_binary_score(X_train, y_train, X_test, y_test): 23 | clf = DecisionTreeClassifier(max_depth=3) 24 | clf.fit(X_train, y_train) 25 | pred = clf.predict_proba(X_test)[:, 1] 26 | ll = log_loss(y_test, pred) 27 | return ll 28 | 29 | 30 | def get_regression_score(X_train, y_train, X_test, y_test): 31 | clf = DecisionTreeRegressor(max_depth=3) 32 | clf.fit(X_train, y_train) 33 | pred = clf.predict(X_test) 34 | ll = mean_squared_error(y_test, pred) 35 | return ll 36 | 37 | 38 | def get_multiclass_score(X_train, y_train, X_test, y_test): 39 | clf = DecisionTreeClassifier(max_depth=3) 40 | clf.fit(X_train, y_train) 41 | pred = clf.predict_proba(X_test) 42 | ll = log_loss(y_test, pred) 43 | return ll 44 | 45 | 46 | def get_score(item): 47 | col1 = item[0] 48 | col2 = item[1] 49 | X_train = item[2] 50 | y_train = item[3] 51 | X_test = item[4] 52 | y_test = item[5] 53 | scorer = item[6] 54 | 55 | try: 56 | x_train = np.array(X_train[col1] - X_train[col2]).reshape(-1, 1) 57 | x_test = np.array(X_test[col1] - X_test[col2]).reshape(-1, 1) 58 | diff_score = scorer(x_train, y_train, x_test, y_test) 59 | except Exception as e: 60 | diff_score = None 61 | print(str(e)) 62 | 63 | try: 64 | a, b = ( 65 | np.array(X_train[col1], dtype=float), 66 | np.array(X_train[col2], dtype=float), 67 | ) 68 | x_train = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 69 | a, b = np.array(X_test[col1], dtype=float), np.array(X_test[col2], dtype=float) 70 | x_test = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 71 | ratio_1_score = scorer(x_train, y_train, x_test, y_test) 72 | except Exception as e: 73 | print(str(e)) 74 | ratio_1_score = None 75 | 76 | try: 77 | b, a = ( 78 | np.array(X_train[col1], dtype=float), 79 | np.array(X_train[col2], dtype=float), 80 | ) 81 | x_train = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 82 | b, a = np.array(X_test[col1], dtype=float), np.array(X_test[col2], dtype=float) 83 | x_test = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) 84 | ratio_2_score = scorer(x_train, y_train, x_test, y_test) 85 | 
except Exception as e: 86 | print(str(e)) 87 | ratio_2_score = None 88 | 89 | try: 90 | x_train = np.array(X_train[col1] + X_train[col2]).reshape(-1, 1) 91 | x_test = np.array(X_test[col1] + X_test[col2]).reshape(-1, 1) 92 | sum_score = scorer(x_train, y_train, x_test, y_test) 93 | except Exception as e: 94 | sum_score = None 95 | print(str(e)) 96 | 97 | try: 98 | x_train = np.array(X_train[col1] * X_train[col2]).reshape(-1, 1) 99 | x_test = np.array(X_test[col1] * X_test[col2]).reshape(-1, 1) 100 | multiply_score = scorer(x_train, y_train, x_test, y_test) 101 | except Exception as e: 102 | multiply_score = None 103 | print(str(e)) 104 | 105 | return (diff_score, ratio_1_score, ratio_2_score, sum_score, multiply_score) 106 | 107 | 108 | class GoldenFeaturesTransformer(object): 109 | def __init__(self, results_path=None, ml_task=None, features_count=None, n_jobs=-1): 110 | self._new_features = [] 111 | self._new_columns = [] 112 | self._ml_task = ml_task 113 | self._features_count = features_count 114 | self._n_jobs = n_jobs 115 | self._scorer = None 116 | if self._ml_task == BINARY_CLASSIFICATION: 117 | self._scorer = get_binary_score 118 | elif self._ml_task == MULTICLASS_CLASSIFICATION: 119 | self._scorer = get_multiclass_score 120 | else: 121 | self._scorer = get_regression_score 122 | 123 | self._error = None 124 | 125 | self._result_file = "golden_features.json" 126 | if results_path is not None: 127 | self._result_path = os.path.join(results_path, self._result_file) 128 | 129 | if os.path.exists(self._result_path): 130 | with open(self._result_path, "r") as file: 131 | self.from_json(json.load(file), results_path) 132 | 133 | def fit(self, X, y): 134 | if self._new_features: 135 | return 136 | if self._error is not None and self._error: 137 | raise AutoMLException( 138 | "Golden Features not created due to error (please check errors.md). " 139 | + self._error 140 | ) 141 | return 142 | if X.shape[1] == 0: 143 | self._error = f"Golden Features not created. No continous features. Input data shape: {X.shape}, {y.shape}" 144 | self.save() 145 | raise AutoMLException("Golden Features not created. No continous features.") 146 | 147 | start_time = time.time() 148 | combinations = itertools.combinations(X.columns, r=2) 149 | items = [i for i in combinations] 150 | if len(items) > 250000: 151 | si = np.random.choice(len(items), 250000, replace=False) 152 | items = [items[i] for i in si] 153 | 154 | X_train, X_test, y_train, y_test = self._subsample(X, y) 155 | 156 | for i in range(len(items)): 157 | items[i] += (X_train, y_train, X_test, y_test, self._scorer) 158 | 159 | scores = [] 160 | # parallel version 161 | scores = Parallel(n_jobs=self._n_jobs, backend="loky")( 162 | delayed(get_score)(i) for i in items 163 | ) 164 | 165 | # single process version 166 | # for item in items: 167 | # scores += [get_score(item)] 168 | 169 | if not scores: 170 | self._error = f"Golden Features not created. Empty scores. Input data shape: {X.shape}, {y.shape}" 171 | self.save() 172 | raise AutoMLException("Golden Features not created. 
Empty scores.") 173 | 174 | result = [] 175 | for i in range(len(items)): 176 | if scores[i][0] is not None: 177 | result += [(items[i][0], items[i][1], "diff", scores[i][0])] 178 | if scores[i][1] is not None: 179 | result += [(items[i][0], items[i][1], "ratio", scores[i][1])] 180 | if scores[i][2] is not None: 181 | result += [(items[i][1], items[i][0], "ratio", scores[i][2])] 182 | if scores[i][3] is not None: 183 | result += [(items[i][1], items[i][0], "sum", scores[i][3])] 184 | if scores[i][4] is not None: 185 | result += [(items[i][1], items[i][0], "multiply", scores[i][4])] 186 | 187 | df = pd.DataFrame( 188 | result, columns=["feature1", "feature2", "operation", "score"] 189 | ) 190 | df.sort_values(by="score", inplace=True) 191 | 192 | new_cols_cnt = np.min([100, np.max([10, int(0.1 * X.shape[1])])]) 193 | 194 | if ( 195 | self._features_count is not None 196 | and self._features_count > 0 197 | and self._features_count < df.shape[0] 198 | ): 199 | new_cols_cnt = self._features_count 200 | 201 | print(self._features_count, new_cols_cnt) 202 | self._new_features = json.loads(df.head(new_cols_cnt).to_json(orient="records")) 203 | 204 | for new_feature in self._new_features: 205 | new_col = "_".join( 206 | [ 207 | new_feature["feature1"], 208 | new_feature["operation"], 209 | new_feature["feature2"], 210 | ] 211 | ) 212 | self._new_columns += [new_col] 213 | print(f"Add Golden Feature: {new_col}") 214 | 215 | self.save() 216 | 217 | print( 218 | f"Created {len(self._new_features)} Golden Features in {np.round(time.time() - start_time,2)} seconds." 219 | ) 220 | 221 | def transform(self, X): 222 | for new_feature in self._new_features: 223 | new_col = "_".join( 224 | [ 225 | new_feature["feature1"], 226 | new_feature["operation"], 227 | new_feature["feature2"], 228 | ] 229 | ) 230 | if new_feature["operation"] == "diff": 231 | X[new_col] = X[new_feature["feature1"]] - X[new_feature["feature2"]] 232 | elif new_feature["operation"] == "ratio": 233 | a, b = ( 234 | np.array(X[new_feature["feature1"]], dtype=float), 235 | np.array(X[new_feature["feature2"]], dtype=float), 236 | ) 237 | X[new_col] = np.divide( 238 | a, b, out=np.zeros_like(a), where=b != 0 239 | ).reshape(-1, 1) 240 | elif new_feature["operation"] == "sum": 241 | X[new_col] = X[new_feature["feature1"]] + X[new_feature["feature2"]] 242 | elif new_feature["operation"] == "multiply": 243 | X[new_col] = X[new_feature["feature1"]] * X[new_feature["feature2"]] 244 | 245 | return X 246 | 247 | def to_json(self): 248 | data_json = { 249 | "new_features": self._new_features, 250 | "new_columns": self._new_columns, 251 | "ml_task": self._ml_task, 252 | } 253 | if self._error is not None and self._error: 254 | data_json["error"] = self._error 255 | return data_json 256 | 257 | def from_json(self, data_json, results_path): 258 | self._new_features = data_json.get("new_features", []) 259 | self._new_columns = data_json.get("new_columns", []) 260 | self._ml_task = data_json.get("ml_task") 261 | self._error = data_json.get("error") 262 | self._result_path = os.path.join(results_path, self._result_file) 263 | 264 | def save(self): 265 | with open(self._result_path, "w") as fout: 266 | fout.write(json.dumps(self.to_json(), indent=4, cls=MLJSONEncoder)) 267 | 268 | def _subsample(self, X, y): 269 | MAX_SIZE = 10000 270 | TRAIN_SIZE = 2500 271 | 272 | shuffle = True 273 | stratify = None 274 | 275 | if X.shape[0] > MAX_SIZE: 276 | if self._ml_task != REGRESSION: 277 | stratify = y 278 | X_train, _, y_train, _ = train_test_split( 279 | X, 
280 | y, 281 | train_size=MAX_SIZE, 282 | shuffle=shuffle, 283 | stratify=stratify, 284 | random_state=1, 285 | ) 286 | if self._ml_task != REGRESSION: 287 | stratify = y_train 288 | 289 | X_train, X_test, y_train, y_test = train_test_split( 290 | X_train, 291 | y_train, 292 | train_size=TRAIN_SIZE, 293 | shuffle=shuffle, 294 | stratify=stratify, 295 | random_state=1, 296 | ) 297 | else: 298 | if self._ml_task != REGRESSION: 299 | stratify = y 300 | train_size = X.shape[0] // 4 301 | X_train, X_test, y_train, y_test = train_test_split( 302 | X, 303 | y, 304 | train_size=train_size, 305 | shuffle=shuffle, 306 | stratify=stratify, 307 | random_state=1, 308 | ) 309 | 310 | return X_train, X_test, y_train, y_test 311 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/tuner.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import warnings 4 | 5 | import joblib 6 | import matplotlib 7 | import optuna 8 | from matplotlib import pyplot as plt 9 | 10 | from supervised.exceptions import AutoMLException 11 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 12 | from supervised.tuner.optuna.catboost import CatBoostObjective 13 | from supervised.tuner.optuna.extra_trees import ExtraTreesObjective 14 | from supervised.tuner.optuna.knn import KNNObjective 15 | from supervised.tuner.optuna.lightgbm import LightgbmObjective 16 | from supervised.tuner.optuna.nn import NeuralNetworkObjective 17 | from supervised.tuner.optuna.random_forest import RandomForestObjective 18 | from supervised.tuner.optuna.xgboost import XgboostObjective 19 | from supervised.utils.jsonencoder import MLJSONEncoder 20 | from supervised.utils.metric import Metric 21 | 22 | 23 | class OptunaTuner: 24 | def __init__( 25 | self, 26 | results_path, 27 | ml_task, 28 | eval_metric, 29 | time_budget=3600, 30 | init_params={}, 31 | verbose=True, 32 | n_jobs=-1, 33 | random_state=42, 34 | ): 35 | if eval_metric.name not in [ 36 | "auc", 37 | "logloss", 38 | "rmse", 39 | "mse", 40 | "mae", 41 | "mape", 42 | "r2", 43 | "spearman", 44 | "pearson", 45 | "f1", 46 | "average_precision", 47 | "accuracy", 48 | "user_defined_metric", 49 | ]: 50 | raise AutoMLException(f"Metric {eval_metric.name} is not supported") 51 | 52 | self.study_dir = os.path.join(results_path, "optuna") 53 | if not os.path.exists(self.study_dir): 54 | try: 55 | os.mkdir(self.study_dir) 56 | except Exception as e: 57 | print("Problem while creating directory for optuna studies.", str(e)) 58 | self.tuning_fname = os.path.join(self.study_dir, "optuna.json") 59 | self.tuning = init_params 60 | self.eval_metric = eval_metric 61 | 62 | self.direction = ( 63 | "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize" 64 | ) 65 | self.n_warmup_steps = ( 66 | 500 # set large enough to give small learning rates a chance 67 | ) 68 | self.time_budget = time_budget 69 | self.verbose = verbose 70 | self.ml_task = ml_task 71 | self.n_jobs = n_jobs 72 | self.random_state = random_state 73 | self.cat_features_indices = [] 74 | self.load() 75 | if not self.verbose: 76 | optuna.logging.set_verbosity(optuna.logging.CRITICAL) 77 | 78 | @staticmethod 79 | def is_optimizable(algorithm_name): 80 | return algorithm_name in [ 81 | "Extra Trees", 82 | "Random Forest", 83 | "CatBoost", 84 | "Xgboost", 85 | "LightGBM", 86 | "Nearest Neighbors", 87 | "Neural Network", 88 | ] 89 | 90 | def optimize( 91 | self, 92 | algorithm, 93 
| data_type, 94 | X_train, 95 | y_train, 96 | sample_weight, 97 | X_validation, 98 | y_validation, 99 | sample_weight_validation, 100 | learner_params, 101 | ): 102 | # only tune models with original data type 103 | if data_type != "original": 104 | return learner_params 105 | 106 | key = f"{data_type}_{algorithm}" 107 | if key in self.tuning: 108 | return self.update_learner_params(learner_params, self.tuning[key]) 109 | 110 | if self.verbose: 111 | print( 112 | f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds " 113 | f"eval_metric {self.eval_metric.name} ({self.direction})" 114 | ) 115 | 116 | self.cat_features_indices = [] 117 | for i in range(X_train.shape[1]): 118 | if PreprocessingUtils.is_categorical(X_train.iloc[:, i]): 119 | self.cat_features_indices += [i] 120 | 121 | study = optuna.create_study( 122 | direction=self.direction, 123 | sampler=optuna.samplers.TPESampler(seed=self.random_state), 124 | pruner=optuna.pruners.MedianPruner(n_warmup_steps=self.n_warmup_steps), 125 | ) 126 | obejctive = None 127 | if algorithm == "LightGBM": 128 | objective = LightgbmObjective( 129 | self.ml_task, 130 | X_train, 131 | y_train, 132 | sample_weight, 133 | X_validation, 134 | y_validation, 135 | sample_weight_validation, 136 | self.eval_metric, 137 | self.cat_features_indices, 138 | self.n_jobs, 139 | self.random_state, 140 | ) 141 | elif algorithm == "Xgboost": 142 | objective = XgboostObjective( 143 | self.ml_task, 144 | X_train, 145 | y_train, 146 | sample_weight, 147 | X_validation, 148 | y_validation, 149 | sample_weight_validation, 150 | self.eval_metric, 151 | self.n_jobs, 152 | self.random_state, 153 | ) 154 | elif algorithm == "CatBoost": 155 | objective = CatBoostObjective( 156 | self.ml_task, 157 | X_train, 158 | y_train, 159 | sample_weight, 160 | X_validation, 161 | y_validation, 162 | sample_weight_validation, 163 | self.eval_metric, 164 | self.cat_features_indices, 165 | self.n_jobs, 166 | self.random_state, 167 | ) 168 | elif algorithm == "Random Forest": 169 | objective = RandomForestObjective( 170 | self.ml_task, 171 | X_train, 172 | y_train, 173 | sample_weight, 174 | X_validation, 175 | y_validation, 176 | sample_weight_validation, 177 | self.eval_metric, 178 | self.n_jobs, 179 | self.random_state, 180 | ) 181 | elif algorithm == "Extra Trees": 182 | objective = ExtraTreesObjective( 183 | self.ml_task, 184 | X_train, 185 | y_train, 186 | sample_weight, 187 | X_validation, 188 | y_validation, 189 | sample_weight_validation, 190 | self.eval_metric, 191 | self.n_jobs, 192 | self.random_state, 193 | ) 194 | elif algorithm == "Nearest Neighbors": 195 | objective = KNNObjective( 196 | self.ml_task, 197 | X_train, 198 | y_train, 199 | sample_weight, 200 | X_validation, 201 | y_validation, 202 | sample_weight_validation, 203 | self.eval_metric, 204 | self.n_jobs, 205 | self.random_state, 206 | ) 207 | elif algorithm == "Neural Network": 208 | objective = NeuralNetworkObjective( 209 | self.ml_task, 210 | X_train, 211 | y_train, 212 | sample_weight, 213 | X_validation, 214 | y_validation, 215 | sample_weight_validation, 216 | self.eval_metric, 217 | self.n_jobs, 218 | self.random_state, 219 | ) 220 | 221 | study.optimize( 222 | objective, n_trials=5000, timeout=self.time_budget, gc_after_trial=True 223 | ) 224 | 225 | self.plot_study(algorithm, data_type, study) 226 | 227 | joblib.dump(study, os.path.join(self.study_dir, key + ".joblib")) 228 | 229 | best = study.best_params 230 | 231 | if algorithm == "LightGBM": 232 | best["metric"] = 
objective.eval_metric_name 233 | best["custom_eval_metric_name"] = objective.custom_eval_metric_name 234 | best["num_boost_round"] = objective.rounds 235 | best["early_stopping_rounds"] = objective.early_stopping_rounds 236 | # best["learning_rate"] = objective.learning_rate 237 | best["cat_feature"] = self.cat_features_indices 238 | best["feature_pre_filter"] = False 239 | best["seed"] = objective.seed 240 | elif algorithm == "CatBoost": 241 | best["eval_metric"] = objective.eval_metric_name 242 | best["num_boost_round"] = objective.rounds 243 | best["early_stopping_rounds"] = objective.early_stopping_rounds 244 | # best["bootstrap_type"] = "Bernoulli" 245 | # best["learning_rate"] = objective.learning_rate 246 | best["seed"] = objective.seed 247 | elif algorithm == "Xgboost": 248 | best["objective"] = objective.objective 249 | best["eval_metric"] = objective.eval_metric_name 250 | # best["eta"] = objective.learning_rate 251 | best["max_rounds"] = objective.rounds 252 | best["early_stopping_rounds"] = objective.early_stopping_rounds 253 | best["seed"] = objective.seed 254 | elif algorithm == "Extra Trees": 255 | # Extra Trees are not using early stopping 256 | best["max_steps"] = objective.max_steps # each step has 100 trees 257 | best["seed"] = objective.seed 258 | best["eval_metric_name"] = self.eval_metric.name 259 | elif algorithm == "Random Forest": 260 | # Random Forest is not using early stopping 261 | best["max_steps"] = objective.max_steps # each step has 100 trees 262 | best["seed"] = objective.seed 263 | best["eval_metric_name"] = self.eval_metric.name 264 | elif algorithm == "Nearest Neighbors": 265 | best["rows_limit"] = 100000 266 | elif algorithm == "Neural Network": 267 | pass 268 | 269 | self.tuning[key] = best 270 | self.save() 271 | 272 | return self.update_learner_params(learner_params, best) 273 | 274 | def update_learner_params(self, learner_params, best): 275 | for k, v in best.items(): 276 | learner_params[k] = v 277 | return learner_params 278 | 279 | def save(self): 280 | with open(self.tuning_fname, "w") as fout: 281 | fout.write(json.dumps(self.tuning, indent=4, cls=MLJSONEncoder)) 282 | 283 | def load(self): 284 | if os.path.exists(self.tuning_fname): 285 | params = json.loads(open(self.tuning_fname).read()) 286 | for k, v in params.items(): 287 | self.tuning[k] = v 288 | 289 | def plot_study(self, algorithm, data_type, study): 290 | key = f"{data_type}_{algorithm}" 291 | 292 | plots = [ 293 | ( 294 | optuna.visualization.matplotlib.plot_optimization_history, 295 | "optimization_history", 296 | ), 297 | ( 298 | optuna.visualization.matplotlib.plot_parallel_coordinate, 299 | "parallel_coordinate", 300 | ), 301 | ( 302 | optuna.visualization.matplotlib.plot_param_importances, 303 | "param_importances", 304 | ), 305 | # (optuna.visualization.matplotlib.plot_slice, "slice"), 306 | ] 307 | 308 | matplotlib_default_figsize = matplotlib.rcParams["figure.figsize"] 309 | matplotlib.rcParams["figure.figsize"] = (11, 7) 310 | 311 | md = f"# Optuna tuning for {algorithm} on {data_type} data\n\n" 312 | for plot, title in plots: 313 | try: 314 | with warnings.catch_warnings(): 315 | warnings.simplefilter("ignore") 316 | plt.figure() 317 | plt.rcParams["axes.grid"] = title != "parallel_coordinate" 318 | plot(study) 319 | plt.tight_layout(pad=2.0) 320 | fname = f"{key}_{title}.png" 321 | plt.savefig(os.path.join(self.study_dir, fname)) 322 | plt.close("all") 323 | 324 | md += f'## {algorithm} {title.replace("_", " ").title()}\n\n' 325 | md += f"\n\n" 326 | 327 | except 
Exception as e: 328 | print(str(e)) 329 | 330 | matplotlib.rcParams["figure.figsize"] = matplotlib_default_figsize 331 | plt.style.use("default") 332 | 333 | with open(os.path.join(self.study_dir, "README.md"), "a") as fout: 334 | fout.write(md) 335 | fout.write("\n\n[<< Go back](../README.md)\n") 336 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/lightgbm.py: -------------------------------------------------------------------------------- ```python 1 | import contextlib 2 | import copy 3 | import logging 4 | import os 5 | 6 | import lightgbm as lgb 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.base import ClassifierMixin, RegressorMixin 10 | 11 | from supervised.algorithms.algorithm import BaseAlgorithm 12 | from supervised.algorithms.registry import ( 13 | BINARY_CLASSIFICATION, 14 | MULTICLASS_CLASSIFICATION, 15 | REGRESSION, 16 | AlgorithmsRegistry, 17 | ) 18 | from supervised.utils.config import LOG_LEVEL 19 | from supervised.utils.metric import ( 20 | lightgbm_eval_metric_accuracy, 21 | lightgbm_eval_metric_average_precision, 22 | lightgbm_eval_metric_f1, 23 | lightgbm_eval_metric_pearson, 24 | lightgbm_eval_metric_r2, 25 | lightgbm_eval_metric_spearman, 26 | lightgbm_eval_metric_user_defined, 27 | ) 28 | 29 | logger = logging.getLogger(__name__) 30 | logger.setLevel(LOG_LEVEL) 31 | 32 | 33 | def lightgbm_objective(ml_task, automl_eval_metric): 34 | objective = "regression" 35 | if ml_task == BINARY_CLASSIFICATION: 36 | objective = "binary" 37 | elif ml_task == MULTICLASS_CLASSIFICATION: 38 | objective = "multiclass" 39 | else: # ml_task == REGRESSION 40 | objective = "regression" 41 | return objective 42 | 43 | 44 | def lightgbm_eval_metric(ml_task, automl_eval_metric): 45 | if automl_eval_metric == "user_defined_metric": 46 | return "custom", automl_eval_metric 47 | metric_name_mapping = { 48 | BINARY_CLASSIFICATION: { 49 | "auc": "auc", 50 | "logloss": "binary_logloss", 51 | "f1": "custom", 52 | "average_precision": "custom", 53 | "accuracy": "custom", 54 | }, 55 | MULTICLASS_CLASSIFICATION: { 56 | "logloss": "multi_logloss", 57 | "f1": "custom", 58 | "accuracy": "custom", 59 | }, 60 | REGRESSION: { 61 | "rmse": "rmse", 62 | "mse": "l2", 63 | "mae": "l1", 64 | "mape": "mape", 65 | "r2": "custom", 66 | "spearman": "custom", 67 | "pearson": "custom", 68 | }, 69 | } 70 | 71 | metric = metric_name_mapping[ml_task][automl_eval_metric] 72 | custom_eval_metric = None 73 | 74 | if automl_eval_metric in [ 75 | "r2", 76 | "spearman", 77 | "pearson", 78 | "f1", 79 | "average_precision", 80 | "accuracy", 81 | ]: 82 | custom_eval_metric = automl_eval_metric 83 | 84 | return metric, custom_eval_metric 85 | 86 | 87 | class LightgbmAlgorithm(BaseAlgorithm): 88 | algorithm_name = "LightGBM" 89 | algorithm_short_name = "LightGBM" 90 | 91 | def __init__(self, params): 92 | super(LightgbmAlgorithm, self).__init__(params) 93 | self.library_version = lgb.__version__ 94 | 95 | self.explain_level = params.get("explain_level", 0) 96 | self.rounds = additional.get("max_rounds", 10000) 97 | self.max_iters = 1 98 | self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) 99 | 100 | n_jobs = self.params.get("n_jobs", 0) 101 | # 0 is the default for LightGBM to use all cores 102 | if n_jobs == -1: 103 | n_jobs = 0 104 | 105 | self.learner_params = { 106 | "boosting_type": "gbdt", 107 | "objective": self.params.get("objective", "binary"), 108 | "metric": self.params.get("metric", "binary_logloss"), 109 | 
"num_leaves": self.params.get("num_leaves", 31), 110 | "learning_rate": self.params.get("learning_rate", 0.1), 111 | "feature_fraction": self.params.get("feature_fraction", 1.0), 112 | "bagging_fraction": self.params.get("bagging_fraction", 1.0), 113 | "min_data_in_leaf": self.params.get("min_data_in_leaf", 20), 114 | "num_threads": n_jobs, 115 | "verbose": -1, 116 | "seed": self.params.get("seed", 1), 117 | "extra_trees": self.params.get("extra_trees", False), 118 | } 119 | 120 | for extra_param in [ 121 | "lambda_l1", 122 | "lambda_l2", 123 | "bagging_freq", 124 | "feature_pre_filter", 125 | "cat_feature", 126 | "cat_l2", 127 | "cat_smooth", 128 | "max_bin", 129 | ]: 130 | if extra_param in self.params: 131 | self.learner_params[extra_param] = self.params[extra_param] 132 | 133 | if "num_boost_round" in self.params: 134 | self.rounds = self.params["num_boost_round"] 135 | if "early_stopping_rounds" in self.params: 136 | self.early_stopping_rounds = self.params["early_stopping_rounds"] 137 | 138 | if "num_class" in self.params: # multiclass classification 139 | self.learner_params["num_class"] = self.params.get("num_class") 140 | 141 | self.custom_eval_metric = None 142 | if self.params.get("custom_eval_metric_name") is not None: 143 | if self.params["custom_eval_metric_name"] == "r2": 144 | self.custom_eval_metric = lightgbm_eval_metric_r2 145 | elif self.params["custom_eval_metric_name"] == "spearman": 146 | self.custom_eval_metric = lightgbm_eval_metric_spearman 147 | elif self.params["custom_eval_metric_name"] == "pearson": 148 | self.custom_eval_metric = lightgbm_eval_metric_pearson 149 | elif self.params["custom_eval_metric_name"] == "f1": 150 | self.custom_eval_metric = lightgbm_eval_metric_f1 151 | elif self.params["custom_eval_metric_name"] == "average_precision": 152 | self.custom_eval_metric = lightgbm_eval_metric_average_precision 153 | elif self.params["custom_eval_metric_name"] == "accuracy": 154 | self.custom_eval_metric = lightgbm_eval_metric_accuracy 155 | elif self.params["custom_eval_metric_name"] == "user_defined_metric": 156 | self.custom_eval_metric = lightgbm_eval_metric_user_defined 157 | 158 | logger.debug("LightgbmLearner __init__") 159 | 160 | def file_extension(self): 161 | return "lightgbm" 162 | 163 | def update(self, update_params): 164 | pass 165 | 166 | """ 167 | def get_boosting_rounds(self, lgb_train, valid_sets, esr, max_time): 168 | if max_time is None: 169 | max_time = 3600.0 170 | start_time = time.time() 171 | evals_result = {} 172 | model = lgb.train( 173 | self.learner_params, 174 | lgb_train, 175 | num_boost_round=2, 176 | valid_sets=valid_sets, 177 | early_stopping_rounds=esr, 178 | evals_result=evals_result, 179 | verbose_eval=False, 180 | ) 181 | time_1_iter = (time.time() - start_time) / 2.0 182 | 183 | # 2.0 is just a scaling factor 184 | # purely heuristic 185 | iters = int(max_time / time_1_iter * 2.0) 186 | iters = max(iters, 100) 187 | iters = min(iters, 10000) 188 | return iters 189 | """ 190 | 191 | def fit( 192 | self, 193 | X, 194 | y, 195 | sample_weight=None, 196 | X_validation=None, 197 | y_validation=None, 198 | sample_weight_validation=None, 199 | log_to_file=None, 200 | max_time=None, 201 | ): 202 | lgb_train = lgb.Dataset( 203 | X.values if isinstance(X, pd.DataFrame) else X, 204 | y, 205 | weight=sample_weight, 206 | ) 207 | valid_sets = None 208 | if self.early_stopping_rounds == 0: 209 | self.model = lgb.train( 210 | self.learner_params, 211 | lgb_train, 212 | num_boost_round=self.rounds, 213 | init_model=self.model, 214 | 
) 215 | else: 216 | valid_names = None 217 | esr = None 218 | if X_validation is not None and y_validation is not None: 219 | valid_sets = [ 220 | lgb_train, 221 | lgb.Dataset( 222 | X_validation.values 223 | if isinstance(X_validation, pd.DataFrame) 224 | else X_validation, 225 | y_validation, 226 | weight=sample_weight_validation, 227 | ), 228 | ] 229 | valid_names = ["train", "validation"] 230 | esr = self.early_stopping_rounds 231 | evals_result = {} 232 | 233 | # disable for now ... 234 | # boosting_rounds = self.get_boosting_rounds(lgb_train, valid_sets, esr, max_time) 235 | 236 | self.model = lgb.train( 237 | self.learner_params, 238 | lgb_train, 239 | num_boost_round=self.rounds, 240 | valid_sets=valid_sets, 241 | valid_names=valid_names, 242 | feval=self.custom_eval_metric, 243 | callbacks=[ 244 | lgb.early_stopping(esr, verbose=False), 245 | lgb.record_evaluation(evals_result), 246 | ], 247 | ) 248 | 249 | del lgb_train 250 | if valid_sets is not None: 251 | del valid_sets[0] 252 | del valid_sets 253 | 254 | if log_to_file is not None: 255 | metric_name = list(evals_result["train"].keys())[0] 256 | result = pd.DataFrame( 257 | { 258 | "iteration": range(len(evals_result["train"][metric_name])), 259 | "train": evals_result["train"][metric_name], 260 | "validation": evals_result["validation"][metric_name], 261 | } 262 | ) 263 | result.to_csv(log_to_file, index=False, header=False) 264 | 265 | if self.params["ml_task"] != REGRESSION: 266 | self.classes_ = np.unique(y) 267 | 268 | def is_fitted(self): 269 | return self.model is not None 270 | 271 | def predict(self, X): 272 | self.reload() 273 | return self.model.predict(X.values if isinstance(X, pd.DataFrame) else X) 274 | 275 | def copy(self): 276 | with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): 277 | return copy.deepcopy(self) 278 | 279 | def save(self, model_file_path): 280 | self.model.save_model(model_file_path) 281 | self.model_file_path = model_file_path 282 | logger.debug("LightgbmAlgorithm save model to %s" % model_file_path) 283 | 284 | def load(self, model_file_path): 285 | logger.debug("LightgbmAlgorithm load model from %s" % model_file_path) 286 | self.model_file_path = model_file_path 287 | self.model = lgb.Booster(model_file=model_file_path) 288 | 289 | def get_metric_name(self): 290 | metric = self.params.get("metric") 291 | custom_metric = self.params.get("custom_eval_metric_name") 292 | 293 | if metric is None: 294 | return None 295 | if metric == "custom": 296 | return custom_metric 297 | if metric == "binary_logloss": 298 | return "logloss" 299 | elif metric == "multi_logloss": 300 | return "logloss" 301 | return metric 302 | 303 | 304 | lgbm_bin_params = { 305 | "objective": ["binary"], 306 | "num_leaves": [15, 31, 63, 95, 127], 307 | "learning_rate": [0.05, 0.1, 0.2], 308 | "feature_fraction": [0.5, 0.8, 0.9, 1.0], 309 | "bagging_fraction": [0.5, 0.8, 0.9, 1.0], 310 | "min_data_in_leaf": [5, 10, 15, 20, 30, 50], 311 | } 312 | 313 | classification_bin_default_params = { 314 | "objective": "binary", 315 | "num_leaves": 63, 316 | "learning_rate": 0.05, 317 | "feature_fraction": 0.9, 318 | "bagging_fraction": 0.9, 319 | "min_data_in_leaf": 10, 320 | } 321 | 322 | 323 | additional = { 324 | "max_rounds": 10000, 325 | "early_stopping_rounds": 50, 326 | "max_rows_limit": None, 327 | "max_cols_limit": None, 328 | } 329 | 330 | required_preprocessing = [ 331 | "missing_values_inputation", 332 | "convert_categorical", 333 | "datetime_transform", 334 | "text_transform", 335 | "target_as_integer", 336 | ] 
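# --- Editor's sketch (hedged; not part of the original module) --------------
# The dicts above wire LightGBM into the MLJAR tuner: `lgbm_bin_params`
# lists the candidate values per hyperparameter, `classification_bin_default_params`
# is the starting configuration, and `additional` caps boosting rounds and
# early stopping. A minimal illustration of drawing one random candidate
# from such a search space (`draw_candidate` is a hypothetical helper
# introduced here for illustration, not an API of this module):
#
#     import random
#
#     def draw_candidate(space):
#         # pick one value per hyperparameter from its candidate list
#         return {key: random.choice(values) for key, values in space.items()}
#
#     params = draw_candidate(lgbm_bin_params)
#     # e.g. {"objective": "binary", "num_leaves": 63, "learning_rate": 0.1,
#     #       "feature_fraction": 0.8, "bagging_fraction": 0.9,
#     #       "min_data_in_leaf": 20}
# -----------------------------------------------------------------------------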
337 | 
338 | lgbm_multi_params = copy.deepcopy(lgbm_bin_params)
339 | lgbm_multi_params["objective"] = ["multiclass"]
340 | 
341 | classification_multi_default_params = {
342 |     "objective": "multiclass",
343 |     "num_leaves": 63,
344 |     "learning_rate": 0.05,
345 |     "feature_fraction": 0.9,
346 |     "bagging_fraction": 0.9,
347 |     "min_data_in_leaf": 10,
348 | }
349 | 
350 | lgbr_params = copy.deepcopy(lgbm_bin_params)
351 | lgbr_params["objective"] = ["regression"]
352 | 
353 | 
354 | class LgbmClassifier(ClassifierMixin, LightgbmAlgorithm):
355 |     pass
356 | 
357 | 
358 | AlgorithmsRegistry.add(
359 |     BINARY_CLASSIFICATION,
360 |     LgbmClassifier,
361 |     lgbm_bin_params,
362 |     required_preprocessing,
363 |     additional,
364 |     classification_bin_default_params,
365 | )
366 | 
367 | AlgorithmsRegistry.add(
368 |     MULTICLASS_CLASSIFICATION,
369 |     LgbmClassifier,
370 |     lgbm_multi_params,
371 |     required_preprocessing,
372 |     additional,
373 |     classification_multi_default_params,
374 | )
375 | 
376 | regression_required_preprocessing = [
377 |     "missing_values_inputation",
378 |     "convert_categorical",
379 |     "datetime_transform",
380 |     "text_transform",
381 |     "target_scale",
382 | ]
383 | 
384 | 
385 | regression_default_params = {
386 |     "objective": "regression",
387 |     "num_leaves": 63,
388 |     "learning_rate": 0.05,
389 |     "feature_fraction": 0.9,
390 |     "bagging_fraction": 0.9,
391 |     "min_data_in_leaf": 10,
392 | }
393 | 
394 | 
395 | class LgbmRegressor(RegressorMixin, LightgbmAlgorithm):
396 |     pass
397 | 
398 | 
399 | AlgorithmsRegistry.add(
400 |     REGRESSION,
401 |     LgbmRegressor,
402 |     lgbr_params,
403 |     regression_required_preprocessing,
404 |     additional,
405 |     regression_default_params,
406 | )
407 | 
```

--------------------------------------------------------------------------------
/supervised/utils/shap.py:
--------------------------------------------------------------------------------

```python
1 | import logging
2 | import os
3 | 
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | shap_package_available = False
8 | try:
9 |     # I'm tired of all shap dependency hell
10 |     # ugh
11 |     import shap
12 |     shap_package_available = True
13 | except Exception:
14 |     pass
15 | 
16 | from sklearn.preprocessing import OneHotEncoder
17 | 
18 | from supervised.algorithms.registry import (
19 |     BINARY_CLASSIFICATION,
20 |     MULTICLASS_CLASSIFICATION,
21 |     REGRESSION,
22 | )
23 | 
24 | logger = logging.getLogger(__name__)
25 | from supervised.utils.config import LOG_LEVEL
26 | 
27 | logger.setLevel(LOG_LEVEL)
28 | import warnings
29 | 
30 | 
31 | class PlotSHAP:
32 |     @staticmethod
33 |     def is_available(algorithm, X_train, y_train, ml_task):
34 |         if not shap_package_available:
35 |             return False
36 |         # https://github.com/mljar/mljar-supervised/issues/112 disable for NN
37 |         # https://github.com/mljar/mljar-supervised/issues/114 disable for CatBoost
38 |         if algorithm.algorithm_short_name in ["Baseline", "Neural Network", "CatBoost"]:
39 |             return False
40 |         if (
41 |             algorithm.algorithm_short_name == "Xgboost"
42 |             and algorithm.learner_params["booster"] == "gblinear"
43 |         ):
44 |             # Xgboost gblinear is not supported by SHAP
45 |             return False
46 |         # disable for large number of columns
47 |         if X_train.shape[1] > 500:
48 |             warnings.warn(
49 |                 "Disable SHAP explanations because of number of columns > 500."
50 |             )
51 |             return False
52 |         if ml_task == MULTICLASS_CLASSIFICATION and len(np.unique(y_train)) > 100:
53 |             warnings.warn(
54 |                 "Disable SHAP explanations because of large number of classes (> 100)."
55 | ) 56 | return False 57 | if X_train.shape[0] < 20: 58 | warnings.warn( 59 | "Disable SHAP explanations because of small number of samples (< 20)." 60 | ) 61 | return False 62 | return True 63 | 64 | @staticmethod 65 | def get_explainer(algorithm, X_train): 66 | explainer = None 67 | if algorithm.algorithm_short_name in [ 68 | "Xgboost", 69 | "Decision Tree", 70 | "Random Forest", 71 | "LightGBM", 72 | "Extra Trees", 73 | "CatBoost", 74 | ]: 75 | explainer = shap.TreeExplainer(algorithm.model) 76 | elif algorithm.algorithm_short_name in ["Linear"]: 77 | explainer = shap.LinearExplainer(algorithm.model, X_train) 78 | # elif algorithm.algorithm_short_name in ["Neural Network"]: 79 | # explainer = shap.KernelExplainer(algorithm.model.predict, X_train) # slow 80 | 81 | return explainer 82 | 83 | @staticmethod 84 | def get_sample(X_validation, y_validation): 85 | # too many samples in the data, down-sample it 86 | SAMPLES_LIMIT = 1000 87 | if X_validation.shape[0] > SAMPLES_LIMIT: 88 | X_validation.reset_index(inplace=True, drop=True) 89 | y_validation.reset_index(inplace=True, drop=True) 90 | X_vald = X_validation.sample(SAMPLES_LIMIT) 91 | y_vald = y_validation[X_vald.index] 92 | else: 93 | X_vald = X_validation 94 | y_vald = y_validation 95 | return X_vald, y_vald 96 | 97 | def get_predictions(algorithm, X_vald, y_vald, ml_task): 98 | # compute predictions on down-sampled data 99 | predictions = algorithm.predict(X_vald) 100 | 101 | if ml_task == MULTICLASS_CLASSIFICATION: 102 | oh = OneHotEncoder(sparse_output=False) 103 | y_encoded = oh.fit_transform(np.array(y_vald).reshape(-1, 1)) 104 | residua = np.sum(np.abs(np.array(y_encoded) - predictions), axis=1) 105 | else: 106 | residua = np.abs(np.array(y_vald) - predictions) 107 | 108 | df_preds = pd.DataFrame( 109 | {"res": residua, "lp": range(residua.shape[0]), "target": np.array(y_vald)}, 110 | index=X_vald.index, 111 | ) 112 | df_preds = df_preds.sort_values(by="res", ascending=False) 113 | 114 | return df_preds 115 | 116 | @staticmethod 117 | def summary(shap_values, X_vald, model_file_path, learner_name, class_names): 118 | fig = plt.gcf() 119 | classes = None 120 | if class_names is not None and len(class_names): 121 | classes = class_names 122 | with warnings.catch_warnings(): 123 | warnings.simplefilter("ignore") 124 | shap.summary_plot( 125 | shap_values, X_vald, plot_type="bar", show=False, class_names=classes 126 | ) 127 | fig.tight_layout(pad=2.0) 128 | fig.savefig(os.path.join(model_file_path, f"{learner_name}_shap_summary.png")) 129 | plt.close("all") 130 | 131 | vals = None 132 | if isinstance(shap_values, list): 133 | for sh in shap_values: 134 | v = np.abs(sh).mean(0) 135 | vals = v if vals is None else vals + v 136 | else: 137 | vals = np.abs(shap_values).mean(0) 138 | feature_importance = pd.DataFrame( 139 | list(zip(X_vald.columns, vals)), columns=["feature", "shap_importance"] 140 | ) 141 | feature_importance.sort_values( 142 | by=["shap_importance"], ascending=False, inplace=True 143 | ) 144 | feature_importance.to_csv( 145 | os.path.join(model_file_path, f"{learner_name}_shap_importance.csv"), 146 | index=False, 147 | ) 148 | 149 | @staticmethod 150 | def dependence(shap_values, X_vald, model_file_path, learner_name, file_postfix=""): 151 | with warnings.catch_warnings(): 152 | warnings.simplefilter("ignore") 153 | fig = plt.figure(figsize=(14, 7)) 154 | plots_cnt = np.min([9, X_vald.shape[1]]) 155 | cols_cnt = 3 156 | rows_cnt = 3 157 | if plots_cnt < 4: 158 | rows_cnt = 1 159 | elif plots_cnt < 7: 160 | rows_cnt 
= 2 161 | for i in range(plots_cnt): 162 | ax = fig.add_subplot(rows_cnt, cols_cnt, i + 1) 163 | shap.dependence_plot( 164 | f"rank({i})", 165 | shap_values, 166 | X_vald, 167 | show=False, 168 | title=f"Importance #{i+1}", 169 | ax=ax, 170 | ) 171 | 172 | fig.tight_layout(pad=2.0) 173 | fig.savefig( 174 | os.path.join( 175 | model_file_path, f"{learner_name}_shap_dependence{file_postfix}.png" 176 | ) 177 | ) 178 | plt.close("all") 179 | 180 | @staticmethod 181 | def compute( 182 | algorithm, 183 | X_train, 184 | y_train, 185 | X_validation, 186 | y_validation, 187 | model_file_path, 188 | learner_name, 189 | class_names, 190 | ml_task, 191 | ): 192 | if not PlotSHAP.is_available(algorithm, X_train, y_train, ml_task): 193 | return 194 | try: 195 | with warnings.catch_warnings(): 196 | warnings.simplefilter("ignore") 197 | explainer = PlotSHAP.get_explainer(algorithm, X_train) 198 | X_vald, y_vald = PlotSHAP.get_sample(X_validation, y_validation) 199 | shap_values = explainer.shap_values(X_vald) 200 | 201 | # fix problem with 1 or 2 dimensions for binary classification 202 | expected_value = explainer.expected_value 203 | if ml_task == BINARY_CLASSIFICATION and isinstance(shap_values, list): 204 | shap_values = shap_values[1] 205 | expected_value = explainer.expected_value[1] 206 | 207 | # Summary SHAP plot 208 | PlotSHAP.summary( 209 | shap_values, X_vald, model_file_path, learner_name, class_names 210 | ) 211 | # Dependence SHAP plots 212 | if ml_task == MULTICLASS_CLASSIFICATION: 213 | for t in np.unique(y_vald): 214 | PlotSHAP.dependence( 215 | shap_values[t], 216 | X_vald, 217 | model_file_path, 218 | learner_name, 219 | f"_class_{class_names[t]}", 220 | ) 221 | else: 222 | PlotSHAP.dependence(shap_values, X_vald, model_file_path, learner_name) 223 | 224 | # Decision SHAP plots 225 | df_preds = PlotSHAP.get_predictions(algorithm, X_vald, y_vald, ml_task) 226 | 227 | if ml_task == REGRESSION: 228 | PlotSHAP.decisions_regression( 229 | df_preds, 230 | shap_values, 231 | expected_value, 232 | X_vald, 233 | y_vald, 234 | model_file_path, 235 | learner_name, 236 | ) 237 | elif ml_task == BINARY_CLASSIFICATION: 238 | PlotSHAP.decisions_binary( 239 | df_preds, 240 | shap_values, 241 | expected_value, 242 | X_vald, 243 | y_vald, 244 | model_file_path, 245 | learner_name, 246 | ) 247 | else: 248 | PlotSHAP.decisions_multiclass( 249 | df_preds, 250 | shap_values, 251 | expected_value, 252 | X_vald, 253 | y_vald, 254 | model_file_path, 255 | learner_name, 256 | class_names, 257 | ) 258 | except Exception as e: 259 | pass 260 | # print( 261 | # f"Exception while producing SHAP explanations. {str(e)}\nContinuing ..." 
262 | # ) 263 | 264 | @staticmethod 265 | def decisions_regression( 266 | df_preds, 267 | shap_values, 268 | expected_value, 269 | X_vald, 270 | y_vald, 271 | model_file_path, 272 | learner_name, 273 | ): 274 | fig = plt.gcf() 275 | shap.decision_plot( 276 | expected_value, 277 | shap_values[df_preds.lp[:10], :], 278 | X_vald.loc[df_preds.index[:10]], 279 | show=False, 280 | ) 281 | fig.tight_layout(pad=2.0) 282 | fig.savefig( 283 | os.path.join(model_file_path, f"{learner_name}_shap_worst_decisions.png") 284 | ) 285 | plt.close("all") 286 | 287 | fig = plt.gcf() 288 | shap.decision_plot( 289 | expected_value, 290 | shap_values[df_preds.lp[-10:], :], 291 | X_vald.loc[df_preds.index[-10:]], 292 | show=False, 293 | ) 294 | fig.tight_layout(pad=2.0) 295 | fig.savefig( 296 | os.path.join(model_file_path, f"{learner_name}_shap_best_decisions.png") 297 | ) 298 | plt.close("all") 299 | 300 | @staticmethod 301 | def decisions_binary( 302 | df_preds, 303 | shap_values, 304 | expected_value, 305 | X_vald, 306 | y_vald, 307 | model_file_path, 308 | learner_name, 309 | ): 310 | # classes are from 0 ... 311 | for t in np.unique(y_vald): 312 | fig = plt.gcf() 313 | shap.decision_plot( 314 | expected_value, 315 | shap_values[df_preds[df_preds.target == t].lp[:10], :], 316 | X_vald.loc[df_preds[df_preds.target == t].index[:10]], 317 | show=False, 318 | ) 319 | fig.tight_layout(pad=2.0) 320 | fig.savefig( 321 | os.path.join( 322 | model_file_path, 323 | f"{learner_name}_shap_class_{t}_worst_decisions.png", 324 | ) 325 | ) 326 | plt.close("all") 327 | 328 | fig = plt.gcf() 329 | shap.decision_plot( 330 | expected_value, 331 | shap_values[df_preds[df_preds.target == t].lp[-10:], :], 332 | X_vald.loc[df_preds[df_preds.target == t].index[-10:]], 333 | show=False, 334 | ) 335 | fig.tight_layout(pad=2.0) 336 | fig.savefig( 337 | os.path.join( 338 | model_file_path, f"{learner_name}_shap_class_{t}_best_decisions.png" 339 | ) 340 | ) 341 | plt.close("all") 342 | 343 | @staticmethod 344 | def decisions_multiclass( 345 | df_preds, 346 | shap_values, 347 | expected_value, 348 | X_vald, 349 | y_vald, 350 | model_file_path, 351 | learner_name, 352 | class_names, 353 | ): 354 | for decision_type in ["worst", "best"]: 355 | m = 1 if decision_type == "worst" else -1 356 | for i in range(4): 357 | fig = plt.gcf() 358 | shap.multioutput_decision_plot( 359 | list(expected_value), 360 | shap_values, 361 | row_index=df_preds.lp.iloc[m * i], 362 | show=False, 363 | legend_labels=class_names, 364 | title=f"It should be {class_names[df_preds.target.iloc[m*i]]}", 365 | ) 366 | fig.tight_layout(pad=2.0) 367 | fig.savefig( 368 | os.path.join( 369 | model_file_path, 370 | f"{learner_name}_sample_{i}_{decision_type}_decisions.png", 371 | ) 372 | ) 373 | plt.close("all") 374 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/xgboost.py: -------------------------------------------------------------------------------- ```python 1 | import copy 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import xgboost as xgb 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | 9 | from supervised.algorithms.algorithm import BaseAlgorithm 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.utils.config import LOG_LEVEL 17 | from supervised.utils.metric import ( 18 | xgboost_eval_metric_accuracy, 19 | 
xgboost_eval_metric_average_precision, 20 | xgboost_eval_metric_f1, 21 | xgboost_eval_metric_mse, 22 | xgboost_eval_metric_pearson, 23 | xgboost_eval_metric_r2, 24 | xgboost_eval_metric_spearman, 25 | xgboost_eval_metric_user_defined, 26 | ) 27 | 28 | logger = logging.getLogger(__name__) 29 | logger.setLevel(LOG_LEVEL) 30 | 31 | 32 | class XgbAlgorithmException(Exception): 33 | def __init__(self, message): 34 | super(XgbAlgorithmException, self).__init__(message) 35 | logger.error(message) 36 | 37 | 38 | def time_constraint(env): 39 | # print("time constraint") 40 | pass 41 | 42 | 43 | def xgboost_eval_metric(ml_task, automl_eval_metric): 44 | # the mapping is almost the same 45 | eval_metric_name = automl_eval_metric 46 | if ml_task == MULTICLASS_CLASSIFICATION: 47 | if automl_eval_metric == "logloss": 48 | eval_metric_name = "mlogloss" 49 | return eval_metric_name 50 | 51 | 52 | def xgboost_objective(ml_task, automl_eval_metric): 53 | objective = "reg:squarederror" 54 | if ml_task == BINARY_CLASSIFICATION: 55 | objective = "binary:logistic" 56 | elif ml_task == MULTICLASS_CLASSIFICATION: 57 | objective = "multi:softprob" 58 | else: # ml_task == REGRESSION 59 | objective = "reg:squarederror" 60 | return objective 61 | 62 | 63 | class XgbAlgorithm(BaseAlgorithm): 64 | """ 65 | This is a wrapper over xgboost algorithm. 66 | """ 67 | 68 | algorithm_name = "Extreme Gradient Boosting" 69 | algorithm_short_name = "Xgboost" 70 | 71 | def __init__(self, params): 72 | super(XgbAlgorithm, self).__init__(params) 73 | self.library_version = xgb.__version__ 74 | 75 | self.explain_level = params.get("explain_level", 0) 76 | self.boosting_rounds = additional.get("max_rounds", 10000) 77 | self.max_iters = 1 78 | self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) 79 | self.learner_params = { 80 | "tree_method": "hist", 81 | "booster": "gbtree", 82 | "objective": self.params.get("objective"), 83 | "eval_metric": self.params.get("eval_metric"), 84 | "eta": self.params.get("eta", 0.01), 85 | "max_depth": self.params.get("max_depth", 1), 86 | "min_child_weight": self.params.get("min_child_weight", 1), 87 | "subsample": self.params.get("subsample", 0.8), 88 | "colsample_bytree": self.params.get("colsample_bytree", 0.8), 89 | "n_jobs": self.params.get("n_jobs", -1), 90 | # "silent": self.params.get("silent", 1), 91 | "seed": self.params.get("seed", 1), 92 | "verbosity": 0, 93 | } 94 | 95 | if "lambda" in self.params: 96 | self.learner_params["lambda"] = self.params["lambda"] 97 | if "alpha" in self.params: 98 | self.learner_params["alpha"] = self.params["alpha"] 99 | 100 | # check https://github.com/dmlc/xgboost/issues/5637 101 | if self.learner_params["seed"] > 2147483647: 102 | self.learner_params["seed"] = self.learner_params["seed"] % 2147483647 103 | if "num_class" in self.params: # multiclass classification 104 | self.learner_params["num_class"] = self.params.get("num_class") 105 | 106 | if "max_rounds" in self.params: 107 | self.boosting_rounds = self.params["max_rounds"] 108 | 109 | self.custom_eval_metric = None 110 | if self.params.get("eval_metric", "") == "r2": 111 | self.custom_eval_metric = xgboost_eval_metric_r2 112 | elif self.params.get("eval_metric", "") == "spearman": 113 | self.custom_eval_metric = xgboost_eval_metric_spearman 114 | elif self.params.get("eval_metric", "") == "pearson": 115 | self.custom_eval_metric = xgboost_eval_metric_pearson 116 | elif self.params.get("eval_metric", "") == "f1": 117 | self.custom_eval_metric = xgboost_eval_metric_f1 118 | elif 
self.params.get("eval_metric", "") == "average_precision": 119 | self.custom_eval_metric = xgboost_eval_metric_average_precision 120 | elif self.params.get("eval_metric", "") == "accuracy": 121 | self.custom_eval_metric = xgboost_eval_metric_accuracy 122 | elif self.params.get("eval_metric", "") == "mse": 123 | self.custom_eval_metric = xgboost_eval_metric_mse 124 | elif self.params.get("eval_metric", "") == "user_defined_metric": 125 | self.custom_eval_metric = xgboost_eval_metric_user_defined 126 | 127 | logger.debug("XgbLearner __init__") 128 | 129 | """ 130 | def get_boosting_rounds(self, dtrain, evals, esr, max_time): 131 | if max_time is None: 132 | return self.boosting_rounds 133 | 134 | start_time = time.time() 135 | evals_result = {} 136 | model = xgb.train( 137 | self.learner_params, 138 | dtrain, 139 | 2, 140 | evals=evals, 141 | early_stopping_rounds=esr, 142 | evals_result=evals_result, 143 | verbose_eval=False, 144 | ) 145 | time_1_iter = (time.time() - start_time) / 2.0 146 | 147 | # 2.0 is just a scaling factor 148 | # purely heuristic 149 | iters = int(max_time / time_1_iter * 2.0) 150 | iters = max(iters, 100) 151 | iters = min(iters, 10000) 152 | return iters 153 | """ 154 | 155 | def fit( 156 | self, 157 | X, 158 | y, 159 | sample_weight=None, 160 | X_validation=None, 161 | y_validation=None, 162 | sample_weight_validation=None, 163 | log_to_file=None, 164 | max_time=None, 165 | ): 166 | dtrain = xgb.DMatrix( 167 | X.values if isinstance(X, pd.DataFrame) else X, 168 | label=y, 169 | missing=np.NaN, 170 | weight=sample_weight, 171 | ) 172 | 173 | if X_validation is not None and y_validation is not None: 174 | dvalidation = xgb.DMatrix( 175 | X_validation.values 176 | if isinstance(X_validation, pd.DataFrame) 177 | else X_validation, 178 | label=y_validation, 179 | missing=np.NaN, 180 | weight=sample_weight_validation, 181 | ) 182 | else: 183 | dvalidation = None 184 | 185 | evals_result = {} 186 | 187 | evals = [] 188 | esr = None 189 | if X_validation is not None and y_validation is not None: 190 | evals = [(dtrain, "train"), (dvalidation, "validation")] 191 | esr = self.early_stopping_rounds 192 | 193 | # disable for now, dont have better idea how to handle time limit ... 194 | # looks like there is better not to limit the algorithm 195 | # just wait till they converge ... 
196 |         # boosting_rounds = self.get_boosting_rounds(dtrain, evals, esr, max_time)
197 | 
198 |         if self.custom_eval_metric is not None:
199 |             del self.learner_params["eval_metric"]
200 | 
201 |         self.model = xgb.train(
202 |             self.learner_params,
203 |             dtrain,
204 |             self.boosting_rounds,
205 |             evals=evals,
206 |             early_stopping_rounds=esr,
207 |             evals_result=evals_result,
208 |             verbose_eval=False,
209 |             custom_metric=self.custom_eval_metric
210 |             # callbacks=[time_constraint] # callback slows down by factor ~8
211 |         )
212 | 
213 |         del dtrain
214 |         del dvalidation
215 | 
216 |         if log_to_file is not None:
217 |             metric_name = list(evals_result["train"].keys())[-1]
218 | 
219 |             result = pd.DataFrame(
220 |                 {
221 |                     "iteration": range(len(evals_result["train"][metric_name])),
222 |                     "train": evals_result["train"][metric_name],
223 |                     "validation": evals_result["validation"][metric_name],
224 |                 }
225 |             )
226 |             # if it is a custom metric
227 |             # that is always minimized,
228 |             # we need to revert it
229 |             if metric_name in [
230 |                 "r2",
231 |                 "spearman",
232 |                 "pearson",
233 |                 "f1",
234 |                 "average_precision",
235 |                 "accuracy",
236 |             ]:
237 |                 result["train"] *= -1.0
238 |                 result["validation"] *= -1.0
239 | 
240 |             result.to_csv(log_to_file, index=False, header=False)
241 | 
242 |         if self.params["ml_task"] != REGRESSION:
243 |             self.classes_ = np.unique(y)
244 | 
245 |         # fix high memory consumption in xgboost,
246 |         # waiting for release with fix
247 |         # https://github.com/dmlc/xgboost/issues/5474
248 |         """
249 |         # disable, for now all learners are saved to hard disk and then deleted from RAM
250 |         with tempfile.NamedTemporaryFile() as tmp:
251 |             self.model.save_model(tmp.name)
252 |             del self.model
253 |             self.model = xgb.Booster()
254 |             self.model.load_model(tmp.name)
255 |         """
256 | 
257 |     def is_fitted(self):
258 |         return self.model is not None
259 | 
260 |     def predict(self, X):
261 |         self.reload()
262 | 
263 |         if self.model is None:
264 |             raise XgbAlgorithmException("Xgboost model is None")
265 | 
266 |         dtrain = xgb.DMatrix(
267 |             X.values if isinstance(X, pd.DataFrame) else X, missing=np.NaN
268 |         )
269 |         # xgboost > 2.0.0 version
270 |         if hasattr(self.model, "best_iteration"):
271 |             a = self.model.predict(
272 |                 dtrain, iteration_range=(0, self.model.best_iteration + 1)
273 |             )
274 |         else:
275 |             a = self.model.predict(dtrain)
276 | 
277 |         return a
278 | 
279 |     def copy(self):
280 |         return copy.deepcopy(self)
281 | 
282 |     def save(self, model_file_path):
283 |         self.model.save_model(model_file_path)
284 |         self.model_file_path = model_file_path
285 |         logger.debug("XgbAlgorithm save model to %s" % model_file_path)
286 | 
287 |     def load(self, model_file_path):
288 |         logger.debug("XgbLearner load model from %s" % model_file_path)
289 |         self.model = xgb.Booster()  # init model
290 |         self.model.load_model(model_file_path)
291 |         self.model_file_path = model_file_path
292 | 
293 |     def file_extension(self):
294 |         # we need to keep models as json files
295 |         # to keep information about best_iteration
296 |         return "xgboost.json"
297 | 
298 |     def get_metric_name(self):
299 |         metric = self.params.get("eval_metric")
300 |         if metric is None:
301 |             return None
302 |         if metric == "mlogloss":
303 |             return "logloss"
304 |         return metric
305 | 
306 | 
307 | # For binary classification target should be 0, 1. There should be no NaNs in target.
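# Illustrative sketch (hypothetical, not part of the upstream file): encoding
# an arbitrary binary target into the required 0/1 form before fit():
#
#     import numpy as np
#     y_raw = np.array(["no", "yes", "yes", "no"])
#     y = (y_raw == "yes").astype(int)  # -> array([0, 1, 1, 0]), no NaNs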
308 | xgb_bin_class_params = { 309 | "objective": ["binary:logistic"], 310 | "eta": [0.05, 0.075, 0.1, 0.15], 311 | "max_depth": [4, 5, 6, 7, 8, 9], 312 | "min_child_weight": [1, 5, 10, 25, 50], 313 | "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 314 | "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 315 | } 316 | 317 | classification_bin_default_params = { 318 | "objective": "binary:logistic", 319 | "eta": 0.075, 320 | "max_depth": 6, 321 | "min_child_weight": 1, 322 | "subsample": 1.0, 323 | "colsample_bytree": 1.0, 324 | } 325 | 326 | xgb_regression_params = dict(xgb_bin_class_params) 327 | xgb_regression_params["objective"] = ["reg:squarederror"] 328 | # xgb_regression_params["eval_metric"] = ["rmse", "mae", "mape"] 329 | xgb_regression_params["max_depth"] = [4, 5, 6, 7, 8, 9] 330 | 331 | 332 | xgb_multi_class_params = dict(xgb_bin_class_params) 333 | xgb_multi_class_params["objective"] = ["multi:softprob"] 334 | # xgb_multi_class_params["eval_metric"] = ["mlogloss"] 335 | 336 | classification_multi_default_params = { 337 | "objective": "multi:softprob", 338 | "eta": 0.075, 339 | "max_depth": 6, 340 | "min_child_weight": 1, 341 | "subsample": 1.0, 342 | "colsample_bytree": 1.0, 343 | } 344 | 345 | 346 | regression_default_params = { 347 | "objective": "reg:squarederror", 348 | "eta": 0.075, 349 | "max_depth": 6, 350 | "min_child_weight": 1, 351 | "subsample": 1.0, 352 | "colsample_bytree": 1.0, 353 | } 354 | 355 | additional = { 356 | "max_rounds": 10000, 357 | "early_stopping_rounds": 50, 358 | "max_rows_limit": None, 359 | "max_cols_limit": None, 360 | } 361 | required_preprocessing = [ 362 | "missing_values_inputation", 363 | "convert_categorical", 364 | "datetime_transform", 365 | "text_transform", 366 | "target_as_integer", 367 | ] 368 | 369 | 370 | class XgbClassifier(ClassifierMixin, XgbAlgorithm): 371 | pass 372 | 373 | 374 | AlgorithmsRegistry.add( 375 | BINARY_CLASSIFICATION, 376 | XgbClassifier, 377 | xgb_bin_class_params, 378 | required_preprocessing, 379 | additional, 380 | classification_bin_default_params, 381 | ) 382 | 383 | AlgorithmsRegistry.add( 384 | MULTICLASS_CLASSIFICATION, 385 | XgbClassifier, 386 | xgb_multi_class_params, 387 | required_preprocessing, 388 | additional, 389 | classification_multi_default_params, 390 | ) 391 | 392 | regression_required_preprocessing = [ 393 | "missing_values_inputation", 394 | "convert_categorical", 395 | "datetime_transform", 396 | "text_transform", 397 | "target_scale", 398 | ] 399 | 400 | 401 | class XgbRegressor(RegressorMixin, XgbAlgorithm): 402 | pass 403 | 404 | 405 | AlgorithmsRegistry.add( 406 | REGRESSION, 407 | XgbRegressor, 408 | xgb_regression_params, 409 | regression_required_preprocessing, 410 | additional, 411 | regression_default_params, 412 | ) 413 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from sklearn import datasets 10 | from sklearn.decomposition import PCA 11 | from sklearn.pipeline import make_pipeline 12 | 13 | from supervised import AutoML 14 | from supervised.exceptions import AutoMLException 15 | 16 | iris = datasets.load_iris() 17 | housing = datasets.fetch_california_housing() 18 | # limit data size for faster tests 19 | housing.data = housing.data[:500] 20 | housing.target = 
housing.target[:500] 21 | breast_cancer = datasets.load_breast_cancer() 22 | 23 | 24 | @pytest.mark.usefixtures("data_folder") 25 | class AutoMLTest(unittest.TestCase): 26 | automl_dir = "AutoMLTest" 27 | data_folder: Path 28 | 29 | def tearDown(self): 30 | shutil.rmtree(self.automl_dir, ignore_errors=True) 31 | 32 | def setUp(self): 33 | shutil.rmtree(self.automl_dir, ignore_errors=True) 34 | 35 | def test_new_directory(self): 36 | """Directory does not exist, create it""" 37 | # Assert directory does not exist 38 | self.assertTrue(not os.path.exists(self.automl_dir)) 39 | # Create model with dir 40 | model = AutoML(results_path=self.automl_dir) 41 | # Generate data 42 | X, y = datasets.make_classification(n_samples=30) 43 | # Fit data 44 | model.fit(X, y) # AutoML only validates constructor params on `fit()` call 45 | # Assert directory was created 46 | self.assertTrue(os.path.exists(self.automl_dir)) 47 | 48 | def test_empty_directory(self): 49 | """Directory exists and is empty, use it""" 50 | # Assert directory does not exist 51 | self.assertTrue(not os.path.exists(self.automl_dir)) 52 | # Make dir 53 | os.mkdir(self.automl_dir) 54 | # Assert dir exists 55 | self.assertTrue(os.path.exists(self.automl_dir)) 56 | # Create automl with dir 57 | model = AutoML(results_path=self.automl_dir) 58 | # Generate data 59 | X, y = datasets.make_classification(n_samples=30) 60 | # Fit data 61 | model.fit(X, y) # AutoML only validates constructor params on `fit()` call 62 | self.assertTrue(os.path.exists(self.automl_dir)) 63 | 64 | def test_not_empty_directory(self): 65 | """ 66 | Directory exists and is not empty, 67 | there is no params.json file in it, dont use it, raise exception 68 | """ 69 | # Assert directory does not exist 70 | self.assertTrue(not os.path.exists(self.automl_dir)) 71 | # Create directory 72 | os.mkdir(self.automl_dir) 73 | # Write some content to directory 74 | open(os.path.join(self.automl_dir, "test.file"), "w").close() 75 | # Assert directory exists 76 | self.assertTrue(os.path.exists(self.automl_dir)) 77 | # Generate data 78 | X, y = datasets.make_classification(n_samples=30) 79 | # Assert than an Exception is raised 80 | with self.assertRaises(AutoMLException) as context: 81 | a = AutoML(results_path=self.automl_dir) 82 | a.fit(X, y) # AutoML only validates constructor params on `fit()` call 83 | 84 | self.assertTrue("not empty" in str(context.exception)) 85 | 86 | def test_use_directory_if_non_empty_exists_with_params_json(self): 87 | """ 88 | Directory exists and is not empty, 89 | there is params.json in it, try to load it, 90 | raise exception because of fake params.json 91 | """ 92 | # Assert directory does not exist 93 | self.assertTrue(not os.path.exists(self.automl_dir)) 94 | # Create dir 95 | os.mkdir(self.automl_dir) 96 | # Write `params.json` to directory 97 | open(os.path.join(self.automl_dir, "params.json"), "w").close() 98 | # Assert directory exists 99 | self.assertTrue(os.path.exists(self.automl_dir)) 100 | # Generate data 101 | X, y = datasets.make_classification(n_samples=30) 102 | with self.assertRaises(AutoMLException) as context: 103 | a = AutoML(results_path=self.automl_dir) 104 | a.predict(X) # AutoML tries to load on predict call 105 | self.assertTrue("Cannot load" in str(context.exception)) 106 | 107 | def test_get_params(self): 108 | """ 109 | Passes params in AutoML constructor and uses `get_params()` after fitting. 110 | Initial params must be equal to the ones returned by `get_params()`. 
111 |         """
112 |         # Create model
113 |         model = AutoML(
114 |             hill_climbing_steps=3, start_random_models=1, results_path=self.automl_dir
115 |         )
116 |         # Get params before fit
117 |         params_before_fit = model.get_params()
118 |         # Generate data
119 |         X, y = datasets.make_classification(n_samples=30)
120 |         # Fit data
121 |         model.fit(X, y)
122 |         # Get params after fit
123 |         params_after_fit = model.get_params()
124 |         # Assert before and after params are equal
125 |         self.assertEqual(params_before_fit, params_after_fit)
126 | 
127 |     def test_scikit_learn_pipeline_integration(self):
128 |         """
129 |         Tests if AutoML is working on a scikit-learn's pipeline
130 |         """
131 |         # Create dataset
132 |         X, y = datasets.make_classification(n_samples=30)
133 |         # apply PCA to X
134 |         new_X = PCA(random_state=0).fit_transform(X)
135 |         # Create default model
136 |         default_model = AutoML(
137 |             algorithms=["Linear"], random_state=0, results_path=self.automl_dir
138 |         )
139 |         # Fit default model with transformed X and y, and predict transformed X
140 |         y_pred_default = default_model.fit(new_X, y).predict(new_X)
141 | 
142 |         # Create pipeline with PCA and AutoML
143 |         pipeline = make_pipeline(
144 |             PCA(random_state=0), AutoML(algorithms=["Linear"], random_state=0)
145 |         )
146 |         # Fit with original X and y and predict X
147 |         y_pred_pipe = pipeline.fit(X, y).predict(X)
148 |         # y_pred_default must be equal to y_pred_pipe
149 |         self.assertTrue((y_pred_pipe == y_pred_default).all())
150 | 
151 |     def test_predict_proba_in_regression(self):
152 |         model = AutoML(
153 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
154 |         )
155 |         model.fit(housing.data, housing.target)
156 |         with self.assertRaises(AutoMLException) as context:
157 |             # Try to call predict_proba in regression task
158 |             model.predict_proba(housing.data)
159 | 
160 |     def test_iris_dataset(self):
161 |         """Tests AutoML in the iris dataset (Multiclass classification)"""
162 |         model = AutoML(
163 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
164 |         )
165 |         score = model.fit(iris.data, iris.target).score(iris.data, iris.target)
166 |         self.assertGreater(score, 0.5)
167 | 
168 |     def test_housing_dataset(self):
169 |         """Tests AutoML in the housing dataset (Regression)"""
170 |         model = AutoML(
171 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
172 |         )
173 |         score = model.fit(housing.data, housing.target).score(
174 |             housing.data, housing.target
175 |         )
176 |         self.assertGreater(score, 0.5)
177 | 
178 |     def test_breast_cancer_dataset(self):
179 |         """Tests AutoML in the breast cancer (binary classification)"""
180 |         model = AutoML(
181 |             explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir
182 |         )
183 |         score = model.fit(breast_cancer.data, breast_cancer.target).score(
184 |             breast_cancer.data, breast_cancer.target
185 |         )
186 |         self.assertGreater(score, 0.5)
187 | 
188 |     def test_titatic_dataset(self):
189 |         """Tests AutoML in the titanic dataset (binary classification) with categorical features"""
190 |         data_folder = self.data_folder
191 |         automl = AutoML(
192 |             algorithms=["Xgboost"], mode="Explain", results_path=self.automl_dir
193 |         )
194 | 
195 |         df = pd.read_csv((data_folder / "Titanic/train.csv"))
196 | 
197 |         X = df[df.columns[2:]]
198 |         y = df["Survived"]
199 | 
200 |         automl.fit(X, y)
201 | 
202 |         test = pd.read_csv(data_folder / "Titanic/test_with_Survived.csv")
203 |         test_cols = [
204 |             "Parch",
205 |             "Ticket",
206 |             "Fare",
207 |             "Pclass",
208 |             "Name",
209 |             "Sex",
210 | 
"Age", 211 | "SibSp", 212 | "Cabin", 213 | "Embarked", 214 | ] 215 | score = automl.score(test[test_cols], test["Survived"]) 216 | self.assertGreater(score, 0.5) 217 | 218 | def test_score_without_y(self): 219 | """Tests the use of `score()` without passing y. Should raise AutoMLException""" 220 | model = AutoML( 221 | explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir 222 | ) 223 | # Assert than an Exception is raised 224 | with self.assertRaises(AutoMLException) as context: 225 | # Try to score without passing 'y' 226 | score = model.fit(breast_cancer.data, breast_cancer.target).score( 227 | breast_cancer.data 228 | ) 229 | 230 | self.assertTrue("y must be specified" in str(context.exception)) 231 | 232 | def test_no_constructor_args(self): 233 | """Tests the use of AutoML without passing any args. Should work without any arguments""" 234 | # Create model with no arguments 235 | model = AutoML() 236 | model.results_path = self.automl_dir 237 | # Assert than an Exception is raised 238 | score = model.fit(iris.data, iris.target).score(iris.data, iris.target) 239 | self.assertGreater(score, 0.5) 240 | 241 | def test_fit_returns_self(self): 242 | """Tests if the `fit()` method returns `self`. This allows to quickly implement one-liners with AutoML""" 243 | model = AutoML() 244 | model.results_path = self.automl_dir 245 | self.assertTrue( 246 | isinstance(model.fit(iris.data, iris.target), AutoML), 247 | "`fit()` method must return 'self'", 248 | ) 249 | 250 | def test_invalid_mode(self): 251 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 252 | param = {"mode": "invalid_mode"} 253 | model.set_params(**param) 254 | with self.assertRaises(ValueError) as context: 255 | model.fit(iris.data, iris.target) 256 | 257 | def test_invalid_ml_task(self): 258 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 259 | param = {"ml_task": "invalid_task"} 260 | model.set_params(**param) 261 | with self.assertRaises(ValueError) as context: 262 | model.fit(iris.data, iris.target) 263 | 264 | def test_invalid_results_path(self): 265 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 266 | param = {"results_path": 2} 267 | model.set_params(**param) 268 | with self.assertRaises(ValueError) as context: 269 | model.fit(iris.data, iris.target) 270 | 271 | def test_invalid_total_time_limit(self): 272 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 273 | param = {"total_time_limit": -1} 274 | model.set_params(**param) 275 | with self.assertRaises(ValueError) as context: 276 | model.fit(iris.data, iris.target) 277 | 278 | def test_invalid_model_time_limit(self): 279 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 280 | param = {"model_time_limit": -1} 281 | model.set_params(**param) 282 | with self.assertRaises(ValueError) as context: 283 | model.fit(iris.data, iris.target) 284 | 285 | def test_invalid_algorithm_name(self): 286 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 287 | param = {"algorithms": ["Baseline", "Neural Netrk"]} 288 | model.set_params(**param) 289 | with self.assertRaises(ValueError) as context: 290 | model.fit(iris.data, iris.target) 291 | 292 | def test_invalid_train_ensemble(self): 293 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 294 | param = {"train_ensemble": "not bool"} 295 | model.set_params(**param) 296 | with self.assertRaises(ValueError) as context: 297 | model.fit(iris.data, 
iris.target) 298 | 299 | def test_invalid_stack_models(self): 300 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 301 | param = {"stack_models": "not bool"} 302 | model.set_params(**param) 303 | with self.assertRaises(ValueError) as context: 304 | model.fit(iris.data, iris.target) 305 | 306 | def test_invalid_eval_metric(self): 307 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 308 | param = {"eval_metric": "not_real_metric"} 309 | model.set_params(**param) 310 | with self.assertRaises(ValueError) as context: 311 | model.fit(iris.data, iris.target) 312 | 313 | def test_invalid_validation_strategy(self): 314 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 315 | param = {"validation_strategy": "test"} 316 | model.set_params(**param) 317 | with self.assertRaises(ValueError) as context: 318 | model.fit(iris.data, iris.target) 319 | 320 | def test_invalid_verbose(self): 321 | model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) 322 | param = {"verbose": -1} 323 | model.set_params(**param) 324 | with self.assertRaises(ValueError) as context: 325 | model.fit(iris.data, iris.target) 326 | 327 | def test_too_small_time_limit(self): 328 | rows = 1000000 329 | X = np.random.uniform(size=(rows, 100)) 330 | y = np.random.randint(0, 2, size=(rows,)) 331 | 332 | automl = AutoML( 333 | results_path=self.automl_dir, total_time_limit=1, train_ensemble=False 334 | ) 335 | with self.assertRaises(AutoMLException) as context: 336 | automl.fit(X, y) 337 | ``` -------------------------------------------------------------------------------- /supervised/utils/metric.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy as sp 8 | from sklearn.metrics import ( 9 | accuracy_score, 10 | average_precision_score, 11 | f1_score, 12 | log_loss, 13 | mean_absolute_error, 14 | mean_absolute_percentage_error, 15 | mean_squared_error, 16 | mean_squared_log_error, 17 | r2_score, 18 | roc_auc_score, 19 | ) 20 | 21 | 22 | def logloss(y_true, y_predicted, sample_weight=None): 23 | # convert predicted values to float32 to avoid warnings 24 | ll = log_loss(y_true, y_predicted.astype(np.float32), sample_weight=sample_weight) 25 | return ll 26 | 27 | 28 | def rmse(y_true, y_predicted, sample_weight=None): 29 | val = mean_squared_error(y_true, y_predicted, sample_weight=sample_weight) 30 | return np.sqrt(val) if val > 0 else -np.Inf 31 | 32 | 33 | def rmsle(y_true, y_predicted, sample_weight=None): 34 | val = mean_squared_log_error(y_true, y_predicted, sample_weight=sample_weight) 35 | return np.sqrt(val) if val > 0 else -np.Inf 36 | 37 | 38 | def negative_auc(y_true, y_predicted, sample_weight=None): 39 | val = roc_auc_score(y_true, y_predicted, sample_weight=sample_weight) 40 | return -1.0 * val 41 | 42 | 43 | def negative_r2(y_true, y_predicted, sample_weight=None): 44 | val = r2_score(y_true, y_predicted, sample_weight=sample_weight) 45 | return -1.0 * val 46 | 47 | 48 | def negative_f1(y_true, y_predicted, sample_weight=None): 49 | if isinstance(y_true, pd.DataFrame): 50 | y_true = np.array(y_true) 51 | if isinstance(y_predicted, pd.DataFrame): 52 | y_predicted = np.array(y_predicted) 53 | 54 | if len(y_predicted.shape) == 2 and y_predicted.shape[1] == 1: 55 | y_predicted = y_predicted.ravel() 56 | 57 | average = None 58 | if len(y_predicted.shape) == 1: 59 | 
y_predicted = (y_predicted > 0.5).astype(int) 60 | average = "binary" 61 | else: 62 | y_predicted = np.argmax(y_predicted, axis=1) 63 | average = "micro" 64 | 65 | val = f1_score(y_true, y_predicted, sample_weight=sample_weight, average=average) 66 | 67 | return -val 68 | 69 | 70 | def negative_accuracy(y_true, y_predicted, sample_weight=None): 71 | if isinstance(y_true, pd.DataFrame): 72 | y_true = np.array(y_true) 73 | if isinstance(y_predicted, pd.DataFrame): 74 | y_predicted = np.array(y_predicted) 75 | 76 | if len(y_predicted.shape) == 2 and y_predicted.shape[1] == 1: 77 | y_predicted = y_predicted.ravel() 78 | 79 | if len(y_predicted.shape) == 1: 80 | y_predicted = (y_predicted > 0.5).astype(int) 81 | else: 82 | y_predicted = np.argmax(y_predicted, axis=1) 83 | 84 | val = accuracy_score(y_true, y_predicted, sample_weight=sample_weight) 85 | 86 | return -val 87 | 88 | 89 | def negative_average_precision(y_true, y_predicted, sample_weight=None): 90 | if isinstance(y_true, pd.DataFrame): 91 | y_true = np.array(y_true) 92 | if isinstance(y_predicted, pd.DataFrame): 93 | y_predicted = np.array(y_predicted) 94 | 95 | val = average_precision_score(y_true, y_predicted, sample_weight=sample_weight) 96 | 97 | return -val 98 | 99 | 100 | def negative_spearman(y_true, y_predicted, sample_weight=None): 101 | # sample weight is ignored 102 | c, _ = sp.stats.spearmanr(y_true, y_predicted) 103 | return -c 104 | 105 | 106 | def spearman(y_true, y_predicted, sample_weight=None): 107 | # sample weight is ignored 108 | c, _ = sp.stats.spearmanr(y_true, y_predicted) 109 | return c 110 | 111 | 112 | def negative_pearson(y_true, y_predicted, sample_weight=None): 113 | # sample weight is ignored 114 | if isinstance(y_true, pd.DataFrame): 115 | y_true = np.array(y_true).ravel() 116 | if isinstance(y_predicted, pd.DataFrame): 117 | y_predicted = np.array(y_predicted).ravel() 118 | return -np.corrcoef(y_true, y_predicted)[0, 1] 119 | 120 | 121 | def pearson(y_true, y_predicted, sample_weight=None): 122 | return -negative_pearson(y_true, y_predicted, sample_weight) 123 | 124 | 125 | class MetricException(Exception): 126 | def __init__(self, message): 127 | Exception.__init__(self, message) 128 | log.error(message) 129 | 130 | 131 | def xgboost_eval_metric_r2(preds, dtrain): 132 | # Xgboost needs to minimize eval_metric 133 | target = dtrain.get_label() 134 | weight = dtrain.get_weight() 135 | if len(weight) == 0: 136 | weight = None 137 | return "r2", -r2_score(target, preds, sample_weight=weight) 138 | 139 | 140 | def xgboost_eval_metric_spearman(preds, dtrain): 141 | # Xgboost needs to minimize eval_metric 142 | target = dtrain.get_label() 143 | return "spearman", negative_spearman(target, preds) 144 | 145 | 146 | def xgboost_eval_metric_pearson(preds, dtrain): 147 | # Xgboost needs to minimize eval_metric 148 | target = dtrain.get_label() 149 | return "pearson", negative_pearson(target, preds) 150 | 151 | 152 | def xgboost_eval_metric_f1(preds, dtrain): 153 | # Xgboost needs to minimize eval_metric 154 | target = dtrain.get_label() 155 | weight = dtrain.get_weight() 156 | if len(weight) == 0: 157 | weight = None 158 | return "f1", negative_f1(target, preds, weight) 159 | 160 | 161 | def xgboost_eval_metric_average_precision(preds, dtrain): 162 | # Xgboost needs to minimize eval_metric 163 | target = dtrain.get_label() 164 | weight = dtrain.get_weight() 165 | if len(weight) == 0: 166 | weight = None 167 | return "average_precision", negative_average_precision(target, preds, weight) 168 | 169 | 170 | def 
xgboost_eval_metric_accuracy(preds, dtrain): 171 | # Xgboost needs to minimize eval_metric 172 | target = dtrain.get_label() 173 | weight = dtrain.get_weight() 174 | if len(weight) == 0: 175 | weight = None 176 | return "accuracy", negative_accuracy(target, preds, weight) 177 | 178 | 179 | def xgboost_eval_metric_mse(preds, dtrain): 180 | # Xgboost needs to minimize eval_metric 181 | target = dtrain.get_label() 182 | weight = dtrain.get_weight() 183 | if len(weight) == 0: 184 | weight = None 185 | return "mse", mean_squared_error(target, preds, sample_weight=weight) 186 | 187 | 188 | def lightgbm_eval_metric_r2(preds, dtrain): 189 | target = dtrain.get_label() 190 | weight = dtrain.get_weight() 191 | return "r2", r2_score(target, preds, sample_weight=weight), True 192 | 193 | 194 | def lightgbm_eval_metric_spearman(preds, dtrain): 195 | target = dtrain.get_label() 196 | return "spearman", -negative_spearman(target, preds), True 197 | 198 | 199 | def lightgbm_eval_metric_pearson(preds, dtrain): 200 | target = dtrain.get_label() 201 | return "pearson", -negative_pearson(target, preds), True 202 | 203 | 204 | def lightgbm_eval_metric_f1(preds, dtrain): 205 | target = dtrain.get_label() 206 | weight = dtrain.get_weight() 207 | 208 | unique_targets = np.unique(target) 209 | if len(unique_targets) > 2: 210 | cols = len(unique_targets) 211 | rows = int(preds.shape[0] / len(unique_targets)) 212 | preds = np.reshape(preds, (rows, cols), order="F") 213 | 214 | return "f1", -negative_f1(target, preds, weight), True 215 | 216 | 217 | def lightgbm_eval_metric_average_precision(preds, dtrain): 218 | target = dtrain.get_label() 219 | weight = dtrain.get_weight() 220 | 221 | return "average_precision", -negative_average_precision(target, preds, weight), True 222 | 223 | 224 | def lightgbm_eval_metric_accuracy(preds, dtrain): 225 | target = dtrain.get_label() 226 | weight = dtrain.get_weight() 227 | 228 | return "accuracy", -negative_accuracy(target, preds, weight), True 229 | 230 | 231 | class CatBoostEvalMetricSpearman(object): 232 | def get_final_error(self, error, weight): 233 | return error 234 | 235 | def is_max_optimal(self): 236 | return True 237 | 238 | def evaluate(self, approxes, target, weight): 239 | assert len(approxes) == 1 240 | assert len(target) == len(approxes[0]) 241 | 242 | preds = np.array(approxes[0]) 243 | target = np.array(target) 244 | 245 | return -negative_spearman(target, preds), 0 246 | 247 | 248 | class CatBoostEvalMetricPearson(object): 249 | def get_final_error(self, error, weight): 250 | return error 251 | 252 | def is_max_optimal(self): 253 | return True 254 | 255 | def evaluate(self, approxes, target, weight): 256 | assert len(approxes) == 1 257 | assert len(target) == len(approxes[0]) 258 | 259 | preds = np.array(approxes[0]) 260 | target = np.array(target) 261 | 262 | return -negative_pearson(target, preds), 0 263 | 264 | 265 | class CatBoostEvalMetricAveragePrecision(object): 266 | def get_final_error(self, error, weight): 267 | return error 268 | 269 | def is_max_optimal(self): 270 | return True 271 | 272 | def evaluate(self, approxes, target, weight): 273 | assert len(approxes) == 1 274 | assert len(target) == len(approxes[0]) 275 | 276 | preds = np.array(approxes[0]) 277 | target = np.array(target) 278 | if weight is not None: 279 | weight = np.array(weight) 280 | 281 | return -negative_average_precision(target, preds, weight), 0 282 | 283 | 284 | class CatBoostEvalMetricMSE(object): 285 | def get_final_error(self, error, weight): 286 | return error 287 | 288 | def 
is_max_optimal(self): 289 | return False 290 | 291 | def evaluate(self, approxes, target, weight): 292 | assert len(approxes) == 1 293 | assert len(target) == len(approxes[0]) 294 | 295 | preds = np.array(approxes[0]) 296 | target = np.array(target) 297 | if weight is not None: 298 | weight = np.array(weight) 299 | 300 | return mean_squared_error(target, preds, sample_weight=weight), 0 301 | 302 | 303 | class UserDefinedEvalMetric: 304 | # should always minimize 305 | eval_metric = mean_squared_error # set the default 306 | 307 | def set_metric(self, feval): 308 | UserDefinedEvalMetric.eval_metric = feval 309 | 310 | def __call__(self, y_true, y_predicted, sample_weight=None): 311 | return UserDefinedEvalMetric.eval_metric(y_true, y_predicted, sample_weight) 312 | 313 | 314 | def xgboost_eval_metric_user_defined(preds, dtrain): 315 | target = dtrain.get_label() 316 | weight = dtrain.get_weight() 317 | if len(weight) == 0: 318 | weight = None 319 | metric = UserDefinedEvalMetric() 320 | return "user_defined_metric", metric(target, preds, sample_weight=weight) 321 | 322 | 323 | def lightgbm_eval_metric_user_defined(preds, dtrain): 324 | target = dtrain.get_label() 325 | weight = dtrain.get_weight() 326 | metric = UserDefinedEvalMetric() 327 | return "user_defined_metric", metric(target, preds, sample_weight=weight), False 328 | 329 | 330 | class CatBoostEvalMetricUserDefined(object): 331 | def get_final_error(self, error, weight): 332 | return error 333 | 334 | def is_max_optimal(self): 335 | return False 336 | 337 | def evaluate(self, approxes, target, weight): 338 | assert len(approxes) == 1 339 | assert len(target) == len(approxes[0]) 340 | 341 | preds = np.array(approxes[0]) 342 | target = np.array(target) 343 | if weight is not None: 344 | weight = np.array(weight) 345 | 346 | metric = UserDefinedEvalMetric() 347 | return metric(target, preds, sample_weight=weight), 0 348 | 349 | 350 | class Metric(object): 351 | def __init__(self, params): 352 | if params is None: 353 | raise MetricException("Metric params not defined") 354 | self.params = params 355 | self.name = self.params.get("name") 356 | if self.name is None: 357 | raise MetricException("Metric name not defined") 358 | 359 | self.minimize_direction = self.name in [ 360 | "logloss", 361 | "auc", # negative auc 362 | "rmse", 363 | "mae", 364 | "mse", 365 | "r2", # negative r2 366 | "mape", 367 | "spearman", # negative 368 | "pearson", # negative 369 | "f1", # negative 370 | "average_precision", # negative 371 | "accuracy", # negative 372 | "user_defined_metric", 373 | ] 374 | if self.name == "logloss": 375 | self.metric = logloss 376 | elif self.name == "auc": 377 | self.metric = negative_auc 378 | elif self.name == "acc": 379 | self.metric = accuracy_score 380 | elif self.name == "rmse": 381 | self.metric = rmse 382 | elif self.name == "mse": 383 | self.metric = mean_squared_error 384 | elif self.name == "mae": 385 | self.metric = mean_absolute_error 386 | elif self.name == "r2": 387 | self.metric = negative_r2 388 | elif self.name == "mape": 389 | self.metric = mean_absolute_percentage_error 390 | elif self.name == "spearman": 391 | self.metric = negative_spearman 392 | elif self.name == "pearson": 393 | self.metric = negative_pearson 394 | elif self.name == "f1": 395 | self.metric = negative_f1 396 | elif self.name == "average_precision": 397 | self.metric = negative_average_precision 398 | elif self.name == "accuracy": 399 | self.metric = negative_accuracy 400 | elif self.name == "user_defined_metric": 401 | self.metric = 
UserDefinedEvalMetric.eval_metric 402 | # elif self.name == "rmsle": # need to update target preprocessing 403 | # self.metric = rmsle # to assure that target is not negative ... 404 | else: 405 | raise MetricException(f"Unknown metric '{self.name}'") 406 | 407 | def __call__(self, y_true, y_predicted, sample_weight=None): 408 | return self.metric(y_true, y_predicted, sample_weight=sample_weight) 409 | 410 | def improvement(self, previous, current): 411 | if self.minimize_direction: 412 | return current < previous 413 | return current > previous 414 | 415 | def get_maximum(self): 416 | if self.minimize_direction: 417 | return 10e12 418 | else: 419 | return -10e12 420 | 421 | def worst_value(self): 422 | if self.minimize_direction: 423 | return np.Inf 424 | return -np.Inf 425 | 426 | def get_minimize_direction(self): 427 | return self.minimize_direction 428 | 429 | def is_negative(self): 430 | return self.name in [ 431 | "auc", 432 | "r2", 433 | "spearman", 434 | "pearson", 435 | "f1", 436 | "average_precision", 437 | "accuracy", 438 | ] 439 | 440 | @staticmethod 441 | def optimize_negative(metric_name): 442 | return metric_name in [ 443 | "auc", 444 | "r2", 445 | "spearman", 446 | "pearson", 447 | "f1", 448 | "average_precision", 449 | "accuracy", 450 | ] 451 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/catboost.py: -------------------------------------------------------------------------------- ```python 1 | import copy 2 | import logging 3 | import time 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | 9 | from supervised.algorithms.algorithm import BaseAlgorithm 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 17 | from supervised.utils.config import LOG_LEVEL 18 | from supervised.utils.metric import ( 19 | CatBoostEvalMetricAveragePrecision, 20 | CatBoostEvalMetricMSE, 21 | CatBoostEvalMetricPearson, 22 | CatBoostEvalMetricSpearman, 23 | CatBoostEvalMetricUserDefined, 24 | ) 25 | 26 | logger = logging.getLogger(__name__) 27 | logger.setLevel(LOG_LEVEL) 28 | 29 | import catboost 30 | from catboost import CatBoostClassifier, CatBoostRegressor, Pool 31 | 32 | 33 | def catboost_eval_metric(ml_task, eval_metric): 34 | if eval_metric == "user_defined_metric": 35 | return eval_metric 36 | metric_name_mapping = { 37 | BINARY_CLASSIFICATION: { 38 | "auc": "AUC", 39 | "logloss": "Logloss", 40 | "f1": "F1", 41 | "average_precision": "average_precision", 42 | "accuracy": "Accuracy", 43 | }, 44 | MULTICLASS_CLASSIFICATION: { 45 | "logloss": "MultiClass", 46 | "f1": "TotalF1:average=Micro", 47 | "accuracy": "Accuracy", 48 | }, 49 | REGRESSION: { 50 | "rmse": "RMSE", 51 | "mse": "mse", 52 | "mae": "MAE", 53 | "mape": "MAPE", 54 | "r2": "R2", 55 | "spearman": "spearman", 56 | "pearson": "pearson", 57 | }, 58 | } 59 | return metric_name_mapping[ml_task][eval_metric] 60 | 61 | 62 | def catboost_objective(ml_task, eval_metric): 63 | objective = "RMSE" 64 | if ml_task == BINARY_CLASSIFICATION: 65 | objective = "Logloss" 66 | elif ml_task == MULTICLASS_CLASSIFICATION: 67 | objective = "MultiClass" 68 | else: # ml_task == REGRESSION 69 | objective = catboost_eval_metric(REGRESSION, eval_metric) 70 | if objective in [ 71 | "mse", 72 | "R2", 73 | "spearman", 74 | "pearson", 75 | 
"user_defined_metric", 76 | ]: # cant optimize them directly 77 | objective = "RMSE" 78 | return objective 79 | 80 | 81 | class CatBoostAlgorithm(BaseAlgorithm): 82 | algorithm_name = "CatBoost" 83 | algorithm_short_name = "CatBoost" 84 | warmup_iterations = 20 85 | 86 | def __init__(self, params): 87 | super(CatBoostAlgorithm, self).__init__(params) 88 | self.library_version = catboost.__version__ 89 | self.snapshot_file_path = "training_snapshot" 90 | 91 | self.explain_level = params.get("explain_level", 0) 92 | self.rounds = additional.get("max_rounds", 10000) 93 | self.max_iters = 1 94 | self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) 95 | 96 | Algo = CatBoostClassifier 97 | loss_function = "Logloss" 98 | if self.params["ml_task"] == BINARY_CLASSIFICATION: 99 | loss_function = self.params.get("loss_function", "Logloss") 100 | elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: 101 | loss_function = self.params.get("loss_function", "MultiClass") 102 | elif self.params["ml_task"] == REGRESSION: 103 | loss_function = self.params.get("loss_function", "RMSE") 104 | Algo = CatBoostRegressor 105 | 106 | cat_params = { 107 | "iterations": self.params.get("num_boost_round", self.rounds), 108 | "learning_rate": self.params.get("learning_rate", 0.1), 109 | "depth": self.params.get("depth", 3), 110 | "rsm": self.params.get("rsm", 1.0), 111 | "l2_leaf_reg": self.params.get("l2_leaf_reg", 3.0), 112 | "random_strength": self.params.get("random_strength", 1.0), 113 | "loss_function": loss_function, 114 | "eval_metric": self.params.get("eval_metric", loss_function), 115 | # "custom_metric": self.params.get("eval_metric", loss_function), 116 | "thread_count": self.params.get("n_jobs", -1), 117 | "verbose": False, 118 | "allow_writing_files": False, 119 | "random_seed": self.params.get("seed", 1), 120 | } 121 | 122 | for extra_param in [ 123 | "min_data_in_leaf", 124 | "bootstrap_type", 125 | "bagging_temperature", 126 | "subsample", 127 | "border_count", 128 | ]: 129 | if extra_param in self.params: 130 | cat_params[extra_param] = self.params[extra_param] 131 | 132 | self.log_metric_name = cat_params["eval_metric"] 133 | if cat_params["eval_metric"] == "spearman": 134 | cat_params["eval_metric"] = CatBoostEvalMetricSpearman() 135 | self.log_metric_name = "CatBoostEvalMetricSpearman" 136 | elif cat_params["eval_metric"] == "pearson": 137 | cat_params["eval_metric"] = CatBoostEvalMetricPearson() 138 | self.log_metric_name = "CatBoostEvalMetricPearson" 139 | elif cat_params["eval_metric"] == "average_precision": 140 | cat_params["eval_metric"] = CatBoostEvalMetricAveragePrecision() 141 | self.log_metric_name = "CatBoostEvalMetricAveragePrecision" 142 | elif cat_params["eval_metric"] == "mse": 143 | cat_params["eval_metric"] = CatBoostEvalMetricMSE() 144 | self.log_metric_name = "CatBoostEvalMetricMSE" 145 | elif cat_params["eval_metric"] == "user_defined_metric": 146 | cat_params["eval_metric"] = CatBoostEvalMetricUserDefined() 147 | self.log_metric_name = "CatBoostEvalMetricUserDefined" 148 | 149 | self.model = Algo(**cat_params) 150 | self.cat_features = None 151 | self.best_ntree_limit = 0 152 | 153 | logger.debug("CatBoostAlgorithm.__init__") 154 | 155 | def _assess_iterations(self, X, y, sample_weight, eval_set, max_time=None): 156 | if max_time is None: 157 | max_time = 3600 158 | try: 159 | model = copy.deepcopy(self.model) 160 | model.set_params(iterations=self.warmup_iterations) 161 | start_time = time.time() 162 | model.fit( 163 | X, 164 | y, 165 | 
sample_weight=sample_weight,
166 |                 cat_features=self.cat_features,
167 |                 init_model=None if self.model.tree_count_ is None else self.model,
168 |                 eval_set=eval_set,
169 |                 early_stopping_rounds=self.early_stopping_rounds,
170 |                 verbose_eval=False,
171 |             )
172 |             elapsed_time = (time.time() - start_time) / float(self.warmup_iterations)
173 |             # print(max_time, elapsed_time, max_time / elapsed_time, np.round(time.time() - start_time, 2))
174 |             new_rounds = int(min(10000, max_time / elapsed_time))
175 |             new_rounds = max(new_rounds, 10)
176 |             return model, new_rounds
177 |         except Exception as e:
178 |             # print(str(e))
179 |             return None, 1000
180 | 
181 |     def fit(
182 |         self,
183 |         X,
184 |         y,
185 |         sample_weight=None,
186 |         X_validation=None,
187 |         y_validation=None,
188 |         sample_weight_validation=None,
189 |         log_to_file=None,
190 |         max_time=None,
191 |     ):
192 |         if self.is_fitted():
193 |             print("CatBoost model already fitted. Skip fit().")
194 |             return
195 | 
196 |         if self.cat_features is None:
197 |             self.cat_features = []
198 |             for i in range(X.shape[1]):
199 |                 if PreprocessingUtils.is_categorical(X.iloc[:, i]):
200 |                     self.cat_features += [i]
201 |                     col_name = X.columns[i]
202 |                     X[col_name] = X[col_name].astype(str)
203 |                     if X_validation is not None:
204 |                         X_validation[col_name] = X_validation[col_name].astype(str)
205 | 
206 |         eval_set = None
207 |         if X_validation is not None and y_validation is not None:
208 |             eval_set = Pool(
209 |                 data=X_validation,
210 |                 label=y_validation,
211 |                 cat_features=self.cat_features,
212 |                 weight=sample_weight_validation,
213 |             )
214 | 
215 |         if self.params.get("num_boost_round") is None:
216 |             model_init, new_iterations = self._assess_iterations(
217 |                 X, y, sample_weight, eval_set, max_time
218 |             )
219 |             self.model.set_params(iterations=new_iterations)
220 |         else:
221 |             model_init = None
222 |             self.model.set_params(iterations=self.params.get("num_boost_round"))
223 |             self.early_stopping_rounds = self.params.get("early_stopping_rounds", 50)
224 | 
225 |         self.model.fit(
226 |             X,
227 |             y,
228 |             sample_weight=sample_weight,
229 |             cat_features=self.cat_features,
230 |             init_model=model_init,
231 |             eval_set=eval_set,
232 |             early_stopping_rounds=self.early_stopping_rounds,
233 |             verbose_eval=False,
234 |         )
235 | 
236 |         if self.model.best_iteration_ is not None:
237 |             if model_init is not None:
238 |                 self.best_ntree_limit = (
239 |                     self.model.best_iteration_ + model_init.tree_count_ + 1
240 |                 )
241 |             else:
242 |                 self.best_ntree_limit = self.model.best_iteration_ + 1
243 | 
244 |         else:
245 |             # just take all the trees
246 |             # the warm-up trees are already included
247 |             # dont need to add +1
248 |             self.best_ntree_limit = self.model.tree_count_
249 | 
250 |         if log_to_file is not None:
251 |             train_scores = self.model.evals_result_["learn"].get(self.log_metric_name)
252 |             validation_scores = self.model.evals_result_["validation"].get(
253 |                 self.log_metric_name
254 |             )
255 |             if model_init is not None:
256 |                 if train_scores is not None:
257 |                     train_scores = (
258 |                         model_init.evals_result_["learn"].get(self.log_metric_name)
259 |                         + train_scores
260 |                     )
261 |                 if validation_scores is not None:
262 |                     validation_scores = (
263 |                         model_init.evals_result_["validation"].get(self.log_metric_name)
264 |                         + validation_scores
265 |                     )
266 |             iteration = None
267 |             if train_scores is not None:
268 |                 iteration = range(len(train_scores))
269 |             elif validation_scores is not None:
270 |                 iteration = range(len(validation_scores))
271 | 
272 |             result = pd.DataFrame(
273 |                 {
274 |                     "iteration": iteration,
275 | "train": train_scores, 276 | "validation": validation_scores, 277 | } 278 | ) 279 | result.to_csv(log_to_file, index=False, header=False) 280 | 281 | if self.params["ml_task"] != REGRESSION: 282 | self.classes_ = np.unique(y) 283 | 284 | def is_fitted(self): 285 | return self.model is not None and self.model.tree_count_ is not None 286 | 287 | def predict(self, X): 288 | self.reload() 289 | if self.params["ml_task"] == BINARY_CLASSIFICATION: 290 | return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)[:, 1] 291 | elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: 292 | return self.model.predict_proba(X, ntree_end=self.best_ntree_limit) 293 | 294 | return self.model.predict(X, ntree_end=self.best_ntree_limit) 295 | 296 | def copy(self): 297 | return copy.deepcopy(self) 298 | 299 | def save(self, model_file_path): 300 | self.model.save_model(model_file_path) 301 | self.model_file_path = model_file_path 302 | logger.debug("CatBoostAlgorithm save model to %s" % model_file_path) 303 | 304 | def load(self, model_file_path): 305 | logger.debug("CatBoostLearner load model from %s" % model_file_path) 306 | 307 | # waiting for fix https://github.com/catboost/catboost/issues/696 308 | Algo = CatBoostClassifier 309 | if self.params["ml_task"] == REGRESSION: 310 | Algo = CatBoostRegressor 311 | 312 | # loading might throw warnings in the case of custom eval_metric 313 | # check https://github.com/catboost/catboost/issues/1169 314 | self.model = Algo().load_model(model_file_path) 315 | self.model_file_path = model_file_path 316 | 317 | def file_extension(self): 318 | return "catboost" 319 | 320 | def get_metric_name(self): 321 | metric = self.params.get("eval_metric") 322 | if metric is None: 323 | return None 324 | if metric == "Logloss": 325 | return "logloss" 326 | elif metric == "AUC": 327 | return "auc" 328 | elif metric == "MultiClass": 329 | return "logloss" 330 | elif metric == "RMSE": 331 | return "rmse" 332 | elif metric == "MSE": 333 | return "mse" 334 | elif metric == "MAE": 335 | return "mae" 336 | elif metric == "MAPE": 337 | return "mape" 338 | elif metric in ["F1", "TotalF1:average=Micro"]: 339 | return "f1" 340 | elif metric == "Accuracy": 341 | return "accuracy" 342 | return metric 343 | 344 | 345 | classification_params = { 346 | "learning_rate": [0.025, 0.05, 0.1, 0.2], 347 | "depth": [4, 5, 6, 7, 8, 9], 348 | "rsm": [0.7, 0.8, 0.9, 1], # random subspace method 349 | "loss_function": ["Logloss"], 350 | } 351 | 352 | classification_default_params = { 353 | "learning_rate": 0.1, 354 | "depth": 6, 355 | "rsm": 1, 356 | "loss_function": "Logloss", 357 | } 358 | 359 | additional = { 360 | "max_rounds": 10000, 361 | "early_stopping_rounds": 50, 362 | "max_rows_limit": None, 363 | "max_cols_limit": None, 364 | } 365 | required_preprocessing = [ 366 | "missing_values_inputation", 367 | "datetime_transform", 368 | "text_transform", 369 | "target_as_integer", 370 | ] 371 | 372 | 373 | class CBClassifier(ClassifierMixin, CatBoostAlgorithm): 374 | pass 375 | 376 | 377 | AlgorithmsRegistry.add( 378 | BINARY_CLASSIFICATION, 379 | CBClassifier, 380 | classification_params, 381 | required_preprocessing, 382 | additional, 383 | classification_default_params, 384 | ) 385 | 386 | multiclass_classification_params = copy.deepcopy(classification_params) 387 | multiclass_classification_params["loss_function"] = ["MultiClass"] 388 | multiclass_classification_params["depth"] = [3, 4, 5, 6] 389 | multiclass_classification_params["learning_rate"] = [0.1, 0.15, 0.2] 390 | 391 | 
multiclass_classification_default_params = copy.deepcopy(classification_default_params) 392 | multiclass_classification_default_params["loss_function"] = "MultiClass" 393 | multiclass_classification_default_params["depth"] = 5 394 | multiclass_classification_default_params["learning_rate"] = 0.15 395 | 396 | 397 | AlgorithmsRegistry.add( 398 | MULTICLASS_CLASSIFICATION, 399 | CBClassifier, 400 | multiclass_classification_params, 401 | required_preprocessing, 402 | additional, 403 | multiclass_classification_default_params, 404 | ) 405 | 406 | regression_params = copy.deepcopy(classification_params) 407 | regression_params["loss_function"] = ["RMSE", "MAE", "MAPE"] 408 | 409 | regression_required_preprocessing = [ 410 | "missing_values_inputation", 411 | "datetime_transform", 412 | "text_transform", 413 | "target_scale", 414 | ] 415 | 416 | 417 | regression_default_params = { 418 | "learning_rate": 0.1, 419 | "depth": 6, 420 | "rsm": 1, 421 | "loss_function": "RMSE", 422 | } 423 | 424 | 425 | class CBRegressor(RegressorMixin, CatBoostAlgorithm): 426 | pass 427 | 428 | 429 | AlgorithmsRegistry.add( 430 | REGRESSION, 431 | CBRegressor, 432 | regression_params, 433 | regression_required_preprocessing, 434 | additional, 435 | regression_default_params, 436 | ) 437 | ``` -------------------------------------------------------------------------------- /supervised/fairness/optimization.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | 4 | class FairnessOptimization: 5 | @staticmethod 6 | def binary_classification( 7 | target, 8 | predicted_labels, 9 | sensitive_features, 10 | fairness_metric, 11 | fairness_threshold, 12 | privileged_groups=[], 13 | underprivileged_groups=[], 14 | previous_fairness_optimization=None, 15 | min_selection_rate=None, 16 | max_selection_rate=None, 17 | ): 18 | target = np.array(target).ravel() 19 | preds = np.array(predicted_labels) 20 | 21 | # fairness optimization stats 22 | sensitive_values = {} 23 | for col in sensitive_features.columns: 24 | col_name = col[10:] # skip 'senstive_' 25 | values = list(sensitive_features[col].unique()) 26 | sensitive_values[col] = values 27 | 28 | for v in values: 29 | ii = sensitive_features[col] == v 30 | 31 | new_sensitive_values = {} 32 | for k, prev_values in sensitive_values.items(): 33 | if k == col: 34 | continue 35 | new_sensitive_values[f"{k}@{col}"] = [] 36 | for v in values: 37 | for pv in prev_values: 38 | if isinstance(pv, tuple): 39 | new_sensitive_values[f"{k}@{col}"] += [(*pv, v)] 40 | else: 41 | new_sensitive_values[f"{k}@{col}"] += [(pv, v)] 42 | 43 | sensitive_values = {**sensitive_values, **new_sensitive_values} 44 | 45 | # print(sensitive_values) 46 | 47 | sensitive_indices = {} 48 | for k, values_list in sensitive_values.items(): 49 | if k.count("@") == sensitive_features.shape[1] - 1: 50 | # print(k) 51 | # print("values_list",values_list) 52 | cols = k.split("@") 53 | for values in values_list: 54 | if not isinstance(values, tuple): 55 | values = (values,) 56 | # print("values", values) 57 | 58 | ii = None 59 | for i, c in enumerate(cols): 60 | if ii is None: 61 | ii = sensitive_features[c] == values[i] 62 | else: 63 | ii &= sensitive_features[c] == values[i] 64 | 65 | key = "@".join([str(s) for s in values]) 66 | # print(key, np.sum(ii)) 67 | sensitive_indices[key] = ii 68 | 69 | total_dp_ratio = min_selection_rate / max_selection_rate 70 | # print("total dp ratio", total_dp_ratio) 71 | 72 | c0 = np.sum(target == 0) 73 | c1 = 
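The `_assess_iterations` helper above sizes the boosting budget by timing a short warm-up fit and projecting how many rounds fit into `max_time`. A minimal standalone sketch of that heuristic (the function and its arguments are illustrative, not part of the library):

```python
import time


def estimate_rounds(train_one_round, max_time, warmup_iterations=20):
    """Project how many boosting rounds fit into max_time seconds."""
    start = time.time()
    for _ in range(warmup_iterations):
        train_one_round()  # stand-in for fitting warmup_iterations trees
    per_round = (time.time() - start) / float(warmup_iterations)
    # Same clamping as above: at most 10000 rounds, at least 10.
    return max(int(min(10000, max_time / per_round)), 10)


# If one round takes ~0.05 s and the budget is 60 s, this prints roughly 1200.
print(estimate_rounds(lambda: time.sleep(0.05), max_time=60))
```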

--------------------------------------------------------------------------------
/supervised/fairness/optimization.py:
--------------------------------------------------------------------------------

```python
import numpy as np


class FairnessOptimization:
    @staticmethod
    def binary_classification(
        target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
        min_selection_rate=None,
        max_selection_rate=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predicted_labels)

        # fairness optimization stats
        sensitive_values = {}
        for col in sensitive_features.columns:
            col_name = col[10:]  # skip 'sensitive_'
            values = list(sensitive_features[col].unique())
            sensitive_values[col] = values

            for v in values:
                ii = sensitive_features[col] == v

            new_sensitive_values = {}
            for k, prev_values in sensitive_values.items():
                if k == col:
                    continue
                new_sensitive_values[f"{k}@{col}"] = []
                for v in values:
                    for pv in prev_values:
                        if isinstance(pv, tuple):
                            new_sensitive_values[f"{k}@{col}"] += [(*pv, v)]
                        else:
                            new_sensitive_values[f"{k}@{col}"] += [(pv, v)]

            sensitive_values = {**sensitive_values, **new_sensitive_values}

        # print(sensitive_values)

        sensitive_indices = {}
        for k, values_list in sensitive_values.items():
            if k.count("@") == sensitive_features.shape[1] - 1:
                # print(k)
                # print("values_list", values_list)
                cols = k.split("@")
                for values in values_list:
                    if not isinstance(values, tuple):
                        values = (values,)
                    # print("values", values)

                    ii = None
                    for i, c in enumerate(cols):
                        if ii is None:
                            ii = sensitive_features[c] == values[i]
                        else:
                            ii &= sensitive_features[c] == values[i]

                    key = "@".join([str(s) for s in values])
                    # print(key, np.sum(ii))
                    sensitive_indices[key] = ii

        total_dp_ratio = min_selection_rate / max_selection_rate
        # print("total dp ratio", total_dp_ratio)

        c0 = np.sum(target == 0)
        c1 = np.sum(target == 1)

        selection_rates = {}
        weights = {}

        for key, indices in sensitive_indices.items():
            selection_rates[key] = np.sum((preds == 1) & indices) / np.sum(indices)
            # print(key, np.sum(indices), selection_rates[key])

            t = np.sum(indices)
            t0 = np.sum(indices & (target == 0))
            t1 = np.sum(indices & (target == 1))

            w0 = t / target.shape[0] * c0 / t0
            w1 = t / target.shape[0] * c1 / t1

            # print("----", key, w0, w1, t, t0, t1)
            weights[key] = [w0, w1]

        max_selection_rate = np.max(list(selection_rates.values()))
        min_selection_rate = np.min(list(selection_rates.values()))

        for k, v in selection_rates.items():
            selection_rates[k] = v / max_selection_rate

        # print("previous fairness optimization")
        # print(previous_fairness_optimization)
        # print("********")

        previous_weights = {}
        if previous_fairness_optimization is not None:
            weights = previous_fairness_optimization.get("weights")
            for key, indices in sensitive_indices.items():
                # print("Previous")
                # print(previous_fairness_optimization["selection_rates"][key], selection_rates[key])

                direction = 0.0
                if (
                    previous_fairness_optimization["selection_rates"][key]
                    < selection_rates[key]
                ):
                    # print("Improvement")
                    direction = 1.0
                elif selection_rates[key] > 0.8:
                    # print("GOOD")
                    direction = 0.0
                else:
                    # print("Decrease")
                    direction = -0.5

                # use the previous weights instead of the default 1.0
                prev_weights = previous_fairness_optimization.get(
                    "previous_weights", {}
                ).get(key, [1, 1])
                # print("prev_weights", prev_weights)
                delta0 = weights[key][0] - prev_weights[0]
                delta1 = weights[key][1] - prev_weights[1]

                previous_weights[key] = [weights[key][0], weights[key][1]]

                # print("BEFORE")
                # print(weights[key])
                weights[key][0] += direction * delta0
                weights[key][1] += direction * delta1
                # print("AFTER")
                # print(weights[key])
                # print(previous_fairness_optimization["weights"][key])

        step = None
        if previous_fairness_optimization is not None:
            step = previous_fairness_optimization.get("step")

        if step is None:
            step = 0
        else:
            step += 1

        return {
            "selection_rates": selection_rates,
            "previous_weights": previous_weights,
            "weights": weights,
            "total_dp_ratio": total_dp_ratio,
            "step": step,
            "fairness_threshold": fairness_threshold,
        }

    @staticmethod
    def regression(
        target,
        predictions,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
        performance_metric=None,
        performance_metric_name=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predictions)

        # fairness optimization stats
        sensitive_values = {}
        for col in sensitive_features.columns:
            col_name = col[10:]  # skip 'sensitive_'
            values = list(sensitive_features[col].unique())
            sensitive_values[col] = values

            for v in values:
                ii = sensitive_features[col] == v

            new_sensitive_values = {}
            for k, prev_values in sensitive_values.items():
                if k == col:
                    continue
                new_sensitive_values[f"{k}@{col}"] = []
                for v in values:
                    for pv in prev_values:
                        if isinstance(pv, tuple):
                            new_sensitive_values[f"{k}@{col}"] += [(*pv, v)]
                        else:
                            new_sensitive_values[f"{k}@{col}"] += [(pv, v)]

            sensitive_values = {**sensitive_values, **new_sensitive_values}

        sensitive_indices = {}
        least_frequent_key = None
        least_frequency = sensitive_features.shape[0]
        for k, values_list in sensitive_values.items():
            if k.count("@") == sensitive_features.shape[1] - 1:
                cols = k.split("@")
                for values in values_list:
                    if not isinstance(values, tuple):
                        values = (values,)

                    ii = None
                    for i, c in enumerate(cols):
                        if ii is None:
                            ii = sensitive_features[c] == values[i]
                        else:
                            ii &= sensitive_features[c] == values[i]

                    key = "@".join([str(s) for s in values])
                    if np.sum(ii) > 0:
                        sensitive_indices[key] = ii
                        if np.sum(ii) < least_frequency:
                            least_frequency = np.sum(ii)
                            least_frequent_key = key

        weights = {}
        performance = {}

        for key, indices in sensitive_indices.items():
            w = target.shape[0] / len(sensitive_indices) / np.sum(indices)
            weights[key] = w
            performance[key] = performance_metric(target[indices], predictions[indices])

        # upweight the least frequent group a bit more
        weights[least_frequent_key] *= 1.5

        denominator = np.max(list(performance.values()))
        new_performance = {}
        for k, v in performance.items():
            new_performance[k] = np.round(v / denominator, 4)
        performance = new_performance

        previous_weights = {}
        if previous_fairness_optimization is not None:
            weights = previous_fairness_optimization.get("weights")
            for key, indices in sensitive_indices.items():
                direction = 0.0
                if (
                    previous_fairness_optimization["performance"][key]
                    < performance[key]
                ):
                    direction = 1.0
                elif performance[key] > fairness_threshold:
                    direction = 0.0
                else:
                    direction = -0.5

                # use the previous weights instead of the default 1.0
                prev_weights = previous_fairness_optimization.get(
                    "previous_weights", {}
                ).get(key, 1)
                delta0 = weights[key] - prev_weights
                previous_weights[key] = weights[key]
                weights[key] = max(weights[key] + direction * delta0, 0.01)

        no_weights_change = False
        if str(previous_weights) == str(weights):
            no_weights_change = True

        step = None
        if previous_fairness_optimization is not None:
            step = previous_fairness_optimization.get("step")

        if step is None:
            step = 0
        else:
            if not no_weights_change:
                step += 1

        return {
            "performance": performance,
            "previous_weights": previous_weights,
            "weights": weights,
            "step": step,
            "fairness_threshold": fairness_threshold,
        }
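
    # The weight-update rule shared by the methods in this class: with w_prev
    # the group's weight from the previous step and w its current weight,
    #
    #     w_next = w + direction * (w - w_prev)
    #
    # where direction is +1.0 when the group's metric improved since the last
    # step, 0.0 when the group is already above the acceptance level, and
    # -0.5 to back off otherwise (the regression variant also floors the
    # weight at 0.01).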

    @staticmethod
    def multiclass_classification(
        target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predicted_labels)
        target_values = list(np.unique(target))

        # fairness optimization stats
        sensitive_values = {}
        for col in sensitive_features.columns:
            col_name = col[10:]  # skip 'sensitive_'
            values = list(sensitive_features[col].unique())
            sensitive_values[col] = values
            for v in values:
                ii = sensitive_features[col] == v
            new_sensitive_values = {}
            for k, prev_values in sensitive_values.items():
                if k == col:
                    continue
                new_sensitive_values[f"{k}@{col}"] = []
                for v in values:
                    for pv in prev_values:
                        if isinstance(pv, tuple):
                            new_sensitive_values[f"{k}@{col}"] += [(*pv, v)]
                        else:
                            new_sensitive_values[f"{k}@{col}"] += [(pv, v)]

            sensitive_values = {**sensitive_values, **new_sensitive_values}

        sensitive_indices = {}
        for k, values_list in sensitive_values.items():
            if k.count("@") == sensitive_features.shape[1] - 1:
                cols = k.split("@")
                for values in values_list:
                    if not isinstance(values, tuple):
                        values = (values,)

                    ii = None
                    for i, c in enumerate(cols):
                        if ii is None:
                            ii = sensitive_features[c] == values[i]
                        else:
                            ii &= sensitive_features[c] == values[i]

                    key = "@".join([str(s) for s in values])
                    sensitive_indices[key] = ii

        cs = {}
        for t in target_values:
            cs[t] = np.sum(target == t)
        selection_rates = {}
        weights = {}

        for key, indices in sensitive_indices.items():
            weights[key] = []
            sv = np.sum(indices)
            selection_rates[key] = {}
            for t in target_values:
                selection_rates[key][t] = np.sum((preds == t) & indices) / np.sum(
                    indices
                )

                t_k = np.sum(indices & (target == t))
                w_k = sv / target.shape[0] * cs[t] / t_k
                weights[key] += [w_k]

        for t in target_values:
            values = []
            for k, v in selection_rates.items():
                values += [v[t]]
            max_selection_rate = np.max(values)
            for k, v in selection_rates.items():
                v[t] /= max_selection_rate

        previous_weights = {}
        if previous_fairness_optimization is not None:
            weights = previous_fairness_optimization.get("weights")
            for key, indices in sensitive_indices.items():
                previous_weights[key] = [1] * len(target_values)
                for i, t in enumerate(target_values):
                    direction = 0.0
                    if (
                        previous_fairness_optimization["selection_rates"][key][t]
                        < selection_rates[key][t]
                    ):
                        direction = 1.0
                    elif selection_rates[key][t] > 0.8:
                        direction = 0.0
                    else:
                        direction = -0.5

                    # use the previous weights instead of the default 1.0
                    prev_weights = previous_fairness_optimization.get(
                        "previous_weights", {}
                    ).get(key, [1] * len(target_values))

                    delta_i = weights[key][i] - prev_weights[i]

                    previous_weights[key][i] = weights[key][i]

                    weights[key][i] += direction * delta_i

        step = None
        if previous_fairness_optimization is not None:
            step = previous_fairness_optimization.get("step")

        if step is None:
            step = 0
        else:
            step += 1

        return {
            "selection_rates": selection_rates,
            "previous_weights": previous_weights,
            "weights": weights,
            "step": step,
            "fairness_threshold": fairness_threshold,
            "target_values": target_values,
        }
```
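A quick way to inspect what the first optimization step returns, on toy data; the `sensitive_` column-name prefix matches the convention the code above strips with `col[10:]`, and the selection-rate bounds would normally be pre-computed elsewhere in the pipeline:

```python
import numpy as np
import pandas as pd

from supervised.fairness.optimization import FairnessOptimization

target = np.array([0, 1, 0, 1, 1, 0, 1, 0])
preds = np.array([0, 1, 0, 0, 1, 0, 1, 1])
sensitive = pd.DataFrame({"sensitive_sex": ["F", "M", "F", "F", "M", "M", "M", "F"]})

result = FairnessOptimization.binary_classification(
    target,
    preds,
    sensitive,
    fairness_metric="demographic_parity_ratio",
    fairness_threshold=0.8,
    min_selection_rate=0.4,  # illustrative, pre-computed values
    max_selection_rate=0.6,
)
# Selection rates are normalized by the largest group's rate; on the first
# call there is no previous optimization state, so "step" is 0.
print(result["selection_rates"])  # e.g. {'F': 0.333..., 'M': 1.0}
print(result["step"])  # 0
```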

--------------------------------------------------------------------------------
/supervised/utils/automl_plots.py:
--------------------------------------------------------------------------------

```python
import logging
import os
import traceback  # For exception details

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.preprocessing import MinMaxScaler

logger = logging.getLogger(__name__)
from supervised.utils.config import LOG_LEVEL

logger.setLevel(LOG_LEVEL)
# Add a handler if running standalone for testing
if not logger.hasHandlers():
    handler = logging.StreamHandler()
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)


import warnings

import matplotlib.pyplot as plt

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)


class AutoMLPlots:
    features_heatmap_fname = "features_heatmap.png"
    correlation_heatmap_fname = "correlation_heatmap.png"
    features_heatmap_scaled_fname = "features_heatmap_scaled.png"

    @staticmethod
    def _plot_feature_heatmap(
        data_df, title, plot_path, cmap="Blues", vmin=None, vmax=None, cbar_label="Importance"
    ):
        """Generate and save a feature importance heatmap."""
        try:
            logger.info(f"Generating heatmap: '{title}'")
            # Adjust height dynamically based on the number of features
            plot_height = max(7, len(data_df.index) * 0.35)
            fig, ax = plt.subplots(1, 1, figsize=(10, plot_height))

            image = ax.imshow(
                data_df,
                interpolation="nearest",
                cmap=plt.cm.get_cmap(cmap),
                aspect="auto",
                vmin=vmin,  # Use provided vmin
                vmax=vmax,  # Use provided vmax
            )
            cbar = plt.colorbar(mappable=image)
            cbar.set_label(cbar_label)  # Use provided label

            x_tick_marks = np.arange(len(data_df.columns))
            y_tick_marks = np.arange(len(data_df.index))
            ax.set_xticks(x_tick_marks)
            ax.set_xticklabels(data_df.columns, rotation=90)
            ax.set_yticks(y_tick_marks)
            ax.set_yticklabels(data_df.index)
            ax.set_title(title)

            plt.tight_layout(pad=2.0)
            plt.savefig(plot_path)
            logger.info(f"Saved heatmap to: {plot_path}")
            plt.close(fig)  # Close the specific figure

        except Exception as e:
            logger.error(f"Failed to generate heatmap '{title}': {e}")
            logger.error(traceback.format_exc())
            plt.close("all")  # Close any potentially open plots on error

    @staticmethod
    def add(results_path, models, fout):
        """
        Adds plots to the report file stream, covering both the original and
        the MinMax-scaled feature importance heatmaps plus the model
        correlation heatmap.

        Args:
            results_path (str): Path to results directory.
            models (list): List of model objects.
            fout (file object): Writable file object for the report.
        """
        # Generate both feature importance plots
        AutoMLPlots.models_feature_importance(results_path, models)

        # --- Unscaled Feature Importance Section ---
        features_plot_path = os.path.join(
            results_path, AutoMLPlots.features_heatmap_fname
        )
        if os.path.exists(features_plot_path):
            fout.write("\n\n### Features Importance (Original Scale)\n")
            fout.write(
                f"![features importance across models]({AutoMLPlots.features_heatmap_fname})\n\n"
            )
        else:
            logger.warning(f"Original feature importance plot not found at: {features_plot_path}")

        # --- Scaled Feature Importance Section ---
        features_scaled_plot_path = os.path.join(
            results_path, AutoMLPlots.features_heatmap_scaled_fname
        )
        if os.path.exists(features_scaled_plot_path):
            fout.write("\n\n### Scaled Features Importance (MinMax per Model)\n")
            fout.write(
                f"![scaled features importance across models]({AutoMLPlots.features_heatmap_scaled_fname})\n\n"
            )
        else:
            logger.warning(f"Scaled feature importance plot not found at: {features_scaled_plot_path}")

        # --- Correlation Section ---
        AutoMLPlots.models_correlation(results_path, models)

        correlation_plot_path = os.path.join(
            results_path, AutoMLPlots.correlation_heatmap_fname
        )
        if os.path.exists(correlation_plot_path):
            fout.write("\n\n### Spearman Correlation of Models\n")
            fout.write(
                f"![models spearman correlation]({AutoMLPlots.correlation_heatmap_fname})\n\n"
            )
        else:
            logger.warning(f"Model correlation plot not found at: {correlation_plot_path}")

    @staticmethod
    def models_feature_importance(results_path, models):
        """Generates and saves BOTH original and scaled feature importance heatmaps."""
        logger.info("Starting feature importance generation (original and scaled).")
        try:
            # --- Data aggregation (common part) ---
            model_feature_imp = {}
            for m in models:
                model_name = m.get_name()
                model_path = os.path.join(results_path, model_name)
                logger.debug(f"Processing model '{model_name}' in '{model_path}'")
                if not os.path.isdir(model_path):
                    logger.warning(f"Directory not found for model '{model_name}'. Skipping.")
                    continue
                try:
                    all_files = os.listdir(model_path)
                except OSError as e:
                    logger.error(f"Cannot list directory {model_path}: {e}. Skipping model '{model_name}'.")
                    continue
                imp_data = [f for f in all_files if "_importance.csv" in f and "shap" not in f]
                if not imp_data:
                    logger.warning(f"No suitable importance files found for model '{model_name}'. Skipping.")
                    continue
                df_all = []
                for fname in imp_data:
                    file_path = os.path.join(model_path, fname)
                    try:
                        df = pd.read_csv(file_path, index_col=0)
                        numeric_df = df.select_dtypes(include=np.number)
                        if numeric_df.empty or numeric_df.isnull().all().all():
                            logger.warning(f"File {fname} (model '{model_name}') contains no valid numeric data. Skipping.")
                            continue
                        df_all.append(df)
                    except Exception as read_e:
                        logger.error(f"Error reading/processing file {fname} (model '{model_name}'): {read_e}. Skipping.")
                        continue
                if not df_all:
                    logger.warning(f"No valid importance dataframes read for model '{model_name}'. Skipping.")
                    continue
                try:
                    df_concat = pd.concat(df_all, axis=1, join="outer")
                    numeric_df_concat = df_concat.select_dtypes(include=np.number)
                    if not numeric_df_concat.empty:
                        model_feature_imp[model_name] = numeric_df_concat.mean(axis=1).fillna(0)
                    else:
                        logger.warning(f"No numeric data after concat for model '{model_name}'. Skipping.")
                except Exception as concat_e:
                    logger.error(f"Error aggregating importance for model '{model_name}': {concat_e}")
                    continue

            logger.info(f"Collected feature importance for {len(model_feature_imp)} models.")
            if len(model_feature_imp) < 2:
                logger.warning("Feature importance heatmaps require at least 2 models with data. Skipping plot generation.")
                return

            mfi = pd.concat(model_feature_imp, axis=1, join="outer").fillna(0)
            logger.debug(f"Combined importance DataFrame shape: {mfi.shape}")

            # --- Sorting & top-N selection (common part) ---
            mfi["m"] = mfi.mean(axis=1)
            mfi_sorted = mfi.sort_values(by="m", ascending=False)
            mfi_sorted = mfi_sorted.drop("m", axis=1)

            num_features_original = mfi_sorted.shape[0]
            mfi_plot_data = mfi_sorted  # Default to using all sorted features
            title_suffix = "Feature Importance"
            scaled_title_suffix = "Scaled Feature Importance (MinMax per model)"

            if num_features_original > 25:
                mfi_plot_data = mfi_sorted.head(25)
                title_suffix = f"Top-25 ({num_features_original} total) Feature Importance"
                scaled_title_suffix = f"Top-25 ({num_features_original} total) Scaled Feature Importance (MinMax per model)"
                logger.info(f"Selecting top 25 features out of {num_features_original} for plotting.")
            else:
                logger.info(f"Using all {num_features_original} features for plotting.")

            # --- Plotting unscaled version ---
            unscaled_plot_path = os.path.join(results_path, AutoMLPlots.features_heatmap_fname)
            AutoMLPlots._plot_feature_heatmap(
                data_df=mfi_plot_data,
                title=title_suffix + " (Original Scale)",
                plot_path=unscaled_plot_path,
                cbar_label="Importance",
                # vmin/vmax are auto-detected by default
            )

            # --- Scaling data ---
            logger.debug("Applying Min-Max scaling for the second plot.")
            scaler = MinMaxScaler()
            mfi_scaled_array = scaler.fit_transform(mfi_plot_data)  # Scale the potentially filtered data
            mfi_scaled = pd.DataFrame(mfi_scaled_array, index=mfi_plot_data.index, columns=mfi_plot_data.columns)

            # --- Plotting scaled version ---
            scaled_plot_path = os.path.join(results_path, AutoMLPlots.features_heatmap_scaled_fname)
            AutoMLPlots._plot_feature_heatmap(
                data_df=mfi_scaled,
                title=scaled_title_suffix,
                plot_path=scaled_plot_path,
                vmin=0,  # Explicit range for scaled data
                vmax=1,
                cbar_label="Scaled Importance (MinMax per model)",
            )

            logger.info("Finished generating feature importance plots.")

        except Exception as e:
            logger.error(f"An error occurred during feature importance processing: {e}")
            logger.error(traceback.format_exc())
            plt.close("all")  # Ensure plots are closed on unexpected error

    @staticmethod
    def correlation(oof1, oof2):
        """Calculates the mean Spearman correlation between prediction columns."""
        cols = [c for c in oof1.columns if "prediction" in c]
        # Check if prediction columns exist
        if not cols or not all(c in oof2.columns for c in cols):
            logger.warning("Prediction columns mismatch or not found for correlation calculation.")
            return np.nan  # Return NaN if predictions can't be compared

        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore")
            v = []
            for c in cols:
                try:
                    # Calculate Spearman correlation, ignore the p-value
                    corr_val, _ = sp.stats.spearmanr(oof1[c], oof2[c])
                    # Handle potential NaN result from spearmanr if input variance is zero
                    if not np.isnan(corr_val):
                        v.append(corr_val)
                    else:
                        logger.debug(f"NaN result from spearmanr for column {c}. Skipping.")
                except Exception as corr_e:
                    logger.warning(f"Could not calculate Spearman correlation for column {c}: {corr_e}")

        # Return mean correlation, or NaN if no valid correlations were calculated
        return np.mean(v) if v else np.nan

    @staticmethod
    def models_correlation(results_path, models):
        """Generates and saves a heatmap of model prediction correlations."""
        logger.info("Starting model correlation heatmap generation.")
        try:
            if len(models) < 2:
                logger.warning("Model correlation heatmap requires at least 2 models. Skipping.")
                return

            names = []
            oofs = []
            valid_models_indices = []  # Keep track of models with valid OOF data

            for i, m in enumerate(models):
                try:
                    oof_data = m.get_out_of_folds()
                    # Basic validation of OOF data
                    if oof_data is None or oof_data.empty or not any("prediction" in c for c in oof_data.columns):
                        logger.warning(f"Model '{m.get_name()}' has invalid or missing out-of-folds prediction data. Excluding from correlation.")
                        continue

                    names.append(m.get_name())
                    oofs.append(oof_data)
                    valid_models_indices.append(i)  # Store original index if valid
                    logger.debug(f"Got valid OOF data for model '{m.get_name()}'.")

                except AttributeError:
                    logger.warning(f"Model '{m.get_name()}' seems to be missing 'get_out_of_folds' method or it failed. Excluding from correlation.")
                    continue
                except Exception as oof_e:
                    logger.warning(f"Failed to get OOF data for model '{m.get_name()}': {oof_e}. Excluding from correlation.")
                    continue

            num_valid_models = len(names)
            if num_valid_models < 2:
                logger.warning(f"Fewer than 2 models ({num_valid_models}) have valid OOF data for correlation. Skipping plot generation.")
                return

            logger.info(f"Calculating correlations for {num_valid_models} models.")
            corrs = np.ones((num_valid_models, num_valid_models))
            for i in range(num_valid_models):
                for j in range(i + 1, num_valid_models):
                    correlation_value = AutoMLPlots.correlation(oofs[i], oofs[j])
                    # Fill with NaN if correlation calculation failed
                    corrs[i, j] = corrs[j, i] = correlation_value if not np.isnan(correlation_value) else np.nan

            # Check if all correlations are NaN
            if np.isnan(corrs[np.triu_indices(num_valid_models, k=1)]).all():
                logger.warning("All pairwise model correlations resulted in NaN. Cannot generate heatmap.")
                return

            logger.info("Generating model correlation heatmap.")
            figsize = (15, 15) if num_valid_models > 15 else (10, 10)
            fig, ax = plt.subplots(1, 1, figsize=figsize)

            image = ax.imshow(
                corrs,
                interpolation="nearest",
                cmap=plt.cm.get_cmap("Blues"),
                aspect="auto",
                vmin=np.nanmin(corrs),  # Use nanmin/nanmax to handle potential NaNs
                vmax=np.nanmax(corrs),
            )
            plt.colorbar(mappable=image)

            x_tick_marks = np.arange(num_valid_models)
            y_tick_marks = np.arange(num_valid_models)
            ax.set_xticks(x_tick_marks)
            ax.set_xticklabels(names, rotation=90)
            ax.set_yticks(y_tick_marks)
            ax.set_yticklabels(names)
            ax.set_title("Spearman Correlation of Models' OOF Predictions")

            plt.tight_layout(pad=2.0)

            # --- Saving the plot ---
            os.makedirs(results_path, exist_ok=True)  # Ensure directory exists
            plot_path = os.path.join(
                results_path, AutoMLPlots.correlation_heatmap_fname
            )
            plt.savefig(plot_path)
            logger.info(f"Saved model correlation heatmap to: {plot_path}")
            plt.close("all")  # Close plot to free memory

        except Exception as e:
            # Log the exception with traceback
            logger.error(f"An error occurred during model correlation plotting: {e}")
            logger.error(traceback.format_exc())
            # Ensure plot is closed if error occurred during saving/closing
            plt.close("all")
```