This is page 5 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   └── workflows
│       ├── run-tests.yml
│       ├── test-installation-with-conda.yml
│       └── test-installation-with-pip-on-windows.yml
├── .gitignore
├── CITATION
├── examples
│   ├── notebooks
│   │   ├── basic_run.ipynb
│   │   └── Titanic.ipynb
│   └── scripts
│       ├── binary_classifier_adult_fairness.py
│       ├── binary_classifier_ensemble.py
│       ├── binary_classifier_marketing.py
│       ├── binary_classifier_random.py
│       ├── binary_classifier_Titanic.py
│       ├── binary_classifier.py
│       ├── multi_class_classifier_digits.py
│       ├── multi_class_classifier_MNIST.py
│       ├── multi_class_classifier.py
│       ├── multi_class_drug_fairness.py
│       ├── regression_acs_fairness.py
│       ├── regression_crime_fairness.py
│       ├── regression_housing_fairness.py
│       ├── regression_law_school_fairness.py
│       ├── regression.py
│       └── tabular_mar_2021.py
├── LICENSE
├── MANIFEST.in
├── pytest.ini
├── README.md
├── requirements_dev.txt
├── requirements.txt
├── setup.py
├── supervised
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── algorithm.py
│   │   ├── baseline.py
│   │   ├── catboost.py
│   │   ├── decision_tree.py
│   │   ├── extra_trees.py
│   │   ├── factory.py
│   │   ├── knn.py
│   │   ├── lightgbm.py
│   │   ├── linear.py
│   │   ├── nn.py
│   │   ├── random_forest.py
│   │   ├── registry.py
│   │   ├── sklearn.py
│   │   └── xgboost.py
│   ├── automl.py
│   ├── base_automl.py
│   ├── callbacks
│   │   ├── __init__.py
│   │   ├── callback_list.py
│   │   ├── callback.py
│   │   ├── early_stopping.py
│   │   ├── learner_time_constraint.py
│   │   ├── max_iters_constraint.py
│   │   ├── metric_logger.py
│   │   ├── terminate_on_nan.py
│   │   └── total_time_constraint.py
│   ├── ensemble.py
│   ├── exceptions.py
│   ├── fairness
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── plots.py
│   │   ├── report.py
│   │   └── utils.py
│   ├── model_framework.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── datetime_transformer.py
│   │   ├── encoding_selector.py
│   │   ├── exclude_missing_target.py
│   │   ├── goldenfeatures_transformer.py
│   │   ├── kmeans_transformer.py
│   │   ├── label_binarizer.py
│   │   ├── label_encoder.py
│   │   ├── preprocessing_categorical.py
│   │   ├── preprocessing_missing.py
│   │   ├── preprocessing_utils.py
│   │   ├── preprocessing.py
│   │   ├── scale.py
│   │   └── text_transformer.py
│   ├── tuner
│   │   ├── __init__.py
│   │   ├── data_info.py
│   │   ├── hill_climbing.py
│   │   ├── mljar_tuner.py
│   │   ├── optuna
│   │   │   ├── __init__.py
│   │   │   ├── catboost.py
│   │   │   ├── extra_trees.py
│   │   │   ├── knn.py
│   │   │   ├── lightgbm.py
│   │   │   ├── nn.py
│   │   │   ├── random_forest.py
│   │   │   ├── tuner.py
│   │   │   └── xgboost.py
│   │   ├── preprocessing_tuner.py
│   │   ├── random_parameters.py
│   │   └── time_controller.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── additional_metrics.py
│   │   ├── additional_plots.py
│   │   ├── automl_plots.py
│   │   ├── common.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── data_validation.py
│   │   ├── importance.py
│   │   ├── jsonencoder.py
│   │   ├── leaderboard_plots.py
│   │   ├── learning_curves.py
│   │   ├── metric.py
│   │   ├── shap.py
│   │   ├── subsample.py
│   │   └── utils.py
│   └── validation
│       ├── __init__.py
│       ├── validation_step.py
│       ├── validator_base.py
│       ├── validator_custom.py
│       ├── validator_kfold.py
│       └── validator_split.py
└── tests
    ├── __init__.py
    ├── checks
    │   ├── __init__.py
    │   ├── check_automl_with_regression.py
    │   ├── run_ml_tests.py
    │   └── run_performance_tests.py
    ├── conftest.py
    ├── data
    │   ├── 179.csv
    │   ├── 24.csv
    │   ├── 3.csv
    │   ├── 31.csv
    │   ├── 38.csv
    │   ├── 44.csv
    │   ├── 720.csv
    │   ├── 737.csv
    │   ├── acs_income_1k.csv
    │   ├── adult_missing_values_missing_target_500rows.csv
    │   ├── boston_housing.csv
    │   ├── CrimeData
    │   │   ├── cities.json
    │   │   ├── crimedata.csv
    │   │   └── README.md
    │   ├── Drug
    │   │   ├── Drug_Consumption.csv
    │   │   └── README.md
    │   ├── housing_regression_missing_values_missing_target.csv
    │   ├── iris_classes_missing_values_missing_target.csv
    │   ├── iris_missing_values_missing_target.csv
    │   ├── LawSchool
    │   │   ├── bar_pass_prediction.csv
    │   │   └── README.md
    │   ├── PortugeseBankMarketing
    │   │   └── Data_FinalProject.csv
    │   └── Titanic
    │       ├── test_with_Survived.csv
    │       └── train.csv
    ├── README.md
    ├── tests_algorithms
    │   ├── __init__.py
    │   ├── test_baseline.py
    │   ├── test_catboost.py
    │   ├── test_decision_tree.py
    │   ├── test_extra_trees.py
    │   ├── test_factory.py
    │   ├── test_knn.py
    │   ├── test_lightgbm.py
    │   ├── test_linear.py
    │   ├── test_nn.py
    │   ├── test_random_forest.py
    │   ├── test_registry.py
    │   └── test_xgboost.py
    ├── tests_automl
    │   ├── __init__.py
    │   ├── test_adjust_validation.py
    │   ├── test_automl_init.py
    │   ├── test_automl_report.py
    │   ├── test_automl_sample_weight.py
    │   ├── test_automl_time_constraints.py
    │   ├── test_automl.py
    │   ├── test_data_types.py
    │   ├── test_dir_change.py
    │   ├── test_explain_levels.py
    │   ├── test_golden_features.py
    │   ├── test_handle_imbalance.py
    │   ├── test_integration.py
    │   ├── test_joblib_version.py
    │   ├── test_models_needed_for_predict.py
    │   ├── test_prediction_after_load.py
    │   ├── test_repeated_validation.py
    │   ├── test_restore.py
    │   ├── test_stack_models_constraints.py
    │   ├── test_targets.py
    │   └── test_update_errors_report.py
    ├── tests_callbacks
    │   ├── __init__.py
    │   └── test_total_time_constraint.py
    ├── tests_ensemble
    │   ├── __init__.py
    │   └── test_save_load.py
    ├── tests_fairness
    │   ├── __init__.py
    │   ├── test_binary_classification.py
    │   ├── test_multi_class_classification.py
    │   └── test_regression.py
    ├── tests_preprocessing
    │   ├── __init__.py
    │   ├── disable_eda.py
    │   ├── test_categorical_integers.py
    │   ├── test_datetime_transformer.py
    │   ├── test_encoding_selector.py
    │   ├── test_exclude_missing.py
    │   ├── test_goldenfeatures_transformer.py
    │   ├── test_label_binarizer.py
    │   ├── test_label_encoder.py
    │   ├── test_preprocessing_missing.py
    │   ├── test_preprocessing_utils.py
    │   ├── test_preprocessing.py
    │   ├── test_scale.py
    │   └── test_text_transformer.py
    ├── tests_tuner
    │   ├── __init__.py
    │   ├── test_hill_climbing.py
    │   ├── test_time_controller.py
    │   └── test_tuner.py
    ├── tests_utils
    │   ├── __init__.py
    │   ├── test_compute_additional_metrics.py
    │   ├── test_importance.py
    │   ├── test_learning_curves.py
    │   ├── test_metric.py
    │   ├── test_shap.py
    │   └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py
```

# Files

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_preprocessing.py:
--------------------------------------------------------------------------------

```python
import unittest

import numpy as np
import pandas as pd

from supervised.preprocessing.preprocessing import Preprocessing
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues


class PreprocessingTest(unittest.TestCase):
    def test_constructor_preprocessing_step(self):
        preprocessing_params = {}
        ps = Preprocessing(preprocessing_params)

        self.assertTrue(len(ps._missing_values) == 0)
        self.assertTrue(len(ps._categorical) == 0)
        self.assertTrue(ps._categorical_y is None)

    def test_exclude_missing_targets_all_good(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing()
        X_train, y_train = ps._exclude_missing_targets(X_train, y_train)

        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(4, y_train.shape[0])

    def test_exclude_missing_targets(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, np.nan, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing()
        X_train, y_train = ps._exclude_missing_targets(X_train, y_train)

        self.assertEqual(3, X_train.shape[0])
        self.assertEqual(3, y_train.shape[0])

    def test_run_exclude_missing_targets(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, np.nan, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing()
        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)
        self.assertEqual(3, X_train.shape[0])
        self.assertEqual(3, y_train.shape[0])

    def test_run_all_good(self):
        # training data
        d = {
            "col1": [1, 1, 1, 3],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col3": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            }
        }

        ps = Preprocessing(preprocessing_params)

        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)

        for col in ["col1", "col2", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)

        params_json = ps.to_json()
        self.assertEqual(len(params_json), 1)  # should store params only
        self.assertTrue("params" in params_json)

    def test_run_fill_median_convert_integer(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col3": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            }
        }

        ps = Preprocessing(preprocessing_params)
        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)

        for col in ["col1", "col2", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)
        self.assertEqual(X_train["col1"][2], 1)
        self.assertEqual(X_train["col2"][2], 0)
        self.assertEqual(X_train["col4"][0], 0)
        self.assertEqual(X_train["col4"][1], 0)
        self.assertEqual(X_train["col4"][2], 1)
        self.assertEqual(X_train["col4"][3], 2)

        params_json = ps.to_json()

        self.assertTrue("missing_values" in params_json)
        self.assertTrue("categorical" in params_json)
        self.assertTrue("categorical_y" not in params_json)

        self.assertTrue("fill_params" in params_json["missing_values"][0])
        self.assertEqual(
            "na_fill_median", params_json["missing_values"][0]["fill_method"]
        )
        self.assertTrue("convert_params" in params_json["categorical"][0])
        self.assertEqual(
            "categorical_to_int", params_json["categorical"][0]["convert_method"]
        )

    def test_run_fill_median_convert_integer_validation_dataset(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, 1, np.nan, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col3": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            }
        }

        ps = Preprocessing(preprocessing_params)

        X_train, y_train, _ = ps.fit_and_transform(X_train, y_train)
        X_test, y_test, _ = ps.transform(X_test, y_test)

        for col in ["col1", "col2", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)
            self.assertTrue(col in X_test.columns)

        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(2, X_test.shape[0])
        self.assertEqual(2, y_test.shape[0])

    def test_run_on_y_only(self):
        d = {"y": ["a", "b", "a", "b"]}
        df = pd.DataFrame(data=d)
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "target_preprocessing": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ]
        }

        ps = Preprocessing(preprocessing_params)
        _, y_train, _ = ps.fit_and_transform(None, y_train)

        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(0, y_train[0])
        self.assertEqual(1, y_train[1])

    def test_run_on_y_only_validation(self):
        d = {"y": ["a", "b", "a", "b"]}
        df = pd.DataFrame(data=d)
        y_train = df.loc[:, "y"]

        d_test = {"y": [np.nan, "a", np.nan, "b"]}
        df_test = pd.DataFrame(data=d_test)
        y_test = df_test.loc[:, "y"]

        preprocessing_params = {
            "target_preprocessing": [
                PreprocessingMissingValues.FILL_NA_MEDIAN,
                PreprocessingCategorical.CONVERT_INTEGER,
            ]
        }

        ps = Preprocessing(preprocessing_params)

        _, y_train, _ = ps.fit_and_transform(None, y_train)
        _, y_test, _ = ps.transform(None, y_test)

        self.assertEqual(4, y_train.shape[0])
        self.assertEqual(2, y_test.shape[0])
        self.assertEqual(0, y_train[0])
        self.assertEqual(1, y_train[1])
        self.assertEqual(0, y_test[0])
        self.assertEqual(1, y_test[1])

    def test_to_and_from_json_run_fill_median_convert_integer(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {
            "columns_preprocessing": {
                "col1": [PreprocessingMissingValues.FILL_NA_MEDIAN],
                "col2": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
                "col4": [
                    PreprocessingMissingValues.FILL_NA_MEDIAN,
                    PreprocessingCategorical.CONVERT_INTEGER,
                ],
            },
            "target_preprocessing": [],
        }

        ps = Preprocessing(preprocessing_params)
        _, _, _ = ps.fit_and_transform(X_train, y_train)

        ps2 = Preprocessing()
        ps2.from_json(ps.to_json(), "./")
        del ps

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, np.nan, 1, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        X_test, y_test, _ = ps2.transform(X_test, y_test)

        self.assertEqual(2, y_test.shape[0])
        self.assertEqual(2, np.sum(y_test))
        self.assertEqual(1, X_test["col1"].iloc[0])
        self.assertEqual(0, X_test["col2"].iloc[0])

    def test_empty_column(self):
        # training data
        d = {
            "col1": [np.nan, np.nan, np.nan, np.nan],
            "col2": [5, 6, 7, 0],
            "col3": [1, 1, 1, 3],
            "col4": [2, 2, 4, 3],
            "y": [0, 1, 0, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        preprocessing_params = {"columns_preprocessing": {"col1": ["remove_column"]}}

        ps = Preprocessing(preprocessing_params)
        X_train1, _, _ = ps.fit_and_transform(X_train, y_train)

        self.assertTrue("col1" not in X_train1.columns)
        self.assertEqual(3, len(X_train1.columns))
        X_train2, _, _ = ps.transform(X_train, y_train)
        self.assertTrue("col1" not in X_train2.columns)
        self.assertEqual(3, len(X_train2.columns))
        for col in ["col2", "col3", "col4"]:
            self.assertTrue(col in X_train2.columns)

        params_json = ps.to_json()
        ps2 = Preprocessing()
        ps2.from_json(params_json, "./")

        X_train3, _, _ = ps2.transform(X_train, y_train)
        self.assertTrue("col1" not in X_train3.columns)
        self.assertEqual(3, len(X_train3.columns))
        for col in ["col2", "col3", "col4"]:
            self.assertTrue(col in X_train3.columns)


"""
    def test_run_fill_median_convert_one_hot_validation_dataset(self):
        # training data
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [0, 1, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        d_test = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "z", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [np.nan, 1, np.nan, 1],
        }
        df_test = pd.DataFrame(data=d_test)
        X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
        y_test = df_test.loc[:, "y"]

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
        )
        X_train, y_train, X_test, y_test = ps.run(
            X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
        )

        for col in ["col1", "col2_a", "col3", "col4_a", "col4_b", "col4_c"]:
            self.assertTrue(col in X_train.columns)
            self.assertTrue(col in X_test.columns)

        self.assertEqual(4, X_train.shape[0])
        self.assertEqual(2, X_test.shape[0])
        self.assertEqual(4, np.sum(X_train["col2_a"]))
        self.assertEqual(2, np.sum(X_train["col4_a"]))
        self.assertEqual(1, np.sum(X_train["col4_b"]))
        self.assertEqual(1, np.sum(X_train["col4_c"]))
        self.assertEqual(0, X_test.loc[0, "col2_a"])
        self.assertEqual(1, X_test.loc[1, "col2_a"])

    def test_run_fill_median_convert_one_hot_big_categorical(self):

        a_lot = 250
        cs = []
        for i in range(a_lot):
            cs.append(str(uuid.uuid4().hex.upper()[0:6]))

        d = {
            "col1": cs,
            "col2": ["a", "b"] * int(a_lot / 2),
            "col3": range(a_lot),
            "col4": range(a_lot),
        }

        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        X_train_2 = copy.deepcopy(X_train)

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
        )
        X_train, _, _, _ = ps.run(X_train=X_train)

        for col in ["col1", "col2_b", "col3", "col4"]:
            self.assertTrue(col in X_train.columns)

        self.assertTrue(
            np.max(X_train["col1"]) > 0.90 * a_lot
        )  # there can be duplicates ;)
        self.assertEqual(np.max(X_train["col2_b"]), 1)
        self.assertEqual(np.sum(X_train["col2_b"]), a_lot / 2)

        ps2 = Preprocessing()
        ps2.from_json(ps.to_json())
        del ps
        # apply preprocessing loaded from json
        _, _, X_train_2, _ = ps2.run(X_test=X_train_2)
        for col in ["col1", "col2_b", "col3", "col4"]:
            self.assertTrue(col in X_train_2.columns)

        self.assertTrue(
            np.max(X_train_2["col1"]) > 0.90 * a_lot
        )  # there can be duplicates ;)
        self.assertEqual(np.max(X_train_2["col2_b"]), 1)
        self.assertEqual(np.sum(X_train_2["col2_b"]), a_lot / 2)

    def test_convert_target(self):
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [2, 2, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
            project_task="PROJECT_BIN_CLASS",
        )
        X_train, y_train, _, _ = ps.run(X_train=X_train, y_train=y_train)

        self.assertEqual(2, len(np.unique(y_train)))
        self.assertTrue(0 in np.unique(y_train))
        self.assertTrue(1 in np.unique(y_train))

    def test_dont_convert_target(self):
        d = {
            "col1": [1, 1, np.nan, 3],
            "col2": ["a", "a", np.nan, "a"],
            "col3": [1, 1, 1, 3],
            "col4": ["a", "a", "b", "c"],
            "y": [2, 2, 1, 1],
        }
        df = pd.DataFrame(data=d)
        X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
        y_train = df.loc[:, "y"]

        ps = Preprocessing(
            missing_values_method=PreprocessingMissingValues.FILL_NA_MEDIAN,
            categorical_method=PreprocessingCategorical.CONVERT_ONE_HOT,
            project_task="PROJECT_REGRESSION",
        )
        X_train, y_train, _, _ = ps.run(X_train=X_train, y_train=y_train)

        self.assertEqual(2, len(np.unique(y_train)))
        self.assertTrue(1 in np.unique(y_train))
        self.assertTrue(2 in np.unique(y_train))
"""

if __name__ == "__main__":
    unittest.main()
```
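
The round-trip these tests exercise — fit the preprocessing on training data, serialize it with `to_json`, and re-apply it to new data after `from_json` — in a minimal sketch. The column name `feature` and the `"./"` results path are illustrative only; the API calls mirror `test_to_and_from_json_run_fill_median_convert_integer` above:

```python
import numpy as np
import pandas as pd

from supervised.preprocessing.preprocessing import Preprocessing
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues

params = {
    "columns_preprocessing": {
        "feature": [
            PreprocessingMissingValues.FILL_NA_MEDIAN,
            PreprocessingCategorical.CONVERT_INTEGER,
        ]
    },
    "target_preprocessing": [],
}
X = pd.DataFrame({"feature": ["a", "b", np.nan, "a"]})
y = pd.Series([0, 1, 0, 1], name="y")

# Fit once on training data: fills missing values, encodes categories as integers.
ps = Preprocessing(params)
X_t, y_t, _ = ps.fit_and_transform(X, y)

# Persist the fitted transformers and restore them later for inference.
restored = Preprocessing()
restored.from_json(ps.to_json(), "./")
X_new, y_new, _ = restored.transform(X.copy(), y.copy())
```
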
        self.total_best_sum = None  # total sum of predictions, the oof of ensemble
        self.target = None
        self.target_columns = None
        self.sample_weight = None
        self._ml_task = ml_task
        self._optimize_metric = optimize_metric
        self._is_stacked = is_stacked

        self._additional_metrics = None
        self._threshold = None
        self._name = "Ensemble_Stacked" if is_stacked else "Ensemble"
        self._scores = []
        self.oof_predictions = None
        self._oof_predictions_fname = None
        self._single_prediction_time = None  # prediction time on single sample
        self._max_single_prediction_time = max_single_prediction_time
        self.model_prediction_time = {}

        self._fairness_metric = fairness_metric
        self._fairness_threshold = fairness_threshold
        self._privileged_groups = privileged_groups
        self._underprivileged_groups = underprivileged_groups
        self._is_fair = None
        self.sensitive_features = None

    def get_train_time(self):
        return self.train_time

    def get_final_loss(self):
        return self.best_loss

    def is_valid(self):
        return len(self.selected_models) > 1

    def is_fast_enough(self, max_single_prediction_time):
        # don't need to check
        if max_single_prediction_time is None:
            return True

        # no information about prediction time
        if self._single_prediction_time is None:
            return True

        return self._single_prediction_time < max_single_prediction_time

    def get_type(self):
        prefix = ""  # "Stacked" if self._is_stacked else ""
        return prefix + self.algorithm_short_name

    def get_name(self):
        return self._name

    def involved_model_names(self):
        """Returns the list of all models involved in the current model.

        For a single model, it returns a list with the name of the model.
        For an ensemble model, it returns a list with the name of the ensemble and all internal models
        (used to build the ensemble).
        For a single model trained on stacked data, it returns a list with the name of the model only
        (names of models used in stacking are not included)."""
        if self.selected_models is None or not self.selected_models:
            return [self._name]
        l = []
        for m in self.selected_models:
            l += m["model"].involved_model_names()
        return [self._name] + l

    def get_metric_name(self):
        return self.metric.name

    def get_metric(self):
        return self.metric

    def get_out_of_folds(self):
        """Needed when ensemble is treated as model and we want to compute additional metrics for it"""
        # single prediction (in case of binary classification and regression)
        if self.oof_predictions is not None:
            return self.oof_predictions.copy(deep=True)

        if self._oof_predictions_fname is not None:
            self.oof_predictions = pd.read_csv(self._oof_predictions_fname)
            return self.oof_predictions.copy(deep=True)

        ensemble_oof = pd.DataFrame(
            data=self.total_best_sum, columns=self.total_best_sum.columns
        )
        ensemble_oof["target"] = self.target
        if self.sample_weight is not None:
            ensemble_oof["sample_weight"] = self.sample_weight

        # if self.sensitive_features is not None:
        #     for col in self.sensitive_features.columns:
        #         ensemble_oof[col] = self.sensitive_features[col]

        self.oof_predictions = ensemble_oof
        return ensemble_oof

    def _get_mean(self, oof_selected, best_sum, best_count):
        resp = copy.deepcopy(oof_selected)
        if best_count > 1:
            resp += best_sum
        resp /= float(best_count)
        return resp

    def get_oof_matrix(self, models):
        # remember models, will be needed in predictions
        self.models_map = {m.get_name(): m for m in models}

        if self._max_single_prediction_time is not None:
            self.model_prediction_time = {
                m.get_name(): m._single_prediction_time for m in models
            }

            if not [
                m for m in models if m.is_fast_enough(self._max_single_prediction_time)
            ]:
                raise NotTrainedException(
                    "Can't construct ensemble with prediction time smaller than limit."
                )

        # check if we can construct fair ensemble
        if self._fairness_metric is not None:
            if not [m for m in models if m.is_fair()]:
                raise NotTrainedException("Can't construct fair ensemble.")

        oofs = {}
        sensitive_features = None
        for m in models:
            # do not use model with RandomFeature
            if "RandomFeature" in m.get_name():
                continue

            # ensemble only the same level of stack
            # if m._is_stacked != self._is_stacked:
            #     continue
            oof = m.get_out_of_folds()
            prediction_cols = [c for c in oof.columns if "prediction" in c]
            oofs[m.get_name()] = oof[prediction_cols]  # oof["prediction"]
            if self.target is None:
                self.target_columns = [c for c in oof.columns if "target" in c]
                self.target = oof[
                    self.target_columns
                ]  # it will be needed for computing advanced model statistics

            if self.sample_weight is None and "sample_weight" in oof.columns:
                self.sample_weight = oof["sample_weight"]

            sensitive_cols = [c for c in oof.columns if "sensitive" in c]
            if sensitive_cols and sensitive_features is None:
                sensitive_features = oof[sensitive_cols]

        return oofs, self.target, self.sample_weight, sensitive_features

    def get_additional_metrics(self):
        if self._additional_metrics is None:
            logger.debug("Get additional metrics for Ensemble")
            # 'target' - the target after processing used for model training
            # 'prediction' - out of folds predictions of the model
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [c for c in oof_predictions.columns if "prediction" in c]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            oof_preds = oof_predictions[prediction_cols]
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                cols = oof_preds.columns.tolist()
                # strip the "prediction_" prefix (11 characters) to get class labels
                labels = {i: v[11:] for i, v in enumerate(cols)}

                oof_preds["label"] = np.argmax(
                    np.array(oof_preds[prediction_cols]), axis=1
                )
                oof_preds["label"] = oof_preds["label"].map(labels)

            sample_weight = None
            if "sample_weight" in oof_predictions.columns:
                sample_weight = oof_predictions["sample_weight"]

            self._additional_metrics = AdditionalMetrics.compute(
                oof_predictions[target_cols],
                oof_preds,
                sample_weight,
                self._ml_task,
                self.sensitive_features,
                self._fairness_metric
                if self._ml_task != REGRESSION
                else f"{self._fairness_metric}@{self.get_metric_name()}",
                self._fairness_threshold,
                self._privileged_groups,
                self._underprivileged_groups,
            )
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])

        return self._additional_metrics

    def get_sensitive_features_names(self):
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        return [i for i in list(fm.keys()) if i != "fairness_optimization"]

    def get_fairness_metric(self, col_name):
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        return fm.get(col_name, {}).get("fairness_metric_value")

    def get_fairness_optimization(self):
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        return fm.get("fairness_optimization", {})

    def get_worst_fairness(self):
        # We have fairness metrics per sensitive feature.
        # The worst fairness metric is:
        # - for ratio metrics, the lowest fairness value from all sensitive features,
        # - for difference metrics, the highest fairness value from all sensitive features.
        # It is needed as the bias mitigation stopping criterion.

        metrics = self.get_additional_metrics()

        fm = metrics.get("fairness_metrics", {})
        worst_value = None
        for col_name, values in fm.items():
            if col_name == "fairness_optimization":
                continue
            if "ratio" in self._fairness_metric.lower():
                if worst_value is None:
                    worst_value = values.get("fairness_metric_value", 0)
                else:
                    worst_value = min(
                        worst_value, values.get("fairness_metric_value", 0)
                    )
            else:
                if worst_value is None:
                    worst_value = values.get("fairness_metric_value", 1)
                else:
                    worst_value = max(
                        worst_value, values.get("fairness_metric_value", 1)
                    )

        return worst_value

    def get_best_fairness(self):
        # We have fairness metrics per sensitive feature.
        # The best fairness metric is:
        # - for ratio metrics, the highest fairness value from all sensitive features,
        # - for difference metrics, the lowest fairness value from all sensitive features.
        # It is needed as the bias mitigation stopping criterion.

        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        best_value = None
        for col_name, values in fm.items():
            if col_name == "fairness_optimization":
                continue
            if "ratio" in self._fairness_metric.lower():
                if best_value is None:
                    best_value = values.get("fairness_metric_value", 0)
                else:
                    best_value = max(best_value, values.get("fairness_metric_value", 0))
            else:
                if best_value is None:
                    best_value = values.get("fairness_metric_value", 1)
                else:
                    best_value = min(best_value, values.get("fairness_metric_value", 1))

        return best_value

    def is_fair(self):
        if self._is_fair is not None:
            return self._is_fair
        metrics = self.get_additional_metrics()
        fm = metrics.get("fairness_metrics", {})
        for col, m in fm.items():
            if col == "fairness_optimization":
                continue
            if not m.get("is_fair", True):
                self._is_fair = False
                return False
        self._is_fair = True
        return True

    def fit(self, oofs, y, sample_weight=None, sensitive_features=None):
        logger.debug("Ensemble.fit")
        self.sensitive_features = sensitive_features
        start_time = time.time()
        selected_algs_cnt = 0  # number of selected algorithms
        self.best_algs = []  # selected algorithms indices from each loop

        total_prediction_time = 0
        best_sum = None  # sum of best algorithms
        for j in range(len(oofs)):  # iterate over all solutions
            min_score = self.metric.get_maximum()
            best_model = None
            # try to add some algorithm to the best_sum to minimize metric
            for model_name in oofs.keys():
                if (
                    self._max_single_prediction_time
                    and model_name in self.model_prediction_time
                ):
                    if (
                        total_prediction_time + self.model_prediction_time[model_name]
                        > self._max_single_prediction_time
                    ):
                        continue
                # skip unfair models
                if (
                    self._fairness_metric is not None
                    and not self.models_map[model_name].is_fair()
                ):
                    continue
                y_ens = self._get_mean(oofs[model_name], best_sum, j + 1)
                score = self.metric(y, y_ens, sample_weight)
                if self.metric.improvement(previous=min_score, current=score):
                    min_score = score
                    best_model = model_name

            if best_model is None:
                continue
            # there is improvement, save it
            # save scores for plotting learning curve
            # if we optimize negative, then we need to multiply by -1.0
            # to save correct values in the learning curve
            sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
            self._scores += [sign * min_score]

            if self.metric.improvement(previous=self.best_loss, current=min_score):
                self.best_loss = min_score
                selected_algs_cnt = j

            self.best_algs.append(best_model)  # save the best algorithm
            # update best_sum value
            best_sum = (
                oofs[best_model] if best_sum is None else best_sum + oofs[best_model]
            )
            if j == selected_algs_cnt:
                self.total_best_sum = copy.deepcopy(best_sum)

            # update prediction time estimate
            if self._max_single_prediction_time is not None:
                total_prediction_time = np.sum(
                    [
                        self.model_prediction_time[name]
                        for name in np.unique(self.best_algs)
                    ]
                )
        # end of main loop #

        if not self.best_algs:
            raise NotTrainedException("Ensemble wasn't fitted.")

        # keep oof predictions of ensemble
        self.total_best_sum /= float(selected_algs_cnt + 1)
        self.best_algs = self.best_algs[: (selected_algs_cnt + 1)]

        logger.debug("Selected models for ensemble:")
        for model_name in np.unique(self.best_algs):
            self.selected_models += [
                {
                    "model": self.models_map[model_name],
                    "repeat": float(self.best_algs.count(model_name)),
                }
            ]
            logger.debug(f"{model_name} {self.best_algs.count(model_name)}")

        self._additional_metrics = self.get_additional_metrics()

        self.train_time = time.time() - start_time

    def predict(self, X, X_stacked=None):
        logger.debug(
            "Ensemble.predict with {} models".format(len(self.selected_models))
        )
        y_predicted_ensemble = None
        total_repeat = 0.0

        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            total_repeat += repeat

            if model._is_stacked:
                y_predicted_from_model = model.predict(X_stacked)
            else:
                y_predicted_from_model = model.predict(X)

            prediction_cols = []
            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                prediction_cols = [
                    c for c in y_predicted_from_model.columns if "prediction_" in c
                ]
            else:  # REGRESSION
                prediction_cols = ["prediction"]
            y_predicted_from_model = y_predicted_from_model[prediction_cols]
            y_predicted_ensemble = (
                y_predicted_from_model * repeat
                if y_predicted_ensemble is None
                else y_predicted_ensemble + y_predicted_from_model * repeat
            )

        y_predicted_ensemble /= total_repeat

        if self._ml_task == MULTICLASS_CLASSIFICATION:
            cols = y_predicted_ensemble.columns.tolist()
            # strip the "prediction_" prefix (11 characters) to get class labels
            labels = {i: v[11:] for i, v in enumerate(cols)}

            y_predicted_ensemble["label"] = np.argmax(
                np.array(y_predicted_ensemble[prediction_cols]), axis=1
            )
            y_predicted_ensemble["label"] = y_predicted_ensemble["label"].map(labels)

        return y_predicted_ensemble

    def to_json(self):
        models_json = []
        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            models_json += [{"model": model.to_json(), "repeat": repeat}]

        json_desc = {
            "library_version": self.library_version,
            "algorithm_name": self.algorithm_name,
            "algorithm_short_name": self.algorithm_short_name,
            "uid": self.uid,
            "models": models_json,
        }
        return json_desc

    def from_json(self, json_desc):
        self.library_version = json_desc.get("library_version", self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
        self.algorithm_short_name = json_desc.get(
            "algorithm_short_name", self.algorithm_short_name
        )
        self.uid = json_desc.get("uid", self.uid)
        self.selected_models = []
        models_json = json_desc.get("models")
        for selected in models_json:
            model = selected["model"]
            repeat = selected["repeat"]

            il = ModelFramework(model.get("params"))
            il.from_json(model)
            self.selected_models += [
                # {"model": LearnerFactory.load(model), "repeat": repeat}
                {"model": il, "repeat": repeat}
            ]

    def save(self, results_path, model_subpath):
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Save the ensemble to {model_path}")

        predictions = self.get_out_of_folds()
        predictions_fname = os.path.join(model_subpath, "predictions_ensemble.csv")
        self._oof_predictions_fname = os.path.join(results_path, predictions_fname)
        predictions.to_csv(self._oof_predictions_fname, index=False)

        with open(os.path.join(model_path, "ensemble.json"), "w") as fout:
            ms = []
            for selected in self.selected_models:
                ms += [{"model": selected["model"]._name, "repeat": selected["repeat"]}]

            desc = {
                "name": self._name,
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "selected_models": ms,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }

            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4, cls=MLJSONEncoder))

        LearningCurves.plot_for_ensemble(self._scores, self.metric.name, model_path)

        # call additional metrics just to be sure they are computed
        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(
            self._additional_metrics, self._ml_task, self.model_markdown(), model_path
        )

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")

    def model_markdown(self):
        select_models_desc = []
        for selected in self.selected_models:
            select_models_desc += [
                {"model": selected["model"]._name, "repeat": selected["repeat"]}
            ]
        desc = f"# Summary of {self.get_name()}\n\n"
        desc += "[<< Go back](../README.md)\n\n"
        desc += "\n## Ensemble structure\n"
        selected = pd.DataFrame(select_models_desc)
        desc += tabulate(selected.values, ["Model", "Weight"], tablefmt="pipe")
        desc += "\n"
        return desc

    @staticmethod
    def load(results_path, model_subpath, models_map):
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Loading ensemble from {model_path}")

        with open(os.path.join(model_path, "ensemble.json")) as file:
            json_desc = json.load(file)

        ensemble = Ensemble(json_desc.get("optimize_metric"), json_desc.get("ml_task"))
        ensemble._name = json_desc.get("name", ensemble._name)
        ensemble._threshold = json_desc.get("threshold", ensemble._threshold)
        for m in json_desc.get("selected_models", []):
            ensemble.selected_models += [
                {"model": models_map[m["model"]], "repeat": m["repeat"]}
            ]

        ensemble.best_loss = json_desc.get("final_loss", ensemble.best_loss)
        ensemble.train_time = json_desc.get("train_time", ensemble.train_time)
        ensemble._is_stacked = json_desc.get("is_stacked", ensemble._is_stacked)
        predictions_fname = json_desc.get("predictions_fname")
        if predictions_fname is not None:
            ensemble._oof_predictions_fname = os.path.join(
                results_path, predictions_fname
            )

        return ensemble
```
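
For reference, `Ensemble.fit` above implements greedy forward selection with replacement over the models' out-of-fold predictions (Caruana-style ensemble selection). Below is a minimal standalone sketch of that loop, assuming a lower-is-better metric and plain prediction averaging; the prediction-time and fairness filters are omitted, and the function name is illustrative:

```python
import numpy as np

def greedy_ensemble_selection(oofs, y, loss, rounds=None):
    """Each round, add the model whose out-of-fold predictions most reduce
    the loss of the running average; finally keep the best-scoring prefix."""
    best_prefix, best_loss = [], np.inf
    chosen, running_sum = [], None
    for j in range(rounds or len(oofs)):
        round_name, round_loss = None, np.inf
        for name, preds in oofs.items():  # models may be picked repeatedly
            candidate = preds if running_sum is None else running_sum + preds
            score = loss(y, candidate / (j + 1))  # average of j + 1 members
            if score < round_loss:
                round_name, round_loss = name, score
        chosen.append(round_name)
        running_sum = (
            oofs[round_name] if running_sum is None else running_sum + oofs[round_name]
        )
        if round_loss < best_loss:  # remember the best prefix so far
            best_loss, best_prefix = round_loss, list(chosen)
    return best_prefix, best_loss

# Toy example: three models' out-of-fold predictions on a small target.
oofs = {
    "m1": np.array([0.9, 0.1, 0.8, 0.3]),
    "m2": np.array([0.6, 0.4, 0.7, 0.2]),
    "m3": np.array([0.2, 0.9, 0.1, 0.8]),
}
y = np.array([1.0, 0.0, 1.0, 0.0])
rmse = lambda t, p: float(np.sqrt(np.mean((t - p) ** 2)))
print(greedy_ensemble_selection(oofs, y, rmse))
```

The "repeat" weights stored in `selected_models` are simply the counts of how often each model was picked in the best prefix, which is why `predict` computes a repeat-weighted average.
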
--------------------------------------------------------------------------------
/supervised/fairness/metrics.py:
--------------------------------------------------------------------------------

```python
import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

from supervised.fairness.optimization import FairnessOptimization
from supervised.fairness.plots import FairnessPlots
from supervised.fairness.utils import (
    accuracy,
    false_negative_rate,
    false_positive_rate,
    selection_rate,
    true_negative_rate,
    true_positive_rate,
)
from supervised.utils.metric import pearson, spearman


class FairnessMetrics:
    @staticmethod
    def binary_classification(
        target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
    ):
        target = np.array(target).ravel()
        preds = np.array(predicted_labels)

        fairness_metrics = {}

        for col in sensitive_features.columns:
            col_name = col[10:]  # skip the "sensitive_" prefix

            accuracies = []
            selection_rates = []
            tprs = []
            fprs = []
            tnrs = []
            fnrs = []
            samples = []
            demographic_parity_diff = None
            demographic_parity_ratio = None
            equalized_odds_diff = None
            equalized_odds_ratio = None

            # overall
            accuracies += [accuracy(target, preds)]
            selection_rates += [selection_rate(preds)]
            tprs += [true_positive_rate(target, preds)]
            fprs += [false_positive_rate(target, preds)]
            tnrs += [true_negative_rate(target, preds)]
            fnrs += [false_negative_rate(target, preds)]
            samples += [target.shape[0]]

            values = sensitive_features[col].unique()

            for value in values:
                accuracies += [
                    accuracy(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                selection_rates += [
                    selection_rate(preds[sensitive_features[col] == value])
                ]
                tprs += [
                    true_positive_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                fprs += [
                    false_positive_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                tnrs += [
                    true_negative_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                fnrs += [
                    false_negative_rate(
                        target[sensitive_features[col] == value],
                        preds[sensitive_features[col] == value],
                    )
                ]
                samples += [np.sum([sensitive_features[col] == value])]

            metrics = pd.DataFrame(
                {
                    "Samples": samples,
                    "Accuracy": accuracies,
                    "Selection Rate": selection_rates,
                    "True Positive Rate": tprs,
                    "False Negative Rate": fnrs,
                    "False Positive Rate": fprs,
                    "True Negative Rate": tnrs,
                },
                index=["Overall"] + list(values),
            )

            max_selection_rate = np.max(selection_rates[1:])
            min_selection_rate = np.min(selection_rates[1:])

            privileged_value, underprivileged_value = None, None
            for pg in privileged_groups:
                if col_name in pg:
                    privileged_value = pg.get(col_name)
            for upg in underprivileged_groups:
                if col_name in upg:
                    underprivileged_value = upg.get(col_name)

            if privileged_value is not None:
                for i, v in enumerate(values):
                    if v == privileged_value:
                        # starting from 1 because first selection rate is for all samples
                        max_selection_rate = selection_rates[i + 1]

            if underprivileged_value is not None:
                for i, v in enumerate(values):
                    if v == underprivileged_value:
                        # starting from 1 because first selection rate is for all samples
                        min_selection_rate = selection_rates[i + 1]

            # demographic parity difference = max selection rate - min selection rate
            # demographic parity ratio = min selection rate / max selection rate
            demographic_parity_diff = np.round(
                max_selection_rate - min_selection_rate, 4
            )
            demographic_parity_ratio = np.round(
                min_selection_rate / max_selection_rate, 4
            )

            tpr_min = np.min(tprs[1:])
            tpr_max = np.max(tprs[1:])

            fpr_min = np.min(fprs[1:])
            fpr_max = np.max(fprs[1:])

            if privileged_value is not None:
                for i, v in enumerate(values):
                    if v == privileged_value:
                        # starting from 1 because first value is for all samples
                        tpr_max = tprs[i + 1]
                        fpr_max = fprs[i + 1]

            if underprivileged_value is not None:
                for i, v in enumerate(values):
                    if v == underprivileged_value:
                        # starting from 1 because first value is for all samples
                        tpr_min = tprs[i + 1]
                        fpr_min = fprs[i + 1]

            # equalized odds difference = max(TPR spread, FPR spread)
            # equalized odds ratio = min(TPR ratio, FPR ratio)
            equalized_odds_diff = np.round(max(tpr_max - tpr_min, fpr_max - fpr_min), 4)
            equalized_odds_ratio = np.round(
                min(tpr_min / tpr_max, fpr_min / fpr_max), 4
            )

            stats = pd.DataFrame(
                {
                    "": [
                        demographic_parity_diff,
                        demographic_parity_ratio,
                        equalized_odds_diff,
                        equalized_odds_ratio,
                    ]
                },
                index=[
                    "Demographic Parity Difference",
                    "Demographic Parity Ratio",
                    "Equalized Odds Difference",
                    "Equalized Odds Ratio",
                ],
            )

            fairness_metric_name = ""
            fairness_metric_value = 0
            is_fair = False
            if fairness_metric == "demographic_parity_difference":
                fairness_metric_name = "Demographic Parity Difference"
                fairness_metric_value = demographic_parity_diff
                is_fair = demographic_parity_diff < fairness_threshold
            elif fairness_metric == "demographic_parity_ratio":
                fairness_metric_name = "Demographic Parity Ratio"
                fairness_metric_value = demographic_parity_ratio
                is_fair = demographic_parity_ratio > fairness_threshold
"equalized_odds_difference": 199 | fairness_metric_name = "Equalized Odds Difference" 200 | fairness_metric_value = equalized_odds_diff 201 | is_fair = equalized_odds_diff < fairness_threshold 202 | elif fairness_metric == "equalized_odds_ratio": 203 | fairness_metric_name = "Equalized Odds Ratio" 204 | fairness_metric_value = equalized_odds_ratio 205 | is_fair = equalized_odds_ratio > fairness_threshold 206 | 207 | if "parity" in fairness_metric: 208 | if privileged_value is None: 209 | ind = np.argmax(selection_rates[1:]) 210 | privileged_value = values[ind] 211 | if underprivileged_value is None: 212 | ind = np.argmin(selection_rates[1:]) 213 | underprivileged_value = values[ind] 214 | 215 | if "odds" in fairness_metric: 216 | if tpr_max - tpr_min > fpr_max - fpr_min: 217 | if privileged_value is None: 218 | ind = np.argmax(tprs[1:]) 219 | privileged_value = values[ind] 220 | if underprivileged_value is None: 221 | ind = np.argmin(tprs[1:]) 222 | underprivileged_value = values[ind] 223 | else: 224 | if privileged_value is None: 225 | ind = np.argmax(fprs[1:]) 226 | privileged_value = values[ind] 227 | if underprivileged_value is None: 228 | ind = np.argmin(fprs[1:]) 229 | underprivileged_value = values[ind] 230 | 231 | fairness_metrics[col_name] = { 232 | "metrics": metrics, 233 | "stats": stats, 234 | "figures": FairnessPlots.binary_classification( 235 | fairness_metric, 236 | col_name, 237 | metrics, 238 | selection_rates, 239 | max_selection_rate, 240 | fairness_threshold, 241 | ), 242 | "fairness_metric_name": fairness_metric_name, 243 | "fairness_metric_value": fairness_metric_value, 244 | "is_fair": is_fair, 245 | "privileged_value": privileged_value, 246 | "underprivileged_value": underprivileged_value, 247 | } 248 | 249 | # fairness optimization stats 250 | fairness_metrics[ 251 | "fairness_optimization" 252 | ] = FairnessOptimization.binary_classification( 253 | target, 254 | predicted_labels, 255 | sensitive_features, 256 | fairness_metric, 257 | fairness_threshold, 258 | privileged_groups, 259 | underprivileged_groups, 260 | previous_fairness_optimization, 261 | min_selection_rate, 262 | max_selection_rate, 263 | ) 264 | 265 | return fairness_metrics 266 | 267 | @staticmethod 268 | def regression( 269 | target, 270 | predictions, 271 | sensitive_features, 272 | fairness_metric, 273 | fairness_threshold, 274 | privileged_groups=[], 275 | underprivileged_groups=[], 276 | previous_fairness_optimization=None, 277 | ): 278 | metric_name = fairness_metric.split("@")[1].upper() 279 | 280 | if "ratio" in fairness_metric.lower(): 281 | fairness_metric_name = f"Group Loss Ratio @ {metric_name}" 282 | else: 283 | fairness_metric_name = f"Group Loss Difference @ {metric_name}" 284 | 285 | fairness_metrics = {} 286 | 287 | regression_metrics = { 288 | "SAMPLES": lambda t, p, sw=None: t.shape[0], 289 | "MAE": mean_absolute_error, 290 | "MSE": mean_squared_error, 291 | "RMSE": lambda t, p, sample_weight=None: np.sqrt( 292 | mean_squared_error(t, p, sample_weight=sample_weight) 293 | ), 294 | "R2": r2_score, 295 | "MAPE": mean_absolute_percentage_error, 296 | "SPEARMAN": spearman, 297 | "PEARSON": pearson, 298 | } 299 | overall = {} 300 | for k, v in regression_metrics.items(): 301 | overall[k] = v(target, predictions) 302 | 303 | for col in sensitive_features.columns: 304 | col_name = col[10:] # skip 'senstive_' 305 | 306 | values = sensitive_features[col].unique() 307 | all_metrics = [overall] 308 | 309 | for value in values: 310 | metrics = {} 311 | for k, v in regression_metrics.items(): 
                    metrics[k] = v(
                        target[sensitive_features[col] == value],
                        predictions[sensitive_features[col] == value],
                    )
                all_metrics += [metrics]

            mdf = pd.DataFrame(all_metrics, index=["Overall"] + list(values))

            privileged_value, underprivileged_value = None, None
            for pg in privileged_groups:
                if col_name in pg:
                    privileged_value = pg.get(col_name)
            for upg in underprivileged_groups:
                if col_name in upg:
                    underprivileged_value = upg.get(col_name)

            if privileged_value is None:
                if metric_name in ["R2", "SPEARMAN", "PEARSON"]:
                    # the higher the better
                    privileged_value = mdf.index[
                        mdf[metric_name][1:].argmax() + 1
                    ]  # without overall metrics
                else:
                    # the lower the better
                    privileged_value = mdf.index[
                        mdf[metric_name][1:].argmin() + 1
                    ]  # without overall metrics

            if underprivileged_value is None:
                if metric_name in ["R2", "SPEARMAN", "PEARSON"]:
                    # the higher the better
                    underprivileged_value = mdf.index[
                        mdf[metric_name][1:].argmin() + 1
                    ]  # without overall metrics
                else:
                    # the lower the better
                    underprivileged_value = mdf.index[
                        mdf[metric_name][1:].argmax() + 1
                    ]  # without overall metrics

            metric_min = mdf[metric_name].loc[privileged_value]
            metric_max = mdf[metric_name].loc[underprivileged_value]

            ratio = np.round(metric_min / metric_max, 4)
            diff = np.round(metric_max - metric_min, 4)

            # ratio = np.round(mdf[metric_name][1:].min() / mdf[metric_name][1:].max(), 4)
            # diff = np.round(mdf[metric_name][1:].max() - mdf[metric_name][1:].min(), 4)

            is_fair = False
            if "ratio" in fairness_metric.lower():
                fairness_metric_value = ratio
                if ratio > fairness_threshold:
                    is_fair = True
            else:
                fairness_metric_value = diff
                if diff < fairness_threshold:
                    is_fair = True

            fairness_metrics[col_name] = {
                "metrics": mdf,
                "figures": FairnessPlots.regression(
                    fairness_metric, col_name, mdf, fairness_metric_name
                ),
                "privileged_value": privileged_value,
                "underprivileged_value": underprivileged_value,
                "ratio": ratio,
                "diff": diff,
                "metric_name": metric_name,
                "fairness_metric_name": fairness_metric_name,
                "fairness_metric_value": fairness_metric_value,
                "is_fair": is_fair,
                "fairness_threshold": fairness_threshold,
            }

        fairness_metrics["fairness_optimization"] = FairnessOptimization.regression(
            target,
            predictions,
            sensitive_features,
            fairness_metric,
            fairness_threshold,
            privileged_groups,
            underprivileged_groups,
            previous_fairness_optimization,
            performance_metric=regression_metrics[metric_name],
            performance_metric_name=metric_name,
        )

        return fairness_metrics

    @staticmethod
    def multiclass_classification(
        original_target,
        predicted_labels,
        sensitive_features,
        fairness_metric,
        fairness_threshold,
        privileged_groups=[],
        underprivileged_groups=[],
        previous_fairness_optimization=None,
    ):
        original_target = np.array(original_target).ravel()
        predicted_labels = np.array(predicted_labels)
        target_values = list(np.unique(original_target))

        fairness_metrics = {}

        for col in sensitive_features.columns:
            col_name = col[10:]  # skip the "sensitive_" prefix
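            # One-vs-rest reduction: for each class value, binarize the target and
            # the predictions (current class -> 1, all other classes -> 0) and compute
            # the same group metrics as in the binary-classification case above.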
421 | 422 | for target_value in target_values: 423 | # we need to reset them for each target value 424 | privileged_value, underprivileged_value = None, None 425 | for pg in privileged_groups: 426 | if col_name in pg: 427 | privileged_value = pg.get(col_name) 428 | for upg in underprivileged_groups: 429 | if col_name in upg: 430 | underprivileged_value = upg.get(col_name) 431 | 432 | target = np.copy(original_target) 433 | target[original_target == target_value] = 1 434 | target[original_target != target_value] = 0 435 | 436 | preds = np.copy(predicted_labels) 437 | preds[predicted_labels == target_value] = 1 438 | preds[predicted_labels != target_value] = 0 439 | 440 | accuracies = [] 441 | selection_rates = [] 442 | tprs = [] 443 | fprs = [] 444 | tnrs = [] 445 | fnrs = [] 446 | samples = [] 447 | demographic_parity_diff = None 448 | demographic_parity_ratio = None 449 | equalized_odds_diff = None 450 | equalized_odds_ratio = None 451 | 452 | # overall 453 | accuracies += [accuracy(target, preds)] 454 | selection_rates += [selection_rate(preds)] 455 | tprs += [true_positive_rate(target, preds)] 456 | fprs += [false_positive_rate(target, preds)] 457 | tnrs += [true_negative_rate(target, preds)] 458 | fnrs += [false_negative_rate(target, preds)] 459 | samples += [target.shape[0]] 460 | 461 | values = sensitive_features[col].unique() 462 | 463 | for value in values: 464 | accuracies += [ 465 | accuracy( 466 | target[sensitive_features[col] == value], 467 | preds[sensitive_features[col] == value], 468 | ) 469 | ] 470 | selection_rates += [ 471 | selection_rate(preds[sensitive_features[col] == value]) 472 | ] 473 | tprs += [ 474 | true_positive_rate( 475 | target[sensitive_features[col] == value], 476 | preds[sensitive_features[col] == value], 477 | ) 478 | ] 479 | fprs += [ 480 | false_positive_rate( 481 | target[sensitive_features[col] == value], 482 | preds[sensitive_features[col] == value], 483 | ) 484 | ] 485 | tnrs += [ 486 | true_negative_rate( 487 | target[sensitive_features[col] == value], 488 | preds[sensitive_features[col] == value], 489 | ) 490 | ] 491 | fnrs += [ 492 | false_negative_rate( 493 | target[sensitive_features[col] == value], 494 | preds[sensitive_features[col] == value], 495 | ) 496 | ] 497 | samples += [np.sum([sensitive_features[col] == value])] 498 | 499 | metrics = pd.DataFrame( 500 | { 501 | "Samples": samples, 502 | "Accuracy": accuracies, 503 | "Selection Rate": selection_rates, 504 | "True Positive Rate": tprs, 505 | "False Negative Rate": fnrs, 506 | "False Positive Rate": fprs, 507 | "True Negative Rate": tnrs, 508 | }, 509 | index=["Overall"] + list(values), 510 | ) 511 | 512 | max_selection_rate = np.max(selection_rates[1:]) 513 | min_selection_rate = np.min(selection_rates[1:]) 514 | 515 | if privileged_value is not None: 516 | for i, v in enumerate(values): 517 | if v == privileged_value: 518 | # starting from 1 because first selection rate is for all samples 519 | max_selection_rate = selection_rates[i + 1] 520 | 521 | if underprivileged_value is not None: 522 | for i, v in enumerate(values): 523 | if v == underprivileged_value: 524 | # starting from 1 because first selection rate is for all samples 525 | min_selection_rate = selection_rates[i + 1] 526 | 527 | demographic_parity_diff = np.round( 528 | max_selection_rate - min_selection_rate, 4 529 | ) 530 | demographic_parity_ratio = np.round( 531 | min_selection_rate / max_selection_rate, 4 532 | ) 533 | 534 | tpr_min = np.min(tprs[1:]) 535 | tpr_max = np.max(tprs[1:]) 536 | 537 | fpr_min = 
np.min(fprs[1:]) 538 | fpr_max = np.max(fprs[1:]) 539 | 540 | if privileged_value is not None: 541 | for i, v in enumerate(values): 542 | if v == privileged_value: 543 | # starting from 1 because first value is for all samples 544 | tpr_max = tprs[i + 1] 545 | fpr_max = fprs[i + 1] 546 | 547 | if underprivileged_value is not None: 548 | for i, v in enumerate(values): 549 | if v == underprivileged_value: 550 | # starting from 1 because first value is for all samples 551 | tpr_min = tprs[i + 1] 552 | fpr_min = fprs[i + 1] 553 | 554 | equalized_odds_diff = np.round( 555 | max(tpr_max - tpr_min, fpr_max - fpr_min), 4 556 | ) 557 | equalized_odds_ratio = np.round( 558 | min(tpr_min / tpr_max, fpr_min / fpr_max), 4 559 | ) 560 | 561 | stats = pd.DataFrame( 562 | { 563 | "": [ 564 | demographic_parity_diff, 565 | demographic_parity_ratio, 566 | equalized_odds_diff, 567 | equalized_odds_ratio, 568 | ] 569 | }, 570 | index=[ 571 | "Demographic Parity Difference", 572 | "Demographic Parity Ratio", 573 | "Equalized Odds Difference", 574 | "Equalized Odds Ratio", 575 | ], 576 | ) 577 | 578 | fairness_metric_name = "" 579 | fairness_metric_value = 0 580 | is_fair = False 581 | if fairness_metric == "demographic_parity_difference": 582 | fairness_metric_name = "Demographic Parity Difference" 583 | fairness_metric_value = demographic_parity_diff 584 | is_fair = demographic_parity_diff < fairness_threshold 585 | elif fairness_metric == "demographic_parity_ratio": 586 | fairness_metric_name = "Demographic Parity Ratio" 587 | fairness_metric_value = demographic_parity_ratio 588 | is_fair = demographic_parity_ratio > fairness_threshold 589 | elif fairness_metric == "equalized_odds_difference": 590 | fairness_metric_name = "Equalized Odds Difference" 591 | fairness_metric_value = equalized_odds_diff 592 | is_fair = equalized_odds_diff < fairness_threshold 593 | elif fairness_metric == "equalized_odds_ratio": 594 | fairness_metric_name = "Equalized Odds Ratio" 595 | fairness_metric_value = equalized_odds_ratio 596 | is_fair = equalized_odds_ratio > fairness_threshold 597 | 598 | if "parity" in fairness_metric: 599 | if privileged_value is None: 600 | ind = np.argmax(selection_rates[1:]) 601 | privileged_value = values[ind] 602 | if underprivileged_value is None: 603 | ind = np.argmin(selection_rates[1:]) 604 | underprivileged_value = values[ind] 605 | 606 | if "odds" in fairness_metric: 607 | if tpr_max - tpr_min > fpr_max - fpr_min: 608 | if privileged_value is None: 609 | ind = np.argmax(tprs[1:]) 610 | privileged_value = values[ind] 611 | if underprivileged_value is None: 612 | ind = np.argmin(tprs[1:]) 613 | underprivileged_value = values[ind] 614 | else: 615 | if privileged_value is None: 616 | ind = np.argmax(fprs[1:]) 617 | privileged_value = values[ind] 618 | if underprivileged_value is None: 619 | ind = np.argmin(fprs[1:]) 620 | underprivileged_value = values[ind] 621 | 622 | fairness_metrics[f"{col_name}__{target_value}"] = { 623 | "metrics": metrics, 624 | "stats": stats, 625 | "figures": FairnessPlots.binary_classification( 626 | fairness_metric, 627 | f"{col_name}__{target_value}", 628 | metrics, 629 | selection_rates, 630 | max_selection_rate, 631 | fairness_threshold, 632 | ), 633 | "fairness_metric_name": fairness_metric_name, 634 | "fairness_metric_value": fairness_metric_value, 635 | "is_fair": is_fair, 636 | "privileged_value": privileged_value, 637 | "underprivileged_value": underprivileged_value, 638 | } 639 | 640 | # fairness optimization stats 641 | fairness_metrics[ 642 | 
"fairness_optimization" 643 | ] = FairnessOptimization.multiclass_classification( 644 | original_target, 645 | predicted_labels, 646 | sensitive_features, 647 | fairness_metric, 648 | fairness_threshold, 649 | privileged_groups, 650 | underprivileged_groups, 651 | previous_fairness_optimization, 652 | ) 653 | 654 | return fairness_metrics 655 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from supervised.algorithms.registry import ( 7 | BINARY_CLASSIFICATION, 8 | MULTICLASS_CLASSIFICATION, 9 | ) 10 | from supervised.exceptions import AutoMLException 11 | from supervised.preprocessing.datetime_transformer import DateTimeTransformer 12 | from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget 13 | from supervised.preprocessing.goldenfeatures_transformer import ( 14 | GoldenFeaturesTransformer, 15 | ) 16 | from supervised.preprocessing.kmeans_transformer import KMeansTransformer 17 | from supervised.preprocessing.label_binarizer import LabelBinarizer 18 | from supervised.preprocessing.label_encoder import LabelEncoder 19 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical 20 | from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues 21 | from supervised.preprocessing.scale import Scale 22 | from supervised.preprocessing.text_transformer import TextTransformer 23 | from supervised.utils.config import LOG_LEVEL 24 | 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(LOG_LEVEL) 27 | 28 | 29 | class Preprocessing(object): 30 | def __init__( 31 | self, 32 | preprocessing_params={"target_preprocessing": [], "columns_preprocessing": {}}, 33 | model_name=None, 34 | k_fold=None, 35 | repeat=None, 36 | ): 37 | self._params = preprocessing_params 38 | 39 | if "target_preprocessing" not in preprocessing_params: 40 | self._params["target_preprocessing"] = [] 41 | if "columns_preprocessing" not in preprocessing_params: 42 | self._params["columns_preprocessing"] = {} 43 | 44 | # preprocssing step attributes 45 | self._categorical_y = None 46 | self._scale_y = None 47 | self._missing_values = [] 48 | self._categorical = [] 49 | self._scale = [] 50 | self._remove_columns = [] 51 | self._datetime_transforms = [] 52 | self._text_transforms = [] 53 | self._golden_features = None 54 | self._kmeans = None 55 | self._add_random_feature = self._params.get("add_random_feature", False) 56 | self._drop_features = self._params.get("drop_features", []) 57 | self._model_name = model_name 58 | self._k_fold = k_fold 59 | self._repeat = repeat 60 | 61 | def _exclude_missing_targets(self, X=None, y=None): 62 | # check if there are missing values in target column 63 | if y is None: 64 | return X, y 65 | y_missing = pd.isnull(y) 66 | if np.sum(np.array(y_missing)) == 0: 67 | return X, y 68 | y = y.drop(y.index[y_missing]) 69 | y.index = range(y.shape[0]) 70 | if X is not None: 71 | X = X.drop(X.index[y_missing]) 72 | X.index = range(X.shape[0]) 73 | return X, y 74 | 75 | # fit and transform 76 | def fit_and_transform(self, X_train, y_train, sample_weight=None): 77 | logger.debug("Preprocessing.fit_and_transform") 78 | 79 | if y_train is not None: 80 | # target preprocessing 81 | # this must be used first, maybe we will drop some rows because of missing target values 82 | 
target_preprocessing = self._params.get("target_preprocessing") 83 | logger.debug("target_preprocessing params: {}".format(target_preprocessing)) 84 | 85 | X_train, y_train, sample_weight, _ = ExcludeRowsMissingTarget.transform( 86 | X_train, y_train, sample_weight 87 | ) 88 | 89 | if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing: 90 | logger.debug("Convert target to integer") 91 | self._categorical_y = LabelEncoder(try_to_fit_numeric=True) 92 | self._categorical_y.fit(y_train) 93 | y_train = pd.Series(self._categorical_y.transform(y_train)) 94 | 95 | if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing: 96 | logger.debug("Convert target to one-hot coding") 97 | self._categorical_y = LabelBinarizer() 98 | self._categorical_y.fit(pd.DataFrame({"target": y_train}), "target") 99 | y_train = self._categorical_y.transform( 100 | pd.DataFrame({"target": y_train}), "target" 101 | ) 102 | 103 | if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing: 104 | logger.debug("Scale log and normal") 105 | 106 | self._scale_y = Scale( 107 | ["target"], scale_method=Scale.SCALE_LOG_AND_NORMAL 108 | ) 109 | y_train = pd.DataFrame({"target": y_train}) 110 | self._scale_y.fit(y_train) 111 | y_train = self._scale_y.transform(y_train) 112 | y_train = y_train["target"] 113 | 114 | if Scale.SCALE_NORMAL in target_preprocessing: 115 | logger.debug("Scale normal") 116 | 117 | self._scale_y = Scale(["target"], scale_method=Scale.SCALE_NORMAL) 118 | y_train = pd.DataFrame({"target": y_train}) 119 | self._scale_y.fit(y_train) 120 | y_train = self._scale_y.transform(y_train) 121 | y_train = y_train["target"] 122 | 123 | # columns preprocessing 124 | columns_preprocessing = self._params.get("columns_preprocessing") 125 | for column in columns_preprocessing: 126 | transforms = columns_preprocessing[column] 127 | # logger.debug("Preprocess column {} with: {}".format(column, transforms)) 128 | 129 | # remove empty or constant columns 130 | cols_to_remove = list( 131 | filter( 132 | lambda k: "remove_column" in columns_preprocessing[k], 133 | columns_preprocessing, 134 | ) 135 | ) 136 | 137 | if X_train is not None: 138 | X_train.drop(cols_to_remove, axis=1, inplace=True) 139 | self._remove_columns = cols_to_remove 140 | 141 | numeric_cols = [] # get numeric cols before text transformations 142 | # needed for golden features 143 | if X_train is not None and ( 144 | "golden_features" in self._params or "kmeans_features" in self._params 145 | ): 146 | numeric_cols = X_train.select_dtypes(include="number").columns.tolist() 147 | 148 | # there can be missing values in the text data, 149 | # but we don't want to handle it by fill missing methods 150 | # zeros will be imputed by text_transform method 151 | cols_to_process = list( 152 | filter( 153 | lambda k: "text_transform" in columns_preprocessing[k], 154 | columns_preprocessing, 155 | ) 156 | ) 157 | 158 | new_text_columns = [] 159 | for col in cols_to_process: 160 | t = TextTransformer() 161 | t.fit(X_train, col) 162 | X_train = t.transform(X_train) 163 | self._text_transforms += [t] 164 | new_text_columns += t._new_columns 165 | # end of text transform 166 | 167 | for missing_method in [PreprocessingMissingValues.FILL_NA_MEDIAN]: 168 | cols_to_process = list( 169 | filter( 170 | lambda k: missing_method in columns_preprocessing[k], 171 | columns_preprocessing, 172 | ) 173 | ) 174 | missing = PreprocessingMissingValues(cols_to_process, missing_method) 175 | missing.fit(X_train) 176 | X_train = missing.transform(X_train) 177 | 
self._missing_values += [missing] 178 | 179 | # golden features 180 | golden_columns = [] 181 | if "golden_features" in self._params: 182 | results_path = self._params["golden_features"]["results_path"] 183 | ml_task = self._params["golden_features"]["ml_task"] 184 | features_count = self._params["golden_features"].get("features_count") 185 | n_jobs = self._params["golden_features"].get("n_jobs", -1) 186 | self._golden_features = GoldenFeaturesTransformer( 187 | results_path, ml_task, features_count, n_jobs 188 | ) 189 | self._golden_features.fit(X_train[numeric_cols], y_train) 190 | X_train = self._golden_features.transform(X_train) 191 | golden_columns = self._golden_features._new_columns 192 | 193 | kmeans_columns = [] 194 | if "kmeans_features" in self._params: 195 | results_path = self._params["kmeans_features"]["results_path"] 196 | self._kmeans = KMeansTransformer( 197 | results_path, self._model_name, self._k_fold 198 | ) 199 | self._kmeans.fit(X_train[numeric_cols], y_train) 200 | X_train = self._kmeans.transform(X_train) 201 | kmeans_columns = self._kmeans._new_features 202 | 203 | for convert_method in [ 204 | PreprocessingCategorical.CONVERT_INTEGER, 205 | PreprocessingCategorical.CONVERT_ONE_HOT 206 | ]: 207 | cols_to_process = list( 208 | filter( 209 | lambda k: convert_method in columns_preprocessing[k], 210 | columns_preprocessing, 211 | ) 212 | ) 213 | convert = PreprocessingCategorical(cols_to_process, convert_method) 214 | convert.fit(X_train, y_train) 215 | X_train = convert.transform(X_train) 216 | self._categorical += [convert] 217 | 218 | # datetime transform 219 | cols_to_process = list( 220 | filter( 221 | lambda k: "datetime_transform" in columns_preprocessing[k], 222 | columns_preprocessing, 223 | ) 224 | ) 225 | 226 | new_datetime_columns = [] 227 | for col in cols_to_process: 228 | t = DateTimeTransformer() 229 | t.fit(X_train, col) 230 | X_train = t.transform(X_train) 231 | self._datetime_transforms += [t] 232 | new_datetime_columns += t._new_columns 233 | 234 | # SCALE 235 | for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]: 236 | cols_to_process = list( 237 | filter( 238 | lambda k: scale_method in columns_preprocessing[k], 239 | columns_preprocessing, 240 | ) 241 | ) 242 | if ( 243 | len(cols_to_process) 244 | and len(new_datetime_columns) 245 | and scale_method == Scale.SCALE_NORMAL 246 | ): 247 | cols_to_process += new_datetime_columns 248 | if ( 249 | len(cols_to_process) 250 | and len(new_text_columns) 251 | and scale_method == Scale.SCALE_NORMAL 252 | ): 253 | cols_to_process += new_text_columns 254 | 255 | if ( 256 | len(cols_to_process) 257 | and len(golden_columns) 258 | and scale_method == Scale.SCALE_NORMAL 259 | ): 260 | cols_to_process += golden_columns 261 | 262 | if ( 263 | len(cols_to_process) 264 | and len(kmeans_columns) 265 | and scale_method == Scale.SCALE_NORMAL 266 | ): 267 | cols_to_process += kmeans_columns 268 | 269 | if len(cols_to_process): 270 | scale = Scale(cols_to_process) 271 | scale.fit(X_train) 272 | X_train = scale.transform(X_train) 273 | self._scale += [scale] 274 | 275 | if self._add_random_feature: 276 | # -1, 1, with 0 mean 277 | X_train["random_feature"] = np.random.rand(X_train.shape[0]) * 2.0 - 1.0 278 | 279 | if self._drop_features: 280 | available_cols = X_train.columns.tolist() 281 | drop_cols = [c for c in self._drop_features if c in available_cols] 282 | if len(drop_cols) == X_train.shape[1]: 283 | raise AutoMLException( 284 | "All features are droppped! Your data looks like random data." 
285 | ) 286 | if drop_cols: 287 | X_train.drop(drop_cols, axis=1, inplace=True) 288 | self._drop_features = drop_cols 289 | 290 | if X_train is not None: 291 | # there can be catagorical columns (in CatBoost) which cant be clipped 292 | numeric_cols = X_train.select_dtypes(include="number").columns.tolist() 293 | X_train[numeric_cols] = X_train[numeric_cols].clip( 294 | lower=np.finfo(np.float32).min + 1000, 295 | upper=np.finfo(np.float32).max - 1000, 296 | ) 297 | 298 | return X_train, y_train, sample_weight 299 | 300 | def transform(self, X_validation, y_validation, sample_weight_validation=None): 301 | logger.debug("Preprocessing.transform") 302 | 303 | # doing copy to avoid SettingWithCopyWarning 304 | if X_validation is not None: 305 | X_validation = X_validation.copy(deep=False) 306 | if y_validation is not None: 307 | y_validation = y_validation.copy(deep=False) 308 | 309 | # target preprocessing 310 | # this must be used first, maybe we will drop some rows because of missing target values 311 | if y_validation is not None: 312 | target_preprocessing = self._params.get("target_preprocessing") 313 | logger.debug("target_preprocessing -> {}".format(target_preprocessing)) 314 | 315 | ( 316 | X_validation, 317 | y_validation, 318 | sample_weight_validation, 319 | _, 320 | ) = ExcludeRowsMissingTarget.transform( 321 | X_validation, y_validation, sample_weight_validation 322 | ) 323 | 324 | if PreprocessingCategorical.CONVERT_INTEGER in target_preprocessing: 325 | if y_validation is not None and self._categorical_y is not None: 326 | y_validation = pd.Series( 327 | self._categorical_y.transform(y_validation) 328 | ) 329 | 330 | if PreprocessingCategorical.CONVERT_ONE_HOT in target_preprocessing: 331 | if y_validation is not None and self._categorical_y is not None: 332 | y_validation = self._categorical_y.transform( 333 | pd.DataFrame({"target": y_validation}), "target" 334 | ) 335 | 336 | if Scale.SCALE_LOG_AND_NORMAL in target_preprocessing: 337 | if self._scale_y is not None and y_validation is not None: 338 | logger.debug("Transform log and normalize") 339 | y_validation = pd.DataFrame({"target": y_validation}) 340 | y_validation = self._scale_y.transform(y_validation) 341 | y_validation = y_validation["target"] 342 | 343 | if Scale.SCALE_NORMAL in target_preprocessing: 344 | if self._scale_y is not None and y_validation is not None: 345 | logger.debug("Transform normalize") 346 | y_validation = pd.DataFrame({"target": y_validation}) 347 | y_validation = self._scale_y.transform(y_validation) 348 | y_validation = y_validation["target"] 349 | 350 | # columns preprocessing 351 | if len(self._remove_columns) and X_validation is not None: 352 | cols_to_remove = [ 353 | col for col in X_validation.columns if col in self._remove_columns 354 | ] 355 | X_validation.drop(cols_to_remove, axis=1, inplace=True) 356 | 357 | # text transform 358 | for tt in self._text_transforms: 359 | if X_validation is not None and tt is not None: 360 | X_validation = tt.transform(X_validation) 361 | 362 | for missing in self._missing_values: 363 | if X_validation is not None and missing is not None: 364 | X_validation = missing.transform(X_validation) 365 | 366 | # to be sure that all missing are filled 367 | # in case new data there can be gaps! 368 | if ( 369 | X_validation is not None 370 | and pd.isnull(X_validation).sum().sum() > 0 371 | and len(self._params["columns_preprocessing"]) > 0 372 | ): 373 | # there is something missing, fill it 374 | # we should notice user about it! 
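# Fallback below: fit a median imputer on the incoming data itself, so that
# columns which had no missing values during training still get filled here.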
375 | # warnings should go to the separate file ... 376 | # warnings.warn( 377 | # "There are columns {} with missing values which didnt have missing values in train dataset.".format( 378 | # list( 379 | # X_validation.columns[np.where(np.sum(pd.isnull(X_validation)))] 380 | # ) 381 | # ) 382 | # ) 383 | missing = PreprocessingMissingValues( 384 | X_validation.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 385 | ) 386 | missing.fit(X_validation) 387 | X_validation = missing.transform(X_validation) 388 | 389 | # golden features 390 | if self._golden_features is not None: 391 | X_validation = self._golden_features.transform(X_validation) 392 | 393 | if self._kmeans is not None: 394 | X_validation = self._kmeans.transform(X_validation) 395 | 396 | for convert in self._categorical: 397 | if X_validation is not None and convert is not None: 398 | X_validation = convert.transform(X_validation) 399 | 400 | for dtt in self._datetime_transforms: 401 | if X_validation is not None and dtt is not None: 402 | X_validation = dtt.transform(X_validation) 403 | 404 | for scale in self._scale: 405 | if X_validation is not None and scale is not None: 406 | X_validation = scale.transform(X_validation) 407 | 408 | if self._add_random_feature: 409 | # -1, 1, with 0 mean 410 | X_validation["random_feature"] = ( 411 | np.random.rand(X_validation.shape[0]) * 2.0 - 1.0 412 | ) 413 | 414 | if self._drop_features and X_validation is not None: 415 | X_validation.drop(self._drop_features, axis=1, inplace=True) 416 | 417 | if X_validation is not None: 418 | # there can be catagorical columns (in CatBoost) which cant be clipped 419 | numeric_cols = X_validation.select_dtypes(include="number").columns.tolist() 420 | X_validation[numeric_cols] = X_validation[numeric_cols].clip( 421 | lower=np.finfo(np.float32).min + 1000, 422 | upper=np.finfo(np.float32).max - 1000, 423 | ) 424 | 425 | return X_validation, y_validation, sample_weight_validation 426 | 427 | def inverse_scale_target(self, y): 428 | if self._scale_y is not None: 429 | y = pd.DataFrame({"target": y}) 430 | y = self._scale_y.inverse_transform(y) 431 | y = y["target"] 432 | return y 433 | 434 | def inverse_categorical_target(self, y): 435 | if self._categorical_y is not None: 436 | y = self._categorical_y.inverse_transform(y) 437 | y = y.astype(str) 438 | return y 439 | 440 | def get_target_class_names(self): 441 | pos_label, neg_label = "1", "0" 442 | if self._categorical_y is not None: 443 | if self._params["ml_task"] == BINARY_CLASSIFICATION: 444 | # binary classification 445 | for label, value in self._categorical_y.to_json().items(): 446 | if value == 1: 447 | pos_label = label 448 | else: 449 | neg_label = label 450 | return [neg_label, pos_label] 451 | else: 452 | # multiclass classification 453 | # logger.debug(self._categorical_y.to_json()) 454 | if "unique_values" not in self._categorical_y.to_json(): 455 | labels = dict( 456 | (v, k) for k, v in self._categorical_y.to_json().items() 457 | ) 458 | else: 459 | labels = { 460 | i: v 461 | for i, v in enumerate( 462 | self._categorical_y.to_json()["unique_values"] 463 | ) 464 | } 465 | 466 | return list(labels.values()) 467 | 468 | else: # self._categorical_y is None 469 | if "ml_task" in self._params: 470 | if self._params["ml_task"] == BINARY_CLASSIFICATION: 471 | return ["0", "1"] 472 | return [] 473 | 474 | def prepare_target_labels(self, y): 475 | pos_label, neg_label = "1", "0" 476 | 477 | if self._categorical_y is not None: 478 | if len(y.shape) == 1: 479 | # binary classification 480 | for 
label, value in self._categorical_y.to_json().items(): 481 | if value == 1: 482 | pos_label = label 483 | else: 484 | neg_label = label 485 | # threshold is applied in AutoML class 486 | return pd.DataFrame( 487 | { 488 | "prediction_{}".format(neg_label): 1 - y, 489 | "prediction_{}".format(pos_label): y, 490 | } 491 | ) 492 | else: 493 | # multiclass classification 494 | if "unique_values" not in self._categorical_y.to_json(): 495 | labels = dict( 496 | (v, k) for k, v in self._categorical_y.to_json().items() 497 | ) 498 | else: 499 | labels = { 500 | i: v 501 | for i, v in enumerate( 502 | self._categorical_y.to_json()["unique_values"] 503 | ) 504 | } 505 | 506 | d = {} 507 | cols = [] 508 | for i in range(y.shape[1]): 509 | d["prediction_{}".format(labels[i])] = y[:, i] 510 | cols += ["prediction_{}".format(labels[i])] 511 | df = pd.DataFrame(d) 512 | df["label"] = np.argmax(np.array(df[cols]), axis=1) 513 | 514 | df["label"] = df["label"].map(labels) 515 | 516 | return df 517 | else: # self._categorical_y is None 518 | if "ml_task" in self._params: 519 | if self._params["ml_task"] == BINARY_CLASSIFICATION: 520 | return pd.DataFrame({"prediction_0": 1 - y, "prediction_1": y}) 521 | elif self._params["ml_task"] == MULTICLASS_CLASSIFICATION: 522 | return pd.DataFrame( 523 | data=y, 524 | columns=["prediction_{}".format(i) for i in range(y.shape[1])], 525 | ) 526 | 527 | return pd.DataFrame({"prediction": y}) 528 | 529 | def to_json(self): 530 | preprocessing_params = {} 531 | if self._remove_columns: 532 | preprocessing_params["remove_columns"] = self._remove_columns 533 | if self._missing_values is not None and len(self._missing_values): 534 | mvs = [] # refactor 535 | for mv in self._missing_values: 536 | if mv.to_json(): 537 | mvs += [mv.to_json()] 538 | if mvs: 539 | preprocessing_params["missing_values"] = mvs 540 | if self._categorical is not None and len(self._categorical): 541 | cats = [] # refactor 542 | for cat in self._categorical: 543 | if cat.to_json(): 544 | cats += [cat.to_json()] 545 | if cats: 546 | preprocessing_params["categorical"] = cats 547 | 548 | if self._datetime_transforms is not None and len(self._datetime_transforms): 549 | dtts = [] 550 | for dtt in self._datetime_transforms: 551 | dtts += [dtt.to_json()] 552 | if dtts: 553 | preprocessing_params["datetime_transforms"] = dtts 554 | 555 | if self._text_transforms is not None and len(self._text_transforms): 556 | tts = [] 557 | for tt in self._text_transforms: 558 | tts += [tt.to_json()] 559 | if tts: 560 | preprocessing_params["text_transforms"] = tts 561 | 562 | if self._golden_features is not None: 563 | preprocessing_params["golden_features"] = self._golden_features.to_json() 564 | 565 | if self._kmeans is not None: 566 | preprocessing_params["kmeans"] = self._kmeans.to_json() 567 | 568 | if self._scale is not None and len(self._scale): 569 | scs = [sc.to_json() for sc in self._scale if sc.to_json()] 570 | if scs: 571 | preprocessing_params["scale"] = scs 572 | if self._categorical_y is not None: 573 | cat_y = self._categorical_y.to_json() 574 | if cat_y: 575 | preprocessing_params["categorical_y"] = cat_y 576 | if self._scale_y is not None: 577 | preprocessing_params["scale_y"] = self._scale_y.to_json() 578 | 579 | if "ml_task" in self._params: 580 | preprocessing_params["ml_task"] = self._params["ml_task"] 581 | 582 | if self._add_random_feature: 583 | preprocessing_params["add_random_feature"] = True 584 | 585 | if self._drop_features: 586 | preprocessing_params["drop_features"] = self._drop_features 587 
| 588 | preprocessing_params["params"] = self._params 589 | 590 | return preprocessing_params 591 | 592 | def from_json(self, data_json, results_path): 593 | self._params = data_json.get("params", self._params) 594 | 595 | if "remove_columns" in data_json: 596 | self._remove_columns = data_json.get("remove_columns", []) 597 | if "missing_values" in data_json: 598 | self._missing_values = [] 599 | for mv_data in data_json["missing_values"]: 600 | mv = PreprocessingMissingValues() 601 | mv.from_json(mv_data) 602 | self._missing_values += [mv] 603 | if "categorical" in data_json: 604 | self._categorical = [] 605 | for cat_data in data_json["categorical"]: 606 | cat = PreprocessingCategorical() 607 | cat.from_json(cat_data) 608 | self._categorical += [cat] 609 | 610 | if "datetime_transforms" in data_json: 611 | self._datetime_transforms = [] 612 | for dtt_params in data_json["datetime_transforms"]: 613 | dtt = DateTimeTransformer() 614 | dtt.from_json(dtt_params) 615 | self._datetime_transforms += [dtt] 616 | 617 | if "text_transforms" in data_json: 618 | self._text_transforms = [] 619 | for tt_params in data_json["text_transforms"]: 620 | tt = TextTransformer() 621 | tt.from_json(tt_params) 622 | self._text_transforms += [tt] 623 | 624 | if "golden_features" in data_json: 625 | self._golden_features = GoldenFeaturesTransformer() 626 | self._golden_features.from_json(data_json["golden_features"], results_path) 627 | 628 | if "kmeans" in data_json: 629 | self._kmeans = KMeansTransformer() 630 | self._kmeans.from_json(data_json["kmeans"], results_path) 631 | 632 | if "scale" in data_json: 633 | self._scale = [] 634 | for scale_data in data_json["scale"]: 635 | sc = Scale() 636 | sc.from_json(scale_data) 637 | self._scale += [sc] 638 | if "categorical_y" in data_json: 639 | if "new_columns" in data_json["categorical_y"]: 640 | self._categorical_y = LabelBinarizer() 641 | else: 642 | self._categorical_y = LabelEncoder() 643 | 644 | self._categorical_y.from_json(data_json["categorical_y"]) 645 | if "scale_y" in data_json: 646 | self._scale_y = Scale() 647 | self._scale_y.from_json(data_json["scale_y"]) 648 | if "ml_task" in data_json: 649 | self._params["ml_task"] = data_json["ml_task"] 650 | 651 | self._add_random_feature = data_json.get("add_random_feature", False) 652 | self._drop_features = data_json.get("drop_features", []) 653 | ``` -------------------------------------------------------------------------------- /supervised/automl.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | import matplotlib 4 | 5 | import warnings 6 | 7 | warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") 8 | 9 | from collections.abc import Iterable 10 | 11 | # libraries for type hints 12 | from typing import List, Optional, Union 13 | 14 | import numpy 15 | import pandas 16 | from typing_extensions import ( 17 | Literal, 18 | ) # typing_extensions is used for using Literal from python 3.7 19 | 20 | from supervised.base_automl import BaseAutoML 21 | from supervised.utils.config import LOG_LEVEL 22 | 23 | logging.basicConfig( 24 | format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR 25 | ) 26 | logger = logging.getLogger(__name__) 27 | logger.setLevel(LOG_LEVEL) 28 | 29 | 30 | class AutoML(BaseAutoML): 31 | 32 | """ 33 | Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression). 
34 | """ 35 | 36 | def __init__( 37 | self, 38 | results_path: Optional[str] = None, 39 | total_time_limit: int = 60 * 60, 40 | mode: Literal["Explain", "Perform", "Compete", "Optuna"] = "Explain", 41 | ml_task: Literal[ 42 | "auto", "binary_classification", "multiclass_classification", "regression" 43 | ] = "auto", 44 | model_time_limit: Optional[int] = None, 45 | algorithms: Union[ 46 | Literal["auto"], 47 | List[ 48 | Literal[ 49 | "Baseline", 50 | "Linear", 51 | "Decision Tree", 52 | "Random Forest", 53 | "Extra Trees", 54 | "LightGBM", 55 | "Xgboost", 56 | "CatBoost", 57 | "Neural Network", 58 | "Nearest Neighbors", 59 | ] 60 | ], 61 | ] = "auto", 62 | train_ensemble: bool = True, 63 | stack_models: Union[Literal["auto"], bool] = "auto", 64 | eval_metric: str = "auto", 65 | validation_strategy: Union[Literal["auto"], dict] = "auto", 66 | explain_level: Union[Literal["auto"], Literal[0, 1, 2]] = "auto", 67 | golden_features: Union[Literal["auto"], bool, int] = "auto", 68 | features_selection: Union[Literal["auto"], bool] = "auto", 69 | start_random_models: Union[Literal["auto"], int] = "auto", 70 | hill_climbing_steps: Union[Literal["auto"], int] = "auto", 71 | top_models_to_improve: Union[Literal["auto"], int] = "auto", 72 | boost_on_errors: Union[Literal["auto"], bool] = "auto", 73 | kmeans_features: Union[Literal["auto"], bool] = "auto", 74 | mix_encoding: Union[Literal["auto"], bool] = "auto", 75 | max_single_prediction_time: Optional[Union[int, float]] = None, 76 | optuna_time_budget: Optional[int] = None, 77 | optuna_init_params: dict = {}, 78 | optuna_verbose: bool = True, 79 | fairness_metric: str = "auto", 80 | fairness_threshold: Union[Literal["auto"], float] = "auto", 81 | privileged_groups: Union[Literal["auto"], list] = "auto", 82 | underprivileged_groups: Union[Literal["auto"], list] = "auto", 83 | n_jobs: int = -1, 84 | verbose: int = 1, 85 | random_state: int = 1234, 86 | ): 87 | """ 88 | Initialize `AutoML` object. 89 | 90 | Arguments: 91 | results_path (str): The path with results. If None, then the name of directory will be generated with the template: AutoML_{number}, 92 | where the number can be from 1 to 1,000 - depends which direcory name will be available. 93 | If the `results_path` will point to directory with AutoML results (`params.json` must be present), 94 | then all models will be loaded. 95 | 96 | total_time_limit (int): The total time limit in seconds for AutoML training. 97 | It is not used when `model_time_limit` is not `None`. 98 | 99 | mode (str): Can be {`Explain`, `Perform`, `Compete`, `Optuna`}. This parameter defines the goal of AutoML and how intensive the AutoML search will be. 100 | 101 | - `Explain` : To to be used when the user wants to explain and understand the data. 102 | - Uses 75%/25% train/test split. 103 | - Uses the following models: `Baseline`, `Linear`, `Decision Tree`, `Random Forest`, `XGBoost`, `Neural Network`, and `Ensemble`. 104 | - Has full explanations in reports: learning curves, importance plots, and SHAP plots. 105 | - `Perform` : To be used when the user wants to train a model that will be used in real-life use cases. 106 | - Uses 5-fold CV (Cross-Validation). 107 | - Uses the following models: `Linear`, `Random Forest`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`, and `Ensemble`. 108 | - Has learning curves and importance plots in reports. 109 | - `Compete` : To be used for machine learning competitions (maximum performance). 
110 | - Uses 80/20 train/test split, or 5-fold CV, or 10-fold CV (Cross-Validation) - it depends on `total_time_limit`. If not set directly, AutoML will select validation automatically. 111 | - Uses the following models: `Decision Tree`, `Random Forest`, `Extra Trees`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`, 112 | `Nearest Neighbors`, `Ensemble`, and `Stacking`. 113 | - It has only learning curves in the reports. 114 | - `Optuna` : To be used for creating highly-tuned machine learning models. 115 | - Uses 10-fold CV (Cross-Validation). 116 | - It tunes with Optuna the following algorithms: `Random Forest`, `Extra Trees`, `LightGBM`, `XGBoost`, `CatBoost`, `Neural Network`. 117 | - It applies `Ensemble` and `Stacking` for trained models. 118 | - It has only learning curves in the reports. 119 | 120 | ml_task (str): Can be {"auto", "binary_classification", "multiclass_classification", "regression"}. 121 | 122 | - If left `auto` AutoML will try to guess the task based on target values. 123 | - If there are only 2 values in the target, then the task will be set to `"binary_classification"`. 124 | - If the number of values in the target is between 2 and 20 (inclusive), then the task will be set to `"multiclass_classification"`. 125 | - In all other cases, the task is set to `"regression"`. 126 | 127 | model_time_limit (int): The time limit for training a single model, in seconds. 128 | If `model_time_limit` is set, the `total_time_limit` is not respected. 129 | The single model can contain several learners. The time limit for subsequent learners is computed based on `model_time_limit`. 130 | 131 | For example, in the case of 10-fold cross-validation, one model will have 10 learners. 132 | The `model_time_limit` is the time for all 10 learners. 133 | 134 | algorithms (list of str): The list of algorithms that will be used in the training. 135 | The algorithms can be: 136 | 137 | - `Baseline`, 138 | - `Linear`, 139 | - `Decision Tree`, 140 | - `Random Forest`, 141 | - `Extra Trees`, 142 | - `LightGBM`, 143 | - `Xgboost`, 144 | - `CatBoost`, 145 | - `Neural Network`, 146 | - `Nearest Neighbors`, 147 | 148 | 149 | train_ensemble (boolean): Whether an ensemble gets created at the end of the training. 150 | 151 | stack_models (boolean): Whether a models stack gets created at the end of the training. Stack level is 1. 152 | 153 | eval_metric (str): The metric to be used in early stopping and to compare models. 154 | 155 | - for binary classification: `logloss`, `auc`, `f1`, `average_precision`, `accuracy` - default is logloss (if left "auto") 156 | - for multiclass classification: `logloss`, `f1`, `accuracy` - default is `logloss` (if left "auto") 157 | - for regression: `rmse`, `mse`, `mae`, `r2`, `mape`, `spearman`, `pearson` - default is `rmse` (if left "auto") 158 | 159 | validation_strategy (dict): Dictionary with validation type. Right now train/test split and cross-validation are supported. 160 | 161 | Example: 162 | 163 | Cross-validation example: 164 | { 165 | "validation_type": "kfold", 166 | "k_folds": 5, 167 | "shuffle": True, 168 | "stratify": True, 169 | "random_seed": 123 170 | } 171 | 172 | Train/test example: 173 | { 174 | "validation_type": "split", 175 | "train_ratio": 0.75, 176 | "shuffle": True, 177 | "stratify": True 178 | } 179 | 180 | explain_level (int): The level of explanations included for each model: 181 | 182 | - if `explain_level` is `0` no explanations are produced.
183 | - if `explain_level` is `1` the following explanations are produced: an importance plot (with the permutation method), tree plots for decision trees, and saved coefficients for linear models. 184 | - if `explain_level` is `2` the following explanations are produced: the same as `1` plus SHAP explanations. 185 | 186 | If left `auto` AutoML will produce explanations based on the selected `mode`. 187 | 188 | golden_features (boolean or int): Whether to use golden features (and how many should be added). 189 | If left `auto` AutoML will use golden features based on the selected `mode`: 190 | 191 | - If `mode` is "Explain", `golden_features` = False. 192 | - If `mode` is "Perform", `golden_features` = True. 193 | - If `mode` is "Compete", `golden_features` = True. 194 | 195 | If a `boolean` value is set, then the number of Golden Features is set automatically. 196 | It is set to min(100, max(10, 0.1*number_of_input_features)). 197 | 198 | If an `int` value is set, the number of Golden Features is set to this value. 199 | 200 | features_selection (boolean): Whether to perform features selection. 201 | If left `auto` AutoML will do feature selection based on the selected `mode`: 202 | 203 | - If `mode` is "Explain", `features_selection` = False. 204 | - If `mode` is "Perform", `features_selection` = True. 205 | - If `mode` is "Compete", `features_selection` = True. 206 | 207 | start_random_models (int): Number of starting random models to try. 208 | If left `auto` AutoML will select it based on the selected `mode`: 209 | 210 | - If `mode` is "Explain", `start_random_models` = 1. 211 | - If `mode` is "Perform", `start_random_models` = 5. 212 | - If `mode` is "Compete", `start_random_models` = 10. 213 | 214 | hill_climbing_steps (int): Number of steps to perform during hill climbing. 215 | If left `auto` AutoML will select it based on the selected `mode`: 216 | 217 | - If `mode` is "Explain", `hill_climbing_steps` = 0. 218 | - If `mode` is "Perform", `hill_climbing_steps` = 2. 219 | - If `mode` is "Compete", `hill_climbing_steps` = 2. 220 | 221 | top_models_to_improve (int): Number of best models to improve in `hill_climbing` steps. 222 | If left `auto` AutoML will select it based on the selected `mode`: 223 | 224 | - If `mode` is "Explain", `top_models_to_improve` = 0. 225 | - If `mode` is "Perform", `top_models_to_improve` = 2. 226 | - If `mode` is "Compete", `top_models_to_improve` = 3. 227 | 228 | boost_on_errors (boolean): Whether a model with boost on errors from the previous best model should be trained. By default available in the `Compete` mode. 229 | 230 | kmeans_features (boolean): Whether a model with k-means generated features should be trained. By default available in the `Compete` mode. 231 | 232 | mix_encoding (boolean): Whether a model with mixed encoding should be trained. Mixed encoding is the encoding that uses label encoding 233 | for categoricals with more than 25 categories, and one-hot binary encoding for other categoricals. It is only applied if there are 234 | categorical features with cardinality smaller than 25. By default it is available in the `Compete` mode. 235 | 236 | max_single_prediction_time (int or float): The limit on prediction time for a single sample. Use it if you want to have a model with fast predictions. 237 | Ideal for creating ML pipelines used as a REST API. Time is in seconds. By default (`max_single_prediction_time=None`) models are not optimized for fast predictions, 238 | except the mode `Perform`. For the mode `Perform` the default is `0.5` seconds.
239 | 240 | optuna_time_budget (int): The time in seconds which should be used by Optuna to tune each algorithm. It is the time for tuning a single algorithm. 241 | If you select two algorithms: Xgboost and CatBoost, and set optuna_time_budget=1000, then Xgboost will be tuned for 1000 seconds and CatBoost will be tuned for 1000 seconds. 242 | Moreover, the tuning is performed for each data type, for example for raw data and for data with Golden Features inserted. 243 | This parameter is only used when `mode="Optuna"`. If you set `mode="Optuna"` and forget to set this parameter, it will be set to 3600 seconds. 244 | 245 | optuna_init_params (dict): If you have already tuned parameters from Optuna you can reuse them by setting this parameter. 246 | This parameter is only used when `mode="Optuna"`. The dict should have the structure and params as specified in the MLJAR AutoML documentation. 247 | 248 | optuna_verbose (boolean): If True, the Optuna tuning details are displayed. Set to `True` by default. 249 | 250 | fairness_metric (string): Name of the fairness metric that will be used for assessing fairness criteria. 251 | Available metrics for binary and multiclass classification: 252 | 253 | - `demographic_parity_difference`, 254 | - `demographic_parity_ratio` - default metric, 255 | - `equalized_odds_difference`, 256 | - `equalized_odds_ratio`. 257 | 258 | Metrics for regression: 259 | 260 | - `group_loss_difference`, 261 | - `group_loss_ratio` - default metric. 262 | 263 | 264 | fairness_threshold (float): The threshold value for the fairness metric. 265 | The optimization direction (below or above the threshold) of the fairness metric is determined automatically. 266 | 267 | Default values: 268 | 269 | - for `demographic_parity_difference` the metric value should be below 0.1, 270 | - for `demographic_parity_ratio` the metric value should be above 0.8, 271 | - for `equalized_odds_difference` the metric value should be below 0.1, 272 | - for `equalized_odds_ratio` the metric value should be above 0.8. 273 | - for `group_loss_ratio` the metric value should be above 0.8. 274 | 275 | For `group_loss_difference` the default threshold value can't be set because it depends on the dataset. 276 | If the `group_loss_difference` metric is used and `fairness_threshold` is not specified manually, then an exception will be raised. 277 | 278 | privileged_groups (list): The list of privileged groups. 279 | 280 | By default, the list of privileged groups is detected automatically based on fairness metrics. 281 | For example, in a binary classification task, a privileged group is the one with the highest selection rate. 282 | 283 | Example value: `[{"sex": "Male"}]` 284 | 285 | underprivileged_groups (list): The list of underprivileged groups. 286 | 287 | By default, the list of underprivileged groups is detected automatically based on fairness metrics. 288 | For example, in a binary classification task, an underprivileged group is the one with the lowest selection rate. 289 | 290 | Example value: `[{"sex": "Female"}]` 291 | 292 | n_jobs (int): Number of CPU cores to be used. By default it is set to `-1`, which means using all processors. 293 | 294 | verbose (int): Controls the verbosity when fitting and predicting.
295 | 296 | Note: 297 | Not implemented yet, please leave it set to `1` 298 | 299 | random_state (int): Controls the randomness of the `AutoML`. 300 | 301 | 302 | Examples: 303 | 304 | Binary Classification Example: 305 | 306 | >>> import pandas as pd 307 | >>> from sklearn.model_selection import train_test_split 308 | >>> from sklearn.metrics import roc_auc_score 309 | >>> from supervised import AutoML 310 | >>> df = pd.read_csv( 311 | ... "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv", 312 | ... skipinitialspace=True 313 | ... ) 314 | >>> X_train, X_test, y_train, y_test = train_test_split( 315 | ... df[df.columns[:-1]], df["income"], test_size=0.25 316 | ... ) 317 | >>> automl = AutoML() 318 | >>> automl.fit(X_train, y_train) 319 | >>> y_pred_prob = automl.predict_proba(X_test) 320 | >>> print(f"AUROC: {roc_auc_score(y_test, y_pred_prob[:, 1]):.2f}") 321 | 322 | 323 | Multi-Class Classification Example: 324 | 325 | >>> import pandas as pd 326 | >>> from sklearn.datasets import load_digits 327 | >>> from sklearn.metrics import accuracy_score 328 | >>> from sklearn.model_selection import train_test_split 329 | >>> from supervised import AutoML 330 | >>> digits = load_digits() 331 | >>> X_train, X_test, y_train, y_test = train_test_split( 332 | ... digits.data, digits.target, stratify=digits.target, test_size=0.25, 333 | ... random_state=123 334 | ... ) 335 | >>> automl = AutoML(mode="Perform") 336 | >>> automl.fit(X_train, y_train) 337 | >>> y_pred = automl.predict(X_test) 338 | >>> print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}") 339 | 340 | Regression Example: 341 | 342 | >>> import pandas as pd 343 | >>> from sklearn.datasets import fetch_california_housing 344 | >>> from sklearn.model_selection import train_test_split 345 | >>> from sklearn.metrics import mean_squared_error 346 | >>> from supervised import AutoML 347 | >>> housing = fetch_california_housing() 348 | >>> X_train, X_test, y_train, y_test = train_test_split( 349 | ... pd.DataFrame(housing.data, columns=housing.feature_names), 350 | ... housing.target, 351 | ... test_size=0.25, 352 | ... random_state=123, 353 | ...
) 354 | >>> automl = AutoML(mode="Compete") 355 | >>> automl.fit(X_train, y_train) 356 | >>> print("Test R^2:", automl.score(X_test, y_test)) 357 | 358 | Scikit-learn Pipeline Integration Example: 359 | 360 | >>> from imblearn.over_sampling import RandomOverSampler 361 | >>> from sklearn.pipeline import make_pipeline 362 | >>> from sklearn.datasets import make_classification 363 | >>> from sklearn.model_selection import train_test_split 364 | >>> from supervised import AutoML 365 | >>> X, y = make_classification() 366 | >>> X_train, X_test, y_train, y_test = train_test_split(X,y) 367 | >>> pipeline = make_pipeline(RandomOverSampler(), AutoML()) 368 | >>> print(pipeline.fit(X_train, y_train).score(X_test, y_test)) 369 | 370 | """ 371 | super(AutoML, self).__init__() 372 | # Set user arguments 373 | self.mode = mode 374 | self.ml_task = ml_task 375 | self.results_path = results_path 376 | self.total_time_limit = total_time_limit 377 | self.model_time_limit = model_time_limit 378 | self.algorithms = algorithms 379 | self.train_ensemble = train_ensemble 380 | self.stack_models = stack_models 381 | self.eval_metric = eval_metric 382 | self.validation_strategy = validation_strategy 383 | self.verbose = verbose 384 | self.explain_level = explain_level 385 | self.golden_features = golden_features 386 | self.features_selection = features_selection 387 | self.start_random_models = start_random_models 388 | self.hill_climbing_steps = hill_climbing_steps 389 | self.top_models_to_improve = top_models_to_improve 390 | self.boost_on_errors = boost_on_errors 391 | self.kmeans_features = kmeans_features 392 | self.mix_encoding = mix_encoding 393 | self.max_single_prediction_time = max_single_prediction_time 394 | self.optuna_time_budget = optuna_time_budget 395 | self.optuna_init_params = optuna_init_params 396 | self.optuna_verbose = optuna_verbose 397 | self.fairness_metric = fairness_metric 398 | self.fairness_threshold = fairness_threshold 399 | self.privileged_groups = privileged_groups 400 | self.underprivileged_groups = underprivileged_groups 401 | self.n_jobs = n_jobs 402 | self.random_state = random_state 403 | 404 | def fit( 405 | self, 406 | X: Union[numpy.ndarray, pandas.DataFrame], 407 | y: Union[numpy.ndarray, pandas.Series], 408 | sample_weight: Optional[Union[numpy.ndarray, pandas.Series]] = None, 409 | cv: Optional[Union[Iterable, List]] = None, 410 | sensitive_features: Optional[ 411 | Union[numpy.ndarray, pandas.Series, pandas.DataFrame] 412 | ] = None, 413 | ): 414 | """Fit the AutoML model. 415 | 416 | Arguments: 417 | X (numpy.ndarray or pandas.DataFrame): Training data 418 | 419 | y (numpy.ndarray or pandas.Series): Training targets 420 | 421 | sample_weight (numpy.ndarray or pandas.Series): Training sample weights 422 | 423 | cv (iterable or list): List or iterable with (train, validation) splits representing array of indices. 424 | It is used only with custom validation (`validation_strategy={'validation_type': 'custom'}`). 
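A hypothetical sketch of passing custom splits (the `X` and `y` names are assumptions):

>>> from sklearn.model_selection import KFold
>>> automl = AutoML(validation_strategy={"validation_type": "custom"})
>>> automl.fit(X, y, cv=KFold(n_splits=5, shuffle=True).split(X))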
425 | 426 | sensitive_features (pandas.Series or pandas.DataFrame): Sensitive features to learn fair models 427 | 428 | Returns: 429 | AutoML object: Returns `self` 430 | """ 431 | original_backend = matplotlib.get_backend() 432 | try: 433 | matplotlib.use("Agg") 434 | return self._fit(X, y, sample_weight, cv, sensitive_features) 435 | except Exception as e: 436 | raise e 437 | finally: 438 | matplotlib.use(original_backend) 439 | try: 440 | if 'inline' in original_backend: 441 | import matplotlib_inline 442 | matplotlib_inline.backend_inline._enable_matplotlib_integration() 443 | except Exception: 444 | pass 445 | 446 | 447 | def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndarray: 448 | """ 449 | Computes predictions from AutoML best model. 450 | 451 | Arguments: 452 | X (list or numpy.ndarray or pandas.DataFrame): 453 | Input values to make predictions on. 454 | 455 | Returns: 456 | numpy.ndarray: 457 | 458 | - One-dimensional array of class labels for classification. 459 | - One-dimensional array of predictions for regression. 460 | 461 | Raises: 462 | AutoMLException: Model has not yet been fitted. 463 | """ 464 | return self._predict(X) 465 | 466 | def predict_proba( 467 | self, X: Union[List, numpy.ndarray, pandas.DataFrame] 468 | ) -> numpy.ndarray: 469 | """ 470 | Computes class probabilities from AutoML best model. 471 | This method can only be used for classification tasks. 472 | 473 | Arguments: 474 | X (list or numpy.ndarray or pandas.DataFrame): 475 | Input values to make predictions on. 476 | 477 | Returns: 478 | numpy.ndarray of shape (n_samples, n_classes): 479 | Matrix containing class probabilities of the input samples 480 | 481 | Raises: 482 | AutoMLException: Model has not yet been fitted. 483 | 484 | """ 485 | return self._predict_proba(X) 486 | 487 | def predict_all( 488 | self, X: Union[List, numpy.ndarray, pandas.DataFrame] 489 | ) -> pandas.DataFrame: 490 | """ 491 | Computes both class probabilities and class labels for classification tasks. 492 | Computes predictions for regression tasks. 493 | 494 | Arguments: 495 | X (list or numpy.ndarray or pandas.DataFrame): 496 | Input values to make predictions on. 497 | 498 | Returns: 499 | pandas.DataFrame: 500 | DataFrame (n_samples, n_classes + 1) containing both class probabilities and class 501 | labels of the input samples for classification tasks. 502 | DataFrame with predictions for regression tasks. 503 | 504 | Raises: 505 | AutoMLException: Model has not yet been fitted. 506 | 507 | """ 508 | return self._predict_all(X) 509 | 510 | def score( 511 | self, 512 | X: Union[numpy.ndarray, pandas.DataFrame], 513 | y: Optional[Union[numpy.ndarray, pandas.Series]] = None, 514 | sample_weight: Optional[Union[numpy.ndarray, pandas.Series]] = None, 515 | ) -> float: 516 | """Calculates a goodness of `fit` for an AutoML instance. 517 | 518 | Arguments: 519 | X (numpy.ndarray or pandas.DataFrame): 520 | Test values to make predictions on. 521 | 522 | y (numpy.ndarray or pandas.Series): 523 | True labels for X. 524 | 525 | sample_weight (numpy.ndarray or pandas.Series): 526 | Sample weights. 527 | Returns: 528 | float: Returns a goodness of fit measure (higher is better): 529 | 530 | - For classification tasks: returns the mean accuracy on the given test data and labels. 531 | - For regression tasks: returns the R^2 (coefficient of determination) on the given test data and labels.
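Example (hypothetical; assumes a fitted instance and held-out `X_test`, `y_test`):

>>> automl.score(X_test, y_test) # mean accuracy or R^2, depending on the task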
532 | """ 533 | return self._score(X, y, sample_weight) 534 | 535 | def report(self, width=900, height=1200): 536 | return self._report(width, height) 537 | 538 | def need_retrain( 539 | self, 540 | X: Union[numpy.ndarray, pandas.DataFrame], 541 | y: Union[numpy.ndarray, pandas.Series], 542 | sample_weight: Optional[Union[numpy.ndarray, pandas.Series]] = None, 543 | decrease: float = 0.1, 544 | ) -> bool: 545 | """Decides about model retraining based on new data. 546 | 547 | Arguments: 548 | X (numpy.ndarray or pandas.DataFrame): 549 | New data. 550 | 551 | y (numpy.ndarray or pandas.Series): 552 | True labels for X. 553 | 554 | sample_weight (numpy.ndarray or pandas.Series): 555 | Sample weights. 556 | 557 | decrease (float): The ratio of change in the performance used as a threshold for retraining decision. 558 | By default, it is set to `0.1` which means that if the performance of AutoML will decrease by 10% 559 | on new data then there is a need to retrain. This value should be set depending on your project needs. 560 | Sometimes, 10% is enough, but for some projects, it can be even lower than 1%. 561 | 562 | Returns: 563 | boolean: Decides if there is a need to retrain the AutoML. 564 | """ 565 | return self._need_retrain(X, y, sample_weight, decrease) 566 | ```