This is page 14 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ 
├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── 
PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── 
test_importance.py │ ├── test_learning_curves.py │ ├── test_metric.py │ ├── test_shap.py │ └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py ``` # Files -------------------------------------------------------------------------------- /supervised/base_automl.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import logging 3 | import os 4 | import shutil 5 | import time 6 | import types 7 | import uuid 8 | from abc import ABC 9 | from copy import deepcopy 10 | 11 | import joblib 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.base import BaseEstimator 15 | from sklearn.metrics import accuracy_score, r2_score 16 | from sklearn.utils.validation import check_array 17 | from tabulate import tabulate 18 | 19 | from supervised.algorithms.registry import ( 20 | BINARY_CLASSIFICATION, 21 | MULTICLASS_CLASSIFICATION, 22 | REGRESSION, 23 | AlgorithmsRegistry, 24 | ) 25 | from supervised.callbacks.early_stopping import EarlyStopping 26 | from supervised.callbacks.total_time_constraint import TotalTimeConstraint 27 | from supervised.ensemble import Ensemble 28 | from supervised.exceptions import AutoMLException, NotTrainedException 29 | from supervised.model_framework import ModelFramework 30 | from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget 31 | # disable EDA 32 | # from supervised.preprocessing.eda import EDA 33 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 34 | from supervised.tuner.data_info import DataInfo 35 | from supervised.tuner.mljar_tuner import MljarTuner 36 | from supervised.tuner.time_controller import TimeController 37 | from supervised.utils.automl_plots import AutoMLPlots 38 | from supervised.utils.config import LOG_LEVEL 39 | from supervised.utils.data_validation import ( 40 | check_bool, 41 | check_greater_than_zero_integer, 42 | 
check_greater_than_zero_integer_or_float, 43 | check_integer, 44 | check_positive_integer, 45 | ) 46 | from supervised.utils.jsonencoder import MLJSONEncoder 47 | from supervised.utils.leaderboard_plots import LeaderboardPlots 48 | from supervised.utils.metric import Metric, UserDefinedEvalMetric 49 | from supervised.utils.utils import dump_data, load_data 50 | 51 | logger = logging.getLogger(__name__) 52 | logger.setLevel(LOG_LEVEL) 53 | 54 | 55 | class BaseAutoML(BaseEstimator, ABC): 56 | """ 57 | Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression). 58 | Warning: This class should not be used directly. Use derived classes instead. 59 | """ 60 | 61 | def __init__(self): 62 | logger.debug("BaseAutoML.__init__") 63 | self._mode = None 64 | self._ml_task = None 65 | self._results_path = None 66 | self._total_time_limit = None 67 | self._model_time_limit = None 68 | self._algorithms = [] 69 | self._train_ensemble = False 70 | self._stack_models = False 71 | self._eval_metric = None 72 | self._validation_strategy = None 73 | self._verbose = None 74 | self._explain_level = None 75 | self._golden_features = None 76 | self._features_selection = None 77 | self._start_random_models = None 78 | self._hill_climbing_steps = None 79 | self._top_models_to_improve = None 80 | self._random_state = 1234 81 | self._models = [] # instances of iterative learner framework or ensemble 82 | self._best_model = None 83 | self._verbose = True 84 | self._threshold = None # used only in classification 85 | self._metrics_details = None 86 | self._max_metrics = None 87 | self._confusion_matrix = None 88 | self._X_path, self._y_path = None, None 89 | self._data_info = None 90 | self._model_subpaths = [] 91 | self._stacked_models = None 92 | self._fit_level = None 93 | self._start_time = time.time() 94 | self._time_ctrl = None 95 | self._all_params = {} 96 | # https://scikit-learn.org/stable/developers/develop.html#universal-attributes 97 
| self.n_features_in_ = None # for scikit-learn api 98 | self.tuner = None 99 | self._boost_on_errors = None 100 | self._kmeans_features = None 101 | self._mix_encoding = None 102 | self._max_single_prediction_time = None 103 | self._optuna_time_budget = None 104 | self._optuna_init_params = {} 105 | self._fairness_metric = None 106 | self._fairness_threshold = None 107 | self._privileged_groups = [] 108 | self._underprivileged_groups = [] 109 | self._optuna_verbose = True 110 | self._n_jobs = -1 111 | self._id = str(uuid.uuid4()) 112 | 113 | def _get_tuner_params( 114 | self, start_random_models, hill_climbing_steps, top_models_to_improve 115 | ): 116 | return { 117 | "start_random_models": start_random_models, 118 | "hill_climbing_steps": hill_climbing_steps, 119 | "top_models_to_improve": top_models_to_improve, 120 | } 121 | 122 | def _check_can_load(self): 123 | """Checks if AutoML can be loaded from a folder""" 124 | if self.results_path is not None: 125 | # Dir exists and can be loaded 126 | if os.path.exists(self.results_path) and os.path.exists( 127 | os.path.join(self.results_path, "params.json") 128 | ): 129 | self.load(self.results_path) 130 | self._results_path = self.results_path 131 | 132 | def load(self, path): 133 | logger.info("Loading AutoML models ...") 134 | try: 135 | with open(os.path.join(path, "params.json")) as file: 136 | params = json.load(file) 137 | 138 | self._model_subpaths = params["saved"] 139 | self._mode = params.get("mode", self._mode) 140 | self._ml_task = params.get("ml_task", self._ml_task) 141 | self._results_path = params.get("results_path", self._results_path) 142 | self._total_time_limit = params.get( 143 | "total_time_limit", self._total_time_limit 144 | ) 145 | self._model_time_limit = params.get( 146 | "model_time_limit", self._model_time_limit 147 | ) 148 | self._algorithms = params.get("algorithms", self._algorithms) 149 | self._train_ensemble = params.get("train_ensemble", self._train_ensemble) 150 | 
self._stack_models = params.get("stack_models", self._stack_models) 151 | self._eval_metric = params.get("eval_metric", self._eval_metric) 152 | self._validation_strategy = params.get( 153 | "validation_strategy", self._validation_strategy 154 | ) 155 | self._verbose = params.get("verbose", self._verbose) 156 | self._explain_level = params.get("explain_level", self._explain_level) 157 | self._golden_features = params.get("golden_features", self._golden_features) 158 | self._features_selection = params.get( 159 | "features_selection", self._features_selection 160 | ) 161 | self._start_random_models = params.get( 162 | "start_random_models", self._start_random_models 163 | ) 164 | self._hill_climbing_steps = params.get( 165 | "hill_climbing_steps", self._hill_climbing_steps 166 | ) 167 | self._top_models_to_improve = params.get( 168 | "top_models_to_improve", self._top_models_to_improve 169 | ) 170 | self._boost_on_errors = params.get("boost_on_errors", self._boost_on_errors) 171 | self._kmeans_features = params.get("kmeans_features", self._kmeans_features) 172 | self._mix_encoding = params.get("mix_encoding", self._mix_encoding) 173 | self._max_single_prediction_time = params.get( 174 | "max_single_prediction_time", self._max_single_prediction_time 175 | ) 176 | self._n_jobs = params.get("n_jobs", self._n_jobs) 177 | self._random_state = params.get("random_state", self._random_state) 178 | stacked_models = params.get("stacked") 179 | 180 | best_model_name = params.get("best_model") 181 | load_on_predict = params.get("load_on_predict") 182 | self._fit_level = params.get("fit_level") 183 | lazy_load = not ( 184 | self._fit_level is not None and self._fit_level == "finished" 185 | ) 186 | load_models = self._model_subpaths 187 | if load_on_predict is not None and self._fit_level == "finished": 188 | load_models = load_on_predict 189 | # just in case there is check for which models should be loaded 190 | # fix https://github.com/mljar/mljar-supervised/issues/395 191 | 
models_needed = self.models_needed_on_predict(best_model_name) 192 | # join them and return unique list 193 | load_models = list(np.unique(load_models + models_needed)) 194 | 195 | models_map = {} 196 | 197 | for model_subpath in load_models: 198 | if model_subpath.endswith("Ensemble") or model_subpath.endswith( 199 | "Ensemble_Stacked" 200 | ): 201 | ens = Ensemble.load(path, model_subpath, models_map) 202 | self._models += [ens] 203 | models_map[ens.get_name()] = ens 204 | else: 205 | m = ModelFramework.load(path, model_subpath, lazy_load) 206 | self._models += [m] 207 | models_map[m.get_name()] = m 208 | 209 | self._best_model = None 210 | if best_model_name is not None: 211 | self._best_model = models_map.get(best_model_name) 212 | 213 | if stacked_models is not None and ( 214 | self._best_model._is_stacked or self._fit_level != "finished" 215 | ): 216 | self._stacked_models = [] 217 | for stacked_model_name in stacked_models: 218 | self._stacked_models += [models_map[stacked_model_name]] 219 | 220 | data_info_path = os.path.join(path, "data_info.json") 221 | with open(data_info_path, "r") as file: 222 | self._data_info = json.load(file) 223 | self.n_features_in_ = self._data_info["n_features"] 224 | 225 | if "n_classes" in self._data_info: 226 | self.n_classes = self._data_info["n_classes"] 227 | 228 | except Exception as e: 229 | raise AutoMLException(f"Cannot load AutoML directory. 
{str(e)}") 230 | 231 | def get_leaderboard( 232 | self, filter_random_feature=False, original_metric_values=False 233 | ): 234 | ldb = { 235 | "name": [], 236 | "model_type": [], 237 | "metric_type": [], 238 | "metric_value": [], 239 | "train_time": [], 240 | } 241 | if self._max_single_prediction_time is not None: 242 | ldb["single_prediction_time"] = [] 243 | 244 | sensitive_features_names = [] 245 | if self._fairness_metric is not None and len(self._models): 246 | sensitive_features_names = self._models[0].get_sensitive_features_names() 247 | ldb["fairness_metric"] = [] 248 | for sf in sensitive_features_names: 249 | ldb[f"fairness_{sf}"] = [] 250 | ldb["is_fair"] = [] 251 | 252 | for m in self._models: 253 | # filter model with random feature 254 | if filter_random_feature and "RandomFeature" in m.get_name(): 255 | continue 256 | ldb["name"] += [m.get_name()] 257 | ldb["model_type"] += [m.get_type()] 258 | ldb["metric_type"] += [self._eval_metric] 259 | ldb["metric_value"] += [m.get_final_loss()] 260 | ldb["train_time"] += [np.round(m.get_train_time(), 2)] 261 | if self._max_single_prediction_time is not None: 262 | if m._single_prediction_time is not None: 263 | ldb["single_prediction_time"] += [ 264 | np.round(m._single_prediction_time, 4) 265 | ] 266 | else: 267 | ldb["single_prediction_time"] += [None] 268 | if self._fairness_metric is not None: 269 | ldb["fairness_metric"] += [self._fairness_metric] 270 | for sf in sensitive_features_names: 271 | ldb[f"fairness_{sf}"] += [m.get_fairness_metric(sf)] 272 | ldb["is_fair"] += [m.is_fair()] 273 | 274 | ldb = pd.DataFrame(ldb) 275 | # need to add argument for sorting 276 | # minimize_direction = m.get_metric().get_minimize_direction() 277 | # ldb = ldb.sort_values("metric_value", ascending=minimize_direction) 278 | 279 | if original_metric_values: 280 | if Metric.optimize_negative(self._eval_metric): 281 | ldb["metric_value"] *= -1.0 282 | 283 | return ldb 284 | 285 | def keep_model(self, model, model_subpath): 
286 | if model is None: 287 | return 288 | 289 | if self._max_single_prediction_time is not None: 290 | # let's check the prediction time ... 291 | # load 2x because of model reloading during the training 292 | for _ in range(2): 293 | start_time = time.time() 294 | self._base_predict(self._one_sample, model) 295 | model._single_prediction_time = ( 296 | time.time() - start_time 297 | ) # prediction time on single sample 298 | # again release learners from models 299 | if "Ensemble" not in model.get_type(): 300 | model.release_learners() 301 | 302 | self._models += [model] 303 | self._model_subpaths += [model_subpath] 304 | self.select_and_save_best() 305 | 306 | sign = -1.0 if Metric.optimize_negative(self._eval_metric) else 1.0 307 | msg = "{} {} {} trained in {} seconds".format( 308 | model.get_name(), 309 | self._eval_metric, 310 | np.round(sign * model.get_final_loss(), 6), 311 | np.round(model.get_train_time(), 2), 312 | ) 313 | if model._single_prediction_time is not None: 314 | msg += f" (1-sample predict time {np.round(model._single_prediction_time,4)} seconds)" 315 | self.verbose_print(msg) 316 | self._time_ctrl.log_time( 317 | model.get_name(), model.get_type(), self._fit_level, model.get_train_time() 318 | ) 319 | 320 | self.tuner.add_key(model) 321 | 322 | def create_dir(self, model_path): 323 | if not os.path.exists(model_path): 324 | try: 325 | os.mkdir(model_path) 326 | except Exception as e: 327 | raise AutoMLException(f"Cannot create directory {model_path}. {str(e)}") 328 | 329 | def _expected_learners_cnt(self): 330 | try: 331 | repeats = self._validation_strategy.get("repeats", 1) 332 | folds = self._validation_strategy.get("k_folds", 1) 333 | return repeats * folds 334 | except Exception as e: 335 | pass 336 | return 1 337 | 338 | def train_model(self, params): 339 | # do we have enough time to train? 
340 | # if not, skip 341 | if not self._time_ctrl.enough_time( 342 | params["learner"]["model_type"], self._fit_level 343 | ): 344 | logger.info(f"Cannot train {params['name']} because of the time constraint") 345 | return False 346 | # let's create directory to log all training artifacts 347 | results_path, model_subpath = self._results_path, params["name"] 348 | model_path = os.path.join(results_path, model_subpath) 349 | self.create_dir(model_path) 350 | 351 | # prepare callbacks 352 | early_stop = EarlyStopping( 353 | {"metric": {"name": self._eval_metric}, "log_to_dir": model_path} 354 | ) 355 | 356 | # disable for now 357 | max_time_for_learner = 3600 358 | if self._total_time_limit is not None: 359 | k_folds = self._validation_strategy.get("k_folds", 1.0) 360 | at_least_algorithms = 10.0 361 | 362 | max_time_for_learner = max( 363 | self._total_time_limit / k_folds / at_least_algorithms, 60 364 | ) 365 | 366 | params["max_time_for_learner"] = max_time_for_learner 367 | 368 | total_time_constraint = TotalTimeConstraint( 369 | { 370 | "total_time_limit": self._total_time_limit 371 | if self._model_time_limit is None 372 | else None, 373 | "total_time_start": self._start_time, 374 | "expected_learners_cnt": self._expected_learners_cnt(), 375 | } 376 | ) 377 | 378 | # create model framework 379 | mf = ModelFramework( 380 | params, 381 | callbacks=[early_stop, total_time_constraint], 382 | ) 383 | 384 | # start training 385 | logger.info( 386 | f"Train model #{len(self._models)+1} / Model name: {params['name']}" 387 | ) 388 | mf.train(results_path, model_subpath) 389 | 390 | # keep info about the model 391 | self.keep_model(mf, model_subpath) 392 | 393 | # save the model 394 | mf.save(results_path, model_subpath) 395 | 396 | return True 397 | 398 | def verbose_print(self, msg): 399 | if self._verbose > 0: 400 | # self._progress_bar.write(msg) 401 | print(msg) 402 | 403 | def ensemble_step(self, is_stacked=False): 404 | if self._train_ensemble and 
len(self._models) > 1: 405 | ensemble_subpath = "Ensemble_Stacked" if is_stacked else "Ensemble" 406 | ensemble_path = os.path.join(self._results_path, ensemble_subpath) 407 | self.create_dir(ensemble_path) 408 | 409 | self.ensemble = Ensemble( 410 | self._eval_metric, 411 | self._ml_task, 412 | is_stacked=is_stacked, 413 | max_single_prediction_time=self._max_single_prediction_time, 414 | fairness_metric=self._fairness_metric, 415 | fairness_threshold=self._fairness_threshold, 416 | privileged_groups=self._privileged_groups, 417 | underprivileged_groups=self._underprivileged_groups, 418 | ) 419 | ( 420 | oofs, 421 | target, 422 | sample_weight, 423 | sensitive_features, 424 | ) = self.ensemble.get_oof_matrix(self._models) 425 | self.ensemble.fit(oofs, target, sample_weight, sensitive_features) 426 | self.keep_model(self.ensemble, ensemble_subpath) 427 | self.ensemble.save(self._results_path, ensemble_subpath) 428 | return True 429 | return False 430 | 431 | def can_we_stack_them(self, y): 432 | # if multiclass and too many classes then No 433 | return True 434 | 435 | def get_stacked_data(self, X, mode="training"): 436 | # mode can be `training` or `predict` 437 | if self._stacked_models is None: 438 | return X 439 | all_oofs = [] 440 | for m in self._stacked_models: 441 | oof = None 442 | if mode == "training": 443 | oof = m.get_out_of_folds() 444 | else: 445 | oof = m.predict(X) 446 | if self._ml_task == BINARY_CLASSIFICATION: 447 | cols = [f for f in oof.columns if "prediction" in f] 448 | if len(cols) == 2: 449 | oof = pd.DataFrame({"prediction": oof[cols[1]]}) 450 | 451 | cols = [f for f in oof.columns if "prediction" in f] 452 | oof = oof[cols] 453 | oof.columns = [f"{m.get_name()}_{c}" for c in cols] 454 | all_oofs += [oof] 455 | 456 | org_index = X.index.copy() 457 | X.reset_index(drop=True, inplace=True) 458 | X_stacked = pd.concat([X] + all_oofs, axis=1) 459 | 460 | X_stacked.index = org_index.copy() 461 | X.index = org_index.copy() 462 | return 
X_stacked 463 | 464 | def _perform_model_stacking(self): 465 | if self._stacked_models is not None: 466 | return 467 | 468 | ldb = self.get_leaderboard(filter_random_feature=True) 469 | if self._fairness_metric is not None: 470 | # get only fair models if we train with sensitive features 471 | ldb = ldb[ldb["is_fair"]] 472 | ldb = ldb.sort_values(by="metric_value", ascending=True) 473 | models_map = {m.get_name(): m for m in self._models if not m._is_stacked} 474 | self._stacked_models = [] 475 | models_limit = 10 476 | 477 | for model_type in np.unique(ldb.model_type): 478 | if model_type in ["Baseline"]: 479 | continue 480 | ds = ldb[ldb.model_type == model_type].copy() 481 | ds.sort_values(by="metric_value", inplace=True) 482 | 483 | for n in list(ds.name.iloc[:models_limit].values): 484 | self._stacked_models += [models_map[n]] 485 | 486 | scores = [m.get_final_loss() for m in self._stacked_models] 487 | self._stacked_models = [ 488 | self._stacked_models[i] for i in np.argsort(scores).tolist() 489 | ] 490 | 491 | def get_stacking_minimum_time_needed(self): 492 | try: 493 | ldb = self.get_leaderboard(filter_random_feature=True) 494 | ldb = ldb.sort_values(by="metric_value", ascending=True) 495 | return min(2.0 * ldb.iloc[0]["train_time"], 60) 496 | except Exception as e: 497 | return 60 498 | 499 | def prepare_for_stacking(self): 500 | # print("Stacked models ....") 501 | # do we have enough models? 502 | if len(self._models) < 5: 503 | return 504 | # do we have time? 
505 | if self._total_time_limit is not None: 506 | time_left = self._total_time_limit - (time.time() - self._start_time) 507 | # we need some time to start stacking 508 | # it should be at least 60 seconds for larger data 509 | # but for small data it can be less 510 | if time_left < self.get_stacking_minimum_time_needed(): 511 | return 512 | # too many classes and models 513 | if self._ml_task == MULTICLASS_CLASSIFICATION: 514 | if self.n_classes * len(self._models) > 1000: 515 | return 516 | # if we are training with sensitive features 517 | # then we will stack only fair models 518 | # if there are no fair models then we skip this step 519 | if self._fairness_metric is not None: 520 | if not [m for m in self._models if m.is_fair()]: 521 | self.verbose_print("Skip stacking. We can stack only fair models.") 522 | return 523 | 524 | self._perform_model_stacking() 525 | 526 | X_stacked_path = os.path.join(self._results_path, "X_stacked.data") 527 | if os.path.exists(X_stacked_path): 528 | return 529 | 530 | X = load_data(self._X_path) 531 | org_columns = X.columns.tolist() 532 | X_stacked = self.get_stacked_data(X) 533 | new_columns = X_stacked.columns.tolist() 534 | added_columns = [c for c in new_columns if c not in org_columns] 535 | 536 | # save stacked train data 537 | dump_data(X_stacked_path, X_stacked) 538 | 539 | """ 540 | # resue old params 541 | for m in self._stacked_models: 542 | # print(m.get_type()) 543 | # use only Xgboost, LightGBM and CatBoost as stacked models 544 | if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]: 545 | continue 546 | params = copy.deepcopy(m.params) 547 | params["validation"]["X_train_path"] = X_train_stacked_path 548 | params["name"] = params["name"] + "_Stacked" 549 | params["is_stacked"] = True 550 | # print(params) 551 | if "model_architecture_json" in params["learner"]: 552 | # the new model will be created with wider input size 553 | del params["learner"]["model_architecture_json"] 554 | if self._ml_task == 
REGRESSION: 555 | # scale added predictions in regression if the target was scaled (in the case of NN) 556 | target_preprocessing = params["preprocessing"]["target_preprocessing"] 557 | scale = None 558 | if "scale_log_and_normal" in target_preprocessing: 559 | scale = "scale_log_and_normal" 560 | elif "scale_normal" in target_preprocessing: 561 | scale = "scale_normal" 562 | if scale is not None: 563 | for col in added_columns: 564 | params["preprocessing"]["columns_preprocessing"][col] = [ 565 | scale] 566 | self.train_model(params) 567 | """ 568 | 569 | def _save_data(self, X, y, sample_weight=None, cv=None, sensitive_features=None): 570 | # save information about original data 571 | self._save_data_info(X, y, sample_weight, sensitive_features) 572 | 573 | # handle drastic imbalance 574 | # assure at least 20 samples of each class 575 | # for binary and multiclass classification 576 | self._handle_drastic_imbalance(X, y, sample_weight, sensitive_features) 577 | 578 | # prepare path for saving files 579 | self._X_path = os.path.join(self._results_path, "X.data") 580 | self._y_path = os.path.join(self._results_path, "y.data") 581 | self._sample_weight_path = None 582 | if sample_weight is not None: 583 | self._sample_weight_path = os.path.join( 584 | self._results_path, "sample_weight.data" 585 | ) 586 | dump_data( 587 | self._sample_weight_path, pd.DataFrame({"sample_weight": sample_weight}) 588 | ) 589 | self._sensitive_features_path = None 590 | if sensitive_features is not None: 591 | self._sensitive_features_path = os.path.join( 592 | self._results_path, "sensitive_features.data" 593 | ) 594 | dump_data(self._sensitive_features_path, sensitive_features) 595 | 596 | dump_data(self._X_path, X) 597 | 598 | if self._ml_task == MULTICLASS_CLASSIFICATION: 599 | y = y.astype(str) 600 | 601 | dump_data(self._y_path, pd.DataFrame({"target": y})) 602 | 603 | # set paths in validation parameters 604 | self._validation_strategy["X_path"] = self._X_path 605 | 
self._validation_strategy["y_path"] = self._y_path 606 | self._validation_strategy["results_path"] = self._results_path 607 | if sample_weight is not None: 608 | self._validation_strategy["sample_weight_path"] = self._sample_weight_path 609 | if sensitive_features is not None: 610 | self._validation_strategy[ 611 | "sensitive_features_path" 612 | ] = self._sensitive_features_path 613 | 614 | if cv is not None: 615 | self._validation_strategy["cv_path"] = os.path.join( 616 | self._results_path, "cv.data" 617 | ) 618 | joblib.dump(cv, self._validation_strategy["cv_path"]) 619 | 620 | if self._max_single_prediction_time is not None: 621 | self._one_sample = X.iloc[:1].copy(deep=True) 622 | 623 | def _handle_drastic_imbalance( 624 | self, X, y, sample_weight=None, sensitive_features=None 625 | ): 626 | if self._ml_task == REGRESSION: 627 | return 628 | classes, cnts = np.unique(y, return_counts=True) 629 | min_samples_per_class = 20 630 | if self._validation_strategy is not None: 631 | min_samples_per_class = max( 632 | min_samples_per_class, self._validation_strategy.get("k_folds", 0) 633 | ) 634 | for i in range(len(classes)): 635 | if cnts[i] < min_samples_per_class: 636 | append_samples = min_samples_per_class - cnts[i] 637 | new_X = ( 638 | X[y == classes[i]] 639 | .sample(n=append_samples, replace=True, random_state=1) 640 | .reset_index(drop=True) 641 | ) 642 | if sample_weight is not None: 643 | new_sample_weight = ( 644 | sample_weight[y == classes[i]] 645 | .sample(n=append_samples, replace=True, random_state=1) 646 | .reset_index(drop=True) 647 | ) 648 | if sensitive_features is not None: 649 | new_sensitive_features = ( 650 | sensitive_features[y == classes[i]] 651 | .sample(n=append_samples, replace=True, random_state=1) 652 | .reset_index(drop=True) 653 | ) 654 | for j in range(new_X.shape[0]): 655 | X.loc[X.shape[0]] = new_X.loc[j] 656 | y.loc[y.shape[0]] = classes[i] 657 | if sample_weight is not None: 658 | sample_weight.loc[ 659 | 
sample_weight.shape[0] 660 | ] = new_sample_weight.loc[j] 661 | if sensitive_features is not None: 662 | sensitive_features.loc[ 663 | sensitive_features.shape[0] 664 | ] = new_sensitive_features.loc[j] 665 | 666 | def _save_data_info(self, X, y, sample_weight=None, sensitive_features=None): 667 | target_is_numeric = pd.api.types.is_numeric_dtype(y) 668 | if self._ml_task == MULTICLASS_CLASSIFICATION: 669 | y = y.astype(str) 670 | 671 | columns_and_target_info = DataInfo.compute(X, y, self._ml_task) 672 | 673 | self.n_features_in_ = X.shape[1] 674 | self.n_classes = len(np.unique(y[~pd.isnull(y)])) 675 | 676 | self._data_info = { 677 | "columns": X.columns.tolist(), 678 | "rows": y.shape[0], 679 | "cols": X.shape[1], 680 | "target_is_numeric": target_is_numeric, 681 | "columns_info": columns_and_target_info["columns_info"], 682 | "target_info": columns_and_target_info["target_info"], 683 | "n_features": self.n_features_in_, 684 | "is_sample_weighted": sample_weight is not None, 685 | "is_fairness_applied": sensitive_features is not None, 686 | } 687 | # Add n_classes if not regression 688 | if self._ml_task != REGRESSION: 689 | self._data_info["n_classes"] = self.n_classes 690 | 691 | if columns_and_target_info.get("num_class") is not None: 692 | self._data_info["num_class"] = columns_and_target_info["num_class"] 693 | data_info_path = os.path.join(self._results_path, "data_info.json") 694 | with open(data_info_path, "w") as fout: 695 | fout.write(json.dumps(self._data_info, indent=4, cls=MLJSONEncoder)) 696 | 697 | def save_progress(self, step=None, generated_params=None): 698 | if step is not None and generated_params is not None: 699 | self._all_params[step] = generated_params 700 | 701 | state = {} 702 | 703 | state["fit_level"] = self._fit_level 704 | state["time_controller"] = self._time_ctrl.to_json() 705 | state["all_params"] = self._all_params 706 | state["adjust_validation"] = self._adjust_validation 707 | 708 | fname = os.path.join(self._results_path, 
"progress.json") 709 | with open(fname, "w") as fout: 710 | fout.write(json.dumps(state, indent=4, cls=MLJSONEncoder)) 711 | 712 | def load_progress(self): 713 | state = {} 714 | fname = os.path.join(self._results_path, "progress.json") 715 | if not os.path.exists(fname): 716 | return 717 | with open(fname, "r") as file: 718 | state = json.load(file) 719 | self._fit_level = state.get("fit_level", self._fit_level) 720 | self._all_params = state.get("all_params", self._all_params) 721 | self._time_ctrl = TimeController.from_json(state.get("time_controller")) 722 | self._adjust_validation = state.get("adjust_validation", False) 723 | 724 | def _validate_X_predict(self, X): 725 | """Validate X whenever one tries to predict, apply, predict_proba""" 726 | # X = check_array(X, ensure_2d=False) 727 | X = np.atleast_2d(X) 728 | n_features = X.shape[1] 729 | if self.n_features_in_ != n_features: 730 | raise ValueError( 731 | f"Number of features of the model must match the input. Model n_features_in_ is {self.n_features_in_} and input n_features is {n_features}. Reshape your data." 732 | ) 733 | 734 | # This method builds pandas.Dataframe from input. The input can be numpy.ndarray, matrix, or pandas.Dataframe 735 | # This method is used to build dataframes in `fit()` and in `predict`. 
# Builds pandas objects from the input, which can be a numpy.ndarray, a matrix
# or a pandas.DataFrame. Used both in `fit()` and in `predict()`; that is the
# reason y can be None (`predict()` only passes X).
def _build_dataframe(self, X, y=None, sample_weight=None, sensitive_features=None):
    """Convert the inputs to pandas objects with a clean 0..n-1 index.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Features. Non-DataFrame input is validated with ``check_array``,
        forced to 2-D and given columns ``feature_1 .. feature_k``. A
        DataFrame is modified in place: its column names are cast to ``str``
        and its index is reset.
    y : numpy.ndarray, pandas.DataFrame or other, optional
        Target. Arrays and the first column of a DataFrame are validated and
        wrapped in a Series named ``"target"``; other types pass through.
    sample_weight : same kinds as ``y``, optional
    sensitive_features : numpy.ndarray, pandas.Series or DataFrame, optional
        A 1-D array is treated as a single sensitive column.

    Returns
    -------
    ``X`` alone when ``y`` is None, otherwise the tuple
    ``(X, y, sample_weight, sensitive_features)`` after rows with a missing
    target were removed.

    Raises
    ------
    AutoMLException
        If ``X`` is None or has no rows.
    """
    if X is None or X.shape[0] == 0:
        raise AutoMLException("Empty input dataset")

    if not isinstance(X, pd.DataFrame):
        # Non-pandas input goes through scikit-learn validation first.
        X = check_array(X, ensure_2d=False, ensure_all_finite=False)
        X = np.atleast_2d(X)
        X = pd.DataFrame(
            X, columns=[f"feature_{i}" for i in range(1, X.shape[1] + 1)]
        )
    # Column names are always strings.
    X.columns = X.columns.astype(str)
    X.reset_index(drop=True, inplace=True)

    if y is None:
        return X

    def as_target(values):
        # Categorical targets must be validated as strings; the default
        # numeric validation rejects them.
        kind = "str" if PreprocessingUtils.is_categorical(values) else "numeric"
        checked = check_array(values, ensure_2d=False, dtype=kind)
        return pd.Series(np.array(checked), name="target")

    def as_weights(values):
        checked = check_array(values, ensure_2d=False)
        return pd.Series(np.array(checked), name="sample_weight")

    if isinstance(y, np.ndarray):
        y = as_target(y)
    elif isinstance(y, pd.DataFrame):
        # Only the first column is the target. It gets the same dtype
        # handling as a plain array, so string labels are accepted too.
        y = as_target(np.array(y.iloc[:, 0]))

    if sample_weight is not None:
        if isinstance(sample_weight, np.ndarray):
            sample_weight = as_weights(sample_weight)
        elif isinstance(sample_weight, pd.DataFrame):
            sample_weight = as_weights(np.array(sample_weight.iloc[:, 0]))

    if sensitive_features is not None:
        if isinstance(sensitive_features, np.ndarray):
            sensitive_features = check_array(sensitive_features, ensure_2d=False)
            if sensitive_features.ndim == 1:
                # One value per sample -> a single sensitive column.
                sensitive_features = sensitive_features.reshape(-1, 1)
            sensitive_features = pd.DataFrame(
                sensitive_features,
                columns=[
                    f"sensitive_{i}"
                    for i in range(1, sensitive_features.shape[1] + 1)
                ],
            )
        elif isinstance(sensitive_features, pd.Series):
            sensitive_features = pd.DataFrame(sensitive_features)

    X, y, sample_weight, sensitive_features = ExcludeRowsMissingTarget.transform(
        X, y, sample_weight, sensitive_features, warn=True
    )

    # Rows may have been dropped above, so re-index everything.
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)
    if sample_weight is not None:
        sample_weight.reset_index(drop=True, inplace=True)

    if sensitive_features is not None:
        sensitive_features.reset_index(drop=True, inplace=True)

        for col in sensitive_features.columns:
            if sensitive_features[col].dtype.name not in ["category", "object"]:
                # Non-categorical sensitive features are split at the median
                # into two groups.
                self.verbose_print("Sensitive features should be categorical")
                self.verbose_print(
                    f"Apply automatic binarization for feature {col}"
                )
                sensitive_features[col] = pd.qcut(
                    sensitive_features[col], q=2
                ).astype(str)
                self.verbose_print(
                    f"New values {list(sensitive_features[col].unique())} for feature {col} are applied"
                )

    return X, y, sample_weight, sensitive_features
def _apply_constraints(self):
    """Drop algorithms and simplify validation for configurations that would
    be too slow or unsupported.

    Works on ``self._algorithms`` in place and may replace
    ``self._validation_strategy``; sets ``self._adjust_validation`` when the
    final validation type is to be decided after the first model.
    """
    algos = self._algorithms

    def discard_if_auto(*names):
        # Algorithms chosen explicitly by the user are never removed here.
        if self.algorithms != "auto":
            return
        for name in names:
            if name in algos:
                algos.remove(name)

    if self._n_jobs != -1 and "Neural Network" in algos:
        algos.remove("Neural Network")
        self.verbose_print(
            "Neural Network algorithm was disabled because it doesn't support n_jobs parameter."
        )

    too_big_for_linear = self.n_rows_in_ >= 10000 or self.n_features_in_ >= 1000
    if "Linear" in algos and too_big_for_linear:
        algos.remove("Linear")
        self.verbose_print("Linear algorithm was disabled.")

    # remove algorithms in the case of multiclass
    # and too many classes and columns
    if self._ml_task == MULTICLASS_CLASSIFICATION:
        if self.n_classes >= 10 and self.n_features_in_ * self.n_classes > 500:
            discard_if_auto("CatBoost")

        if self.n_features_in_ * self.n_classes > 1000:
            discard_if_auto("Xgboost", "CatBoost")
            if self.validation_strategy == "auto":
                self._validation_strategy = self._fastest_validation()

        if self.n_features_in_ * self.n_classes > 10000:
            discard_if_auto("Random Forest", "Extra Trees")

    # Adjust the validation type based on speed of Decision Tree learning.
    # Split is already the fastest validation type, nothing to adjust then.
    postpone_choice = (
        self._get_mode() == "Compete"
        and self._total_time_limit is not None
        and self.validation_strategy == "auto"
        and self._validation_strategy["validation_type"] != "split"
    )
    if postpone_choice:
        # the validation will be adjusted after first Decision Tree learning
        # on train/test split (1-fold)
        self._adjust_validation = True
        self._validation_strategy = self._fastest_validation()


def _fastest_validation(self):
    """Return a 90/10 shuffled split strategy, stratified unless regression."""
    strategy = dict(validation_type="split", train_ratio=0.9, shuffle=True)
    if self._get_ml_task() != REGRESSION:
        strategy["stratify"] = True
    return strategy
REGRESSION: 879 | strategy["stratify"] = True 880 | return strategy 881 | 882 | def _set_adjusted_validation(self): 883 | if self._validation_strategy["validation_type"] != "split": 884 | return 885 | train_time = self._models[-1].get_train_time() 886 | # the time of Decision Tree training multiply by 5.0 887 | # to get the rough estimation how much time is needed for 888 | # other algorithms 889 | one_fold_time = train_time * 5.0 890 | # it will be good to train at least 10 models 891 | min_model_cnt = 10.0 892 | # the number of folds we can afford during the training 893 | folds_cnt = np.round(self._total_time_limit / one_fold_time / min_model_cnt) 894 | 895 | # adjust the validation if possible 896 | if folds_cnt >= 5.0: 897 | self.verbose_print(f"Adjust validation. Remove: {self._model_subpaths[0]}") 898 | k_folds = 5 899 | if folds_cnt >= 15: 900 | k_folds = 10 901 | # too small dataset for stacking 902 | if self.n_rows_in_ < 500: 903 | self._stack_models = False 904 | self.verbose_print( 905 | "*** Disable stacking for small dataset (nrows < 500)" 906 | ) 907 | 908 | self._validation_strategy["validation_type"] = "kfold" 909 | del self._validation_strategy["train_ratio"] 910 | self._validation_strategy["k_folds"] = k_folds 911 | self.tuner._validation_strategy = self._validation_strategy 912 | shutil.rmtree( 913 | os.path.join(self._results_path, self._model_subpaths[0]), 914 | ignore_errors=True, 915 | ) 916 | del self._models[0] 917 | del self._model_subpaths[0] 918 | del self.tuner._unique_params_keys[0] 919 | self._adjust_validation = False 920 | cv = [] 921 | if self._validation_strategy.get("shuffle", False): 922 | cv += ["Shuffle"] 923 | if self._validation_strategy.get("stratify", False): 924 | cv += ["Stratify"] 925 | self.select_and_save_best() # save validation strategy 926 | 927 | self.verbose_print(f"Validation strategy: {k_folds}-fold CV {','.join(cv)}") 928 | else: 929 | # cant stack models for train/test split 930 | self._stack_models = False 
931 | self.verbose_print("Disable stacking for split validation") 932 | 933 | self._apply_constraints_stack_models() 934 | 935 | def _apply_constraints_stack_models(self): 936 | if self._validation_strategy["validation_type"] == "split": 937 | if self._stack_models: 938 | self.verbose_print("Disable stacking for split validation") 939 | self._stack_models = False 940 | self._boost_on_errors = False 941 | if "repeats" in self._validation_strategy: 942 | if self._stack_models: 943 | self.verbose_print("Disable stacking for repeated validation") 944 | self._stack_models = False 945 | self._boost_on_errors = False 946 | 947 | # update Tuner 948 | if self.tuner is not None: 949 | self.tuner._stack_models = self._stack_models 950 | self.tuner._boost_on_errors = self._boost_on_errors 951 | 952 | # update Time Controler 953 | if self._time_ctrl is not None: 954 | self._time_ctrl._is_stacking = self._stack_models 955 | 956 | if "stack" in self._time_ctrl._steps and not self._stack_models: 957 | self._time_ctrl._steps.remove("stack") 958 | if ( 959 | "boost_on_errors" in self._time_ctrl._steps 960 | and not self._boost_on_errors 961 | ): 962 | self._time_ctrl._steps.remove("boost_on_errors") 963 | 964 | def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None): 965 | """Fits the AutoML model with data""" 966 | if self._fit_level == "finished": 967 | print( 968 | "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'." 
def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None):
    """Fits the AutoML model with data

    Builds the input frames, resolves every constructor parameter into its
    underscore-prefixed working attribute, restores saved progress (if any),
    then runs the tuner's steps one by one, training the generated models
    and saving progress after each of them.

    Returns ``self`` after a completed fit, or ``None`` when the model was
    already fitted. Raises ``AutoMLException`` when no model could be
    produced.
    """
    already_fitted_msg = "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."

    if self._fit_level == "finished":
        print(already_fitted_msg)
        return

    # Validate input and build dataframes
    X, y, sample_weight, sensitive_features = self._build_dataframe(
        X, y, sample_weight, sensitive_features
    )

    self.n_rows_in_ = X.shape[0]
    self.n_features_in_ = X.shape[1]
    self.n_classes = len(np.unique(y[~pd.isnull(y)]))

    # Get attributes (__init__ params)
    self._mode = self._get_mode()
    self._ml_task = self._get_ml_task()
    self._results_path = self._get_results_path()
    self._total_time_limit = self._get_total_time_limit()
    self._model_time_limit = self._get_model_time_limit()
    self._algorithms = self._get_algorithms()
    self._train_ensemble = self._get_train_ensemble()
    self._stack_models = self._get_stack_models()
    self._eval_metric = self._get_eval_metric()
    self._validation_strategy = self._get_validation_strategy()
    self._verbose = self._get_verbose()
    self._explain_level = self._get_explain_level()
    self._golden_features = self._get_golden_features()
    self._features_selection = self._get_features_selection()
    self._start_random_models = self._get_start_random_models()
    self._hill_climbing_steps = self._get_hill_climbing_steps()
    self._top_models_to_improve = self._get_top_models_to_improve()
    self._boost_on_errors = self._get_boost_on_errors()
    self._kmeans_features = self._get_kmeans_features()
    self._mix_encoding = self._get_mix_encoding()
    self._max_single_prediction_time = self._get_max_single_prediction_time()
    self._optuna_time_budget = self._get_optuna_time_budget()
    self._optuna_init_params = self._get_optuna_init_params()
    self._optuna_verbose = self._get_optuna_verbose()
    self._n_jobs = self._get_n_jobs()
    self._random_state = self._get_random_state()

    # NOTE(review): the fairness attributes are only refreshed here when
    # sensitive features are given but are read unconditionally below;
    # presumably they get defaults elsewhere in the class - confirm.
    if sensitive_features is not None:
        self._fairness_metric = self._get_fairness_metric()
        self._fairness_threshold = self._get_fairness_threshold()
        self._privileged_groups = self._get_privileged_groups()
        self._underprivileged_groups = self._get_underprivileged_groups()

    self._adjust_validation = False
    self._apply_constraints()
    if not self._adjust_validation:
        # if there is no validation adjustment
        # then we can apply stack_models constraints immediately
        # if there is validation adjustment
        # then we will apply constraints after the adjustment
        self._apply_constraints_stack_models()

    self.load_progress()
    if self._fit_level == "finished":
        print(already_fitted_msg)
        return
    self._check_can_load()

    self.verbose_print(f"AutoML directory: {self._results_path}")
    if self._mode == "Optuna":
        self.verbose_print("Expected computing time:")
        self.verbose_print(
            f"Time for tuning with Optuna: len(algorithms) * optuna_time_budget = {int(len(self._algorithms) * self._optuna_time_budget)} seconds"
        )
        self.verbose_print(
            "There is no time limit for ML model training after Optuna tuning (total_time_limit parameter is ignored)."
        )

    self.verbose_print(
        f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
    )
    self.verbose_print(f"AutoML will use algorithms: {self._algorithms}")
    if self._stack_models:
        self.verbose_print("AutoML will stack models")
    if self._train_ensemble:
        self.verbose_print("AutoML will ensemble available models")

    # When resuming, shift the start so already spent time still counts.
    self._start_time = time.time()
    if self._time_ctrl is not None:
        self._start_time -= self._time_ctrl.already_spend()

    # Automatic Exploratory Data Analysis is disabled (not supported);
    # pandas_profiling or Sweetviz are the recommended alternatives.

    # Save data
    self._save_data(
        X.copy(deep=False),
        y.copy(deep=False),
        None if sample_weight is None else sample_weight.copy(deep=False),
        cv,
        None if sensitive_features is None else sensitive_features.copy(deep=False),
    )

    tuner = MljarTuner(
        self._get_tuner_params(
            self._start_random_models,
            self._hill_climbing_steps,
            self._top_models_to_improve,
        ),
        self._algorithms,
        self._ml_task,
        self._eval_metric,
        self._validation_strategy,
        self._explain_level,
        self._data_info,
        self._golden_features,
        self._features_selection,
        self._train_ensemble,
        self._stack_models,
        self._adjust_validation,
        self._boost_on_errors,
        self._kmeans_features,
        self._mix_encoding,
        self._optuna_time_budget,
        self._optuna_init_params,
        self._optuna_verbose,
        self._n_jobs,
        self._random_state,
        self._fairness_metric,
        self._fairness_threshold,
        self._privileged_groups,
        self._underprivileged_groups,
    )
    self.tuner = tuner

    steps = tuner.steps()
    self.verbose_print(
        f'AutoML steps: {[s for s in steps if "update_" not in s]}'
    )
    if self._time_ctrl is None:
        self._time_ctrl = TimeController(
            self._start_time,
            self._total_time_limit,
            self._model_time_limit,
            steps,
            self._algorithms,
        )

    self._time_ctrl.log_time(
        "prepare_data",
        "prepare_data",
        "prepare_data",
        time.time() - self._start_time,
    )

    for step in steps:
        self._fit_level = step

        if step in ["stack", "ensemble_stacked"] and not self._stack_models:
            continue

        if step == "stack":
            self.prepare_for_stacking()
        if "hill_climbing" in step or step in ["ensemble", "stack"]:
            if len(self._models) == 0:
                raise AutoMLException(
                    "No models produced. \nPlease check your data or"
                    " submit a Github issue at https://github.com/mljar/mljar-supervised/issues/new."
                )

        # Parameters restored from a previous run take precedence.
        if step in self._all_params:
            generated_params = self._all_params[step]
        else:
            generated_params = tuner.generate_params(
                step,
                self._models,
                self._results_path,
                self._stacked_models,
                self._total_time_limit,
            )

        if not generated_params:
            if "_update_" not in step:
                self.verbose_print(
                    f"Skip {step} because no parameters were generated."
                )
            continue

        if not self._time_ctrl.enough_time_for_step(self._fit_level):
            self.verbose_print(f"Skip {step} because of the time limit.")
            continue
        model_str = "models" if len(generated_params) > 1 else "model"
        self.verbose_print(
            f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
        )

        for params in generated_params:
            if params.get("status", "") in ["trained", "skipped", "error"]:
                self.verbose_print(f"{params['name']}: {params['status']}.")
                continue

            try:
                if "ensemble" in step:
                    trained = self.ensemble_step(is_stacked=params["is_stacked"])
                else:
                    trained = self.train_model(params)
                params["status"] = "trained" if trained else "skipped"
                params["final_loss"] = self._models[-1].get_final_loss()
                params["train_time"] = self._models[-1].get_train_time()

                if (
                    self._adjust_validation
                    and len(self._models) == 1
                    and step == "adjust_validation"
                ):
                    self._set_adjusted_validation()

            except NotTrainedException as e:
                params["status"] = "error"
                self.verbose_print(params.get("name") + " not trained. " + str(e))
            except Exception as e:
                # A single failing model must not stop the whole run; the
                # details go to the errors report instead.
                import traceback

                self._update_errors_report(
                    params.get("name"), str(e) + "\n" + traceback.format_exc()
                )
                params["status"] = "error"

            self.save_progress(step, generated_params)

    if not self._models:
        raise AutoMLException("No models produced.")
    self._fit_level = "finished"
    self.save_progress()
    self.select_and_save_best(show_warnings=True)

    self.verbose_print(
        f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
    )
    self.verbose_print(f"AutoML best model: {self._best_model.get_name()}")

    if self._fairness_metric is not None:
        # check if we have fair model
        if not any(m.is_fair() for m in self._models):
            self.verbose_print(
                "AutoML can't construct model that meets your fairness criteria."
            )
            self.verbose_print("What you can do?")
            self.verbose_print(
                "1. Please include more samples that are not biased."
            )
            self.verbose_print(
                "2. Please examine the most unfairly treated samples."
            )
            self.verbose_print("3. Please change fairness threshold.")

    return self
Please change fairness threshold.") 1240 | 1241 | except Exception as e: 1242 | raise e 1243 | 1244 | return self 1245 | 1246 | def _update_errors_report(self, model_name, error_msg): 1247 | """Append error message to errors.md file.""" 1248 | errors_filename = os.path.join(self._get_results_path(), "errors.md") 1249 | with open(errors_filename, "a") as fout: 1250 | self.verbose_print(f"There was an error during {model_name} training.") 1251 | self.verbose_print(f"Please check {errors_filename} for details.") 1252 | fout.write(f"## Error for {model_name}\n\n") 1253 | fout.write(error_msg) 1254 | link = "https://github.com/mljar/mljar-supervised/issues/new" 1255 | fout.write( 1256 | f"\n\nPlease set a GitHub issue with above error message at: {link}" 1257 | ) 1258 | fout.write("\n\n") 1259 | 1260 | def select_and_save_best(self, show_warnings=False): 1261 | # Select best model based on the lowest loss 1262 | self._best_model = None 1263 | 1264 | if self._models: 1265 | if self._fairness_metric is not None: 1266 | models = [ 1267 | m 1268 | for m in self._models 1269 | if m.is_valid() 1270 | # and m.is_fast_enough(self._max_single_prediction_time) 1271 | and m.is_fair() 1272 | ] 1273 | 1274 | if models: 1275 | # if there are fair models, we select the one with best performance 1276 | self._best_model = min( 1277 | models, 1278 | key=lambda x: x.get_final_loss(), 1279 | ) 1280 | else: 1281 | # if no models are fair, we select the most fair model 1282 | if "ratio" in self._fairness_metric.lower(): 1283 | self._best_model = max( 1284 | [m for m in self._models if m.is_valid()], 1285 | key=lambda x: x.get_best_fairness(), 1286 | ) 1287 | else: 1288 | self._best_model = min( 1289 | [m for m in self._models if m.is_valid()], 1290 | key=lambda x: x.get_best_fairness(), 1291 | ) 1292 | 1293 | else: 1294 | model_list = [ 1295 | m 1296 | for m in self._models 1297 | if m.is_valid() 1298 | and m.is_fast_enough(self._max_single_prediction_time) 1299 | ] 1300 | if model_list: 
def select_and_save_best(self, show_warnings=False):
    """Pick the best model and write params.json, leaderboard.csv, README.md.

    Selection rules:
      * with a fairness metric: the lowest-loss model among valid, fair
        models; if none is fair, the valid model with the best fairness
        score (highest for "ratio" metrics, lowest otherwise);
      * without one: the lowest-loss model among valid models that are fast
        enough for ``_max_single_prediction_time``; if none qualifies and a
        time limit is set, the limit is ignored (a warning is printed when
        ``show_warnings`` is true).

    ``_best_model`` stays ``None`` when there is no valid model at all; the
    files are still written in that case.
    """
    self._best_model = None
    valid_models = [m for m in self._models if m.is_valid()]

    def final_loss(model):
        return model.get_final_loss()

    def fairness(model):
        return model.get_best_fairness()

    if self._models:
        if self._fairness_metric is not None:
            # NOTE: the prediction-time filter is not applied to fair models.
            fair_models = [m for m in valid_models if m.is_fair()]
            if fair_models:
                # if there are fair models, we select the one with best performance
                self._best_model = min(fair_models, key=final_loss)
            elif "ratio" in self._fairness_metric.lower():
                # if no models are fair, we select the most fair model
                self._best_model = max(valid_models, key=fairness, default=None)
            else:
                self._best_model = min(valid_models, key=fairness, default=None)
        else:
            fast_models = [
                m
                for m in valid_models
                if m.is_fast_enough(self._max_single_prediction_time)
            ]
            self._best_model = min(fast_models, key=final_loss, default=None)

    # if none selected please select again and warn the user
    if (
        self._models
        and self._best_model is None
        and self._max_single_prediction_time is not None
    ):
        if show_warnings:
            msg = (
                "*" * 64
                + "\nThere were no model with prediction time smaller than the limit.\n"
                + "Please increase the prediction time for single sample,\n"
                + "or please to use train/test split for validation\n"
                + "*" * 64
            )
            self.verbose_print(msg)

        self._best_model = min(valid_models, key=final_loss, default=None)

    with open(os.path.join(self._results_path, "params.json"), "w") as fout:
        params = {
            "mode": self._mode,
            "ml_task": self._ml_task,
            "results_path": self._results_path,
            "total_time_limit": self._total_time_limit,
            "model_time_limit": self._model_time_limit,
            "algorithms": self._algorithms,
            "train_ensemble": self._train_ensemble,
            "stack_models": self._stack_models,
            "eval_metric": self._eval_metric,
            "validation_strategy": self._validation_strategy,
            "verbose": self._verbose,
            "explain_level": self._explain_level,
            "golden_features": self._golden_features,
            "features_selection": self._features_selection,
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
            "boost_on_errors": self._boost_on_errors,
            "kmeans_features": self._kmeans_features,
            "mix_encoding": self._mix_encoding,
            "max_single_prediction_time": self._max_single_prediction_time,
            "n_jobs": self._n_jobs,
            "random_state": self._random_state,
            "saved": self._model_subpaths,
            "fit_level": self._fit_level,
        }
        if self._best_model is not None:
            params["best_model"] = self._best_model.get_name()
            load_on_predict = []
            load_on_predict += self._best_model.involved_model_names()
            if self._best_model._is_stacked and self._stacked_models is not None:
                for m in self._stacked_models:
                    load_on_predict += m.involved_model_names()
            params["load_on_predict"] = list(np.unique(load_on_predict))

        if self._stacked_models is not None:
            params["stacked"] = [m.get_name() for m in self._stacked_models]
        fout.write(json.dumps(params, indent=4, cls=MLJSONEncoder))

    if self._models:
        ldb = self.get_leaderboard(original_metric_values=True)
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)

        # save report
        ldb.insert(loc=0, column="Best model", value="")
        if self._best_model is not None:
            ldb.loc[
                ldb.name == self._best_model.get_name(), "Best model"
            ] = "**the best**"
        ldb["name"] = [f"[{m}]({m}/README.md)" for m in ldb["name"].values]

        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write("# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
            LeaderboardPlots.compute(
                ldb, self._results_path, fout, self._fairness_threshold
            )

            if self._fit_level == "finished":
                AutoMLPlots.add(self._results_path, self._models, fout)


def get_ensemble_models(self, ensemble_name="Ensemble"):
    """Return the names of the models selected by a saved ensemble.

    Reads ``<results_path>/<ensemble_name>/ensemble.json``. A missing,
    unreadable or malformed file yields an empty list (best effort).
    """
    ensemble_file = os.path.join(self.results_path, ensemble_name, "ensemble.json")
    try:
        with open(ensemble_file) as file:
            params = json.load(file)
        return [m["model"] for m in params["selected_models"]]
    except (OSError, ValueError, KeyError, TypeError):
        # ValueError also covers json.JSONDecodeError.
        return []
def models_needed_on_predict(self, required_model_name):
    """List the saved models that must be loaded to predict with one model.

    Reads ``params.json`` from ``results_path``. A plain model needs only
    itself; ``"Ensemble"`` needs its members and itself; anything stacked
    additionally needs the stacked base models, and ``"Stacked_Ensemble"``
    also needs its own members. Raises ``AutoMLException`` for a name that
    is not among the saved models.
    """
    with open(os.path.join(self.results_path, "params.json")) as file:
        saved_params = json.load(file)
    saved = saved_params.get("saved", [])
    stacked = saved_params.get("stacked", [])

    if required_model_name not in saved:
        raise AutoMLException(
            f"Can't load model {required_model_name}. Please check if the model's name is correct."
        )

    is_single = not (
        "Stacked" in required_model_name or "Ensemble" in required_model_name
    )
    if is_single:
        return [required_model_name]

    members = self.get_ensemble_models("Ensemble")
    if required_model_name == "Ensemble":
        # ensemble of single models
        return members + [required_model_name]

    # everything below works on stacked data
    needed = members + ["Ensemble"] + stacked
    if required_model_name == "Stacked_Ensemble":
        needed = needed + self.get_ensemble_models("Stacked_Ensemble")
    needed = needed + [required_model_name]
    return list(np.unique(needed))
def _base_predict(self, X, model=None):
    """Predict with ``model`` (default: the best model) and add labels.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Input features; must contain every training column.
    model : optional
        Model to use. When omitted the best model is used and, if needed,
        loaded from ``results_path`` first.

    Returns
    -------
    The frame returned by the model's ``predict``. For binary
    classification a ``label`` column is added by thresholding the second
    column; for multiclass the existing ``label`` column is cast back to a
    number when the training target was numeric.

    Raises
    ------
    AutoMLException
        If no model is available or a training column is missing in ``X``.
    """
    if model is None:
        if self._best_model is None:
            self.load(self.results_path)
        model = self._best_model

    if model is None:
        raise AutoMLException(
            "This model has not been fitted yet. Please call `fit()` first."
        )

    X = self._build_dataframe(X)
    if not isinstance(X.columns[0], str):
        X.columns = [str(c) for c in X.columns]

    available = set(X.columns)
    for column in self._data_info["columns"]:
        if column not in available:
            raise AutoMLException(
                f"Missing column: {column} in input data. Cannot predict"
            )

    # Keep only the training columns, in training order.
    X = X[self._data_info["columns"]]
    self._validate_X_predict(X)

    if model._is_stacked:
        self._perform_model_stacking()
        X_stacked = self.get_stacked_data(X, mode="predict")

        if model.get_type() == "Ensemble":
            # Ensemble is using both original and stacked data
            predictions = model.predict(X, X_stacked)
        else:
            predictions = model.predict(X_stacked)
    else:
        predictions = model.predict(X)

    def as_number(label):
        # Labels arrive as text taken from column names; "1.0" is not a
        # valid int literal, so fall back to float like the multiclass path.
        try:
            return int(label)
        except ValueError:
            return float(label)

    if self._ml_task == BINARY_CLASSIFICATION:
        # need to predict the label based on predictions and threshold;
        # the first 11 characters of each column name are cut off as a
        # prefix (presumably "prediction_" - confirm against the model code)
        neg_label, pos_label = (
            predictions.columns[0][11:],
            predictions.columns[1][11:],
        )

        if neg_label == "0" and pos_label == "1":
            neg_label, pos_label = 0, 1
        if self._data_info.get("target_is_numeric", False):
            neg_label = as_number(neg_label)
            pos_label = as_number(pos_label)
        # assume that it is binary classification
        predictions["label"] = predictions.iloc[:, 1] > model._threshold
        predictions["label"] = predictions["label"].map(
            {True: pos_label, False: neg_label}
        )
        return predictions

    if self._ml_task == MULTICLASS_CLASSIFICATION:
        if self._data_info.get("target_is_numeric", False):
            try:
                predictions["label"] = predictions["label"].astype(int)
            except (ValueError, TypeError):
                predictions["label"] = predictions["label"].astype(float)
        return predictions

    # Regression
    return predictions
1514 | if self._ml_task != REGRESSION 1515 | else predictions["prediction"].to_numpy() 1516 | ) 1517 | 1518 | def _predict_proba(self, X): 1519 | # Check is task type is correct 1520 | if self._ml_task == REGRESSION: 1521 | raise AutoMLException( 1522 | f"Method `predict_proba()` can only be used when in classification tasks. Current task: '{self._ml_task}'." 1523 | ) 1524 | 1525 | # Make and return predictions 1526 | # If classification task the result is in column 'label' 1527 | # Need to drop `label` column. 1528 | return self._base_predict(X).drop(["label"], axis=1).to_numpy() 1529 | 1530 | def _predict_all(self, X): 1531 | # Make and return predictions 1532 | return self._base_predict(X) 1533 | 1534 | def _score(self, X, y=None, sample_weight=None): 1535 | # y default must be None for scikit-learn compatibility 1536 | 1537 | # Check if y is None 1538 | if y is None: 1539 | raise AutoMLException("y must be specified.") 1540 | 1541 | predictions = self._predict(X) 1542 | return ( 1543 | r2_score(y, predictions, sample_weight=sample_weight) 1544 | if self._ml_task == REGRESSION 1545 | else accuracy_score(y, predictions, sample_weight=sample_weight) 1546 | ) 1547 | 1548 | def _get_mode(self): 1549 | """Gets the current mode""" 1550 | self._validate_mode() 1551 | return deepcopy(self.mode) 1552 | 1553 | def _get_ml_task(self): 1554 | """Gets the current ml_task. 
If "auto" it is determined""" 1555 | self._validate_ml_task() 1556 | if self.ml_task == "auto": 1557 | classes_number = self.n_classes 1558 | if classes_number == 2: 1559 | self._estimator_type = "classifier" # for sk-learn api 1560 | return BINARY_CLASSIFICATION 1561 | elif classes_number <= 20: 1562 | self._estimator_type = "classifier" # for sk-learn api 1563 | return MULTICLASS_CLASSIFICATION 1564 | else: 1565 | self._estimator_type = "regressor" # for sk-learn api 1566 | return REGRESSION 1567 | else: 1568 | return deepcopy(self.ml_task) 1569 | 1570 | def _get_results_path(self): 1571 | """Gets the current results_path""" 1572 | # if we already have the results path set, please return it 1573 | if self._results_path is not None: 1574 | return self._results_path 1575 | 1576 | self._validate_results_path() 1577 | 1578 | path = self.results_path 1579 | 1580 | if path is None: 1581 | for i in range(1, 10001): 1582 | name = f"AutoML_{i}" 1583 | if not os.path.exists(name): 1584 | self.create_dir(name) 1585 | self._results_path = name 1586 | return name 1587 | # If it got here, could not create, raise expection 1588 | raise AutoMLException("Cannot create directory for AutoML results") 1589 | elif os.path.exists(self.results_path) and os.path.exists( 1590 | os.path.join(self.results_path, "params.json") 1591 | ): # AutoML already loaded, return path 1592 | self._results_path = path 1593 | return path 1594 | # Dir does not exist, create it 1595 | elif not os.path.exists(path): 1596 | self.create_dir(path) 1597 | self._results_path = path 1598 | return path 1599 | # Dir exists and is empty, use it 1600 | elif os.path.exists(path) and not len(os.listdir(path)): 1601 | self._results_path = path 1602 | return path 1603 | elif os.path.exists(path) and len(os.listdir(path)): 1604 | raise AutoMLException( 1605 | f"Cannot set directory for AutoML. Directory '{path}' is not empty." 
1606 | ) 1607 | 1608 | raise AutoMLException("Cannot set directory for AutoML results") 1609 | 1610 | def _get_total_time_limit(self): 1611 | """Gets the current total_time_limit""" 1612 | self._validate_total_time_limit() 1613 | if self._get_mode() == "Optuna": 1614 | return None # there no training limit for model in the Optuna mode 1615 | # just train and be happy with super models :) 1616 | return deepcopy(self.total_time_limit) 1617 | 1618 | def _get_model_time_limit(self): 1619 | """Gets the current model_time_limit""" 1620 | self._validate_model_time_limit() 1621 | return deepcopy(self.model_time_limit) 1622 | 1623 | def _get_algorithms(self): 1624 | """Gets the current algorithms. If "auto" it is determined""" 1625 | self._validate_algorithms() 1626 | if self.algorithms == "auto": 1627 | if self._get_mode() == "Explain": 1628 | return [ 1629 | "Baseline", 1630 | "Linear", 1631 | "Decision Tree", 1632 | "Random Forest", 1633 | "Xgboost", 1634 | "Neural Network", 1635 | ] 1636 | if self._get_mode() == "Perform": 1637 | return [ 1638 | "Linear", 1639 | "Random Forest", 1640 | "LightGBM", 1641 | "Xgboost", 1642 | "CatBoost", 1643 | "Neural Network", 1644 | ] 1645 | if self._get_mode() == "Compete": 1646 | return [ 1647 | "Decision Tree", 1648 | "Linear", 1649 | "Random Forest", 1650 | "Extra Trees", 1651 | "LightGBM", 1652 | "Xgboost", 1653 | "CatBoost", 1654 | "Neural Network", 1655 | "Nearest Neighbors", 1656 | ] 1657 | if self._get_mode() == "Optuna": 1658 | return [ 1659 | "Random Forest", 1660 | "Extra Trees", 1661 | "LightGBM", 1662 | "Xgboost", 1663 | "CatBoost", 1664 | "Neural Network", 1665 | ] 1666 | else: 1667 | return deepcopy(self.algorithms) 1668 | 1669 | def _get_train_ensemble(self): 1670 | """Gets the current train_ensemble""" 1671 | self._validate_train_ensemble() 1672 | return deepcopy(self.train_ensemble) 1673 | 1674 | def _get_stack_models(self): 1675 | """Gets the current stack_models""" 1676 | self._validate_stack_models() 1677 | if 
self.stack_models == "auto": 1678 | val = self._get_validation_strategy() 1679 | if val.get("validation_type", "") == "custom": 1680 | return False 1681 | return True if self.mode in ["Compete", "Optuna"] else False 1682 | else: 1683 | return deepcopy(self.stack_models) 1684 | 1685 | def _get_eval_metric(self): 1686 | """Gets the current eval_metric""" 1687 | self._validate_eval_metric() 1688 | if isinstance(self.eval_metric, types.FunctionType): 1689 | UserDefinedEvalMetric().set_metric(self.eval_metric) 1690 | return "user_defined_metric" 1691 | 1692 | if self.eval_metric == "auto": 1693 | if self._get_ml_task() == BINARY_CLASSIFICATION: 1694 | return "logloss" 1695 | elif self._get_ml_task() == MULTICLASS_CLASSIFICATION: 1696 | return "logloss" 1697 | elif self._get_ml_task() == REGRESSION: 1698 | return "rmse" 1699 | else: 1700 | return deepcopy(self.eval_metric) 1701 | 1702 | def _get_validation_strategy(self): 1703 | """Gets the current validation_strategy""" 1704 | strat = {} 1705 | self._validate_validation_strategy() 1706 | if self.validation_strategy == "auto": 1707 | if self._get_mode() == "Explain": 1708 | strat = { 1709 | "validation_type": "split", 1710 | "train_ratio": 0.75, 1711 | "shuffle": True, 1712 | "stratify": True, 1713 | } 1714 | elif self._get_mode() == "Perform": 1715 | strat = { 1716 | "validation_type": "kfold", 1717 | "k_folds": 5, 1718 | "shuffle": True, 1719 | "stratify": True, 1720 | } 1721 | elif self._get_mode() in ["Compete", "Optuna"]: 1722 | strat = { 1723 | "validation_type": "kfold", 1724 | "k_folds": 10, 1725 | "shuffle": True, 1726 | "stratify": True, 1727 | } 1728 | if self._get_ml_task() == REGRESSION: 1729 | if "stratify" in strat: 1730 | # it's better to always check 1731 | # before delete (trust me) 1732 | del strat["stratify"] 1733 | return strat 1734 | else: 1735 | strat = deepcopy(self.validation_strategy) 1736 | if self._get_ml_task() == REGRESSION: 1737 | if "stratify" in strat: 1738 | del strat["stratify"] 1739 | 
return strat 1740 | 1741 | def _get_verbose(self): 1742 | """Gets the current verbose""" 1743 | self._validate_verbose() 1744 | return deepcopy(self.verbose) 1745 | 1746 | def _get_explain_level(self): 1747 | """Gets the current explain_level""" 1748 | self._validate_explain_level() 1749 | if self.explain_level == "auto": 1750 | if self._get_mode() == "Explain": 1751 | return 2 1752 | if self._get_mode() == "Perform": 1753 | return 1 1754 | if self._get_mode() == "Compete": 1755 | return 0 1756 | if self._get_mode() == "Optuna": 1757 | return 0 1758 | else: 1759 | return deepcopy(self.explain_level) 1760 | 1761 | def _get_golden_features(self): 1762 | self._validate_golden_features() 1763 | if self.golden_features == "auto": 1764 | if self._get_mode() == "Explain": 1765 | return False 1766 | if self._get_mode() == "Perform": 1767 | return True 1768 | if self._get_mode() == "Compete": 1769 | return True 1770 | if self._get_mode() == "Optuna": 1771 | return False 1772 | else: 1773 | return deepcopy(self.golden_features) 1774 | 1775 | def _get_features_selection(self): 1776 | """Gets the current features_selection""" 1777 | self._validate_features_selection() 1778 | if self.features_selection == "auto": 1779 | if self._get_mode() == "Explain": 1780 | return False 1781 | if self._get_mode() == "Perform": 1782 | return True 1783 | if self._get_mode() == "Compete": 1784 | return True 1785 | if self._get_mode() == "Optuna": 1786 | return False 1787 | else: 1788 | return deepcopy(self.features_selection) 1789 | 1790 | def _get_start_random_models(self): 1791 | """Gets the current start_random_models""" 1792 | self._validate_start_random_models() 1793 | if self.start_random_models == "auto": 1794 | if self._get_mode() == "Explain": 1795 | return 1 1796 | if self._get_mode() == "Perform": 1797 | return 5 1798 | if self._get_mode() == "Compete": 1799 | return 10 1800 | if self._get_mode() == "Optuna": 1801 | return 1 # just 1, because it will be tuned by Optuna 1802 | else: 
1803 | return deepcopy(self.start_random_models) 1804 | 1805 | def _get_hill_climbing_steps(self): 1806 | """Gets the current hill_climbing_steps""" 1807 | self._validate_hill_climbing_steps() 1808 | if self.hill_climbing_steps == "auto": 1809 | if self._get_mode() == "Explain": 1810 | return 0 1811 | if self._get_mode() == "Perform": 1812 | return 2 1813 | if self._get_mode() == "Compete": 1814 | return 2 1815 | if self._get_mode() == "Optuna": 1816 | return 0 # all tuning is done in Optuna 1817 | else: 1818 | return deepcopy(self.hill_climbing_steps) 1819 | 1820 | def _get_top_models_to_improve(self): 1821 | """Gets the current top_models_to_improve""" 1822 | self._validate_top_models_to_improve() 1823 | if self.top_models_to_improve == "auto": 1824 | if self._get_mode() == "Explain": 1825 | return 0 1826 | if self._get_mode() == "Perform": 1827 | return 2 1828 | if self._get_mode() == "Compete": 1829 | return 3 1830 | if self._get_mode() == "Optuna": 1831 | return 0 1832 | else: 1833 | return deepcopy(self.top_models_to_improve) 1834 | 1835 | def _get_boost_on_errors(self): 1836 | """Gets the current boost_on_errors""" 1837 | self._validate_boost_on_errors() 1838 | if self.boost_on_errors == "auto": 1839 | val = self._get_validation_strategy() 1840 | if val.get("validation_type", "") == "custom": 1841 | return False 1842 | if self._get_mode() == "Explain": 1843 | return False 1844 | if self._get_mode() == "Perform": 1845 | return False 1846 | if self._get_mode() == "Compete": 1847 | return True 1848 | if self._get_mode() == "Optuna": 1849 | return False 1850 | else: 1851 | return deepcopy(self.boost_on_errors) 1852 | 1853 | def _get_kmeans_features(self): 1854 | """Gets the current kmeans_features""" 1855 | self._validate_kmeans_features() 1856 | if self.kmeans_features == "auto": 1857 | if self._get_mode() == "Explain": 1858 | return False 1859 | if self._get_mode() == "Perform": 1860 | return False 1861 | if self._get_mode() == "Compete": 1862 | return True 
1863 | if self._get_mode() == "Optuna": 1864 | return False 1865 | else: 1866 | return deepcopy(self.kmeans_features) 1867 | 1868 | def _get_mix_encoding(self): 1869 | """Gets the current mix_encoding""" 1870 | self._validate_mix_encoding() 1871 | if self.mix_encoding == "auto": 1872 | if self._get_mode() == "Explain": 1873 | return False 1874 | if self._get_mode() == "Perform": 1875 | return False 1876 | if self._get_mode() == "Compete": 1877 | return True 1878 | if self._get_mode() == "Optuna": 1879 | return False 1880 | else: 1881 | return deepcopy(self.mix_encoding) 1882 | 1883 | def _get_max_single_prediction_time(self): 1884 | """Gets the current max_single_prediction_time""" 1885 | self._validate_max_single_prediction_time() 1886 | if self.max_single_prediction_time is None: 1887 | if self._get_mode() == "Perform": 1888 | return 0.5 # prediction time should be under 0.5 second 1889 | return None 1890 | else: 1891 | return deepcopy(self.max_single_prediction_time) 1892 | 1893 | def _get_optuna_time_budget(self): 1894 | """Gets the current optuna_time_budget""" 1895 | self._validate_optuna_time_budget() 1896 | 1897 | if self.optuna_time_budget is None: 1898 | if self._get_mode() == "Optuna": 1899 | return 3600 1900 | return None 1901 | else: 1902 | if self._get_mode() != "Optuna": 1903 | # use only for mode Optuna 1904 | return None 1905 | return deepcopy(self.optuna_time_budget) 1906 | 1907 | def _get_optuna_init_params(self): 1908 | """Gets the current optuna_init_params""" 1909 | self._validate_optuna_init_params() 1910 | if self._get_mode() != "Optuna": 1911 | # use only for mode Optuna 1912 | return {} 1913 | return deepcopy(self.optuna_init_params) 1914 | 1915 | def _get_optuna_verbose(self): 1916 | """Gets the current optuna_verbose""" 1917 | self._validate_optuna_verbose() 1918 | # use only for mode Optuna 1919 | if self._get_mode() != "Optuna": 1920 | return True 1921 | return deepcopy(self.optuna_verbose) 1922 | 1923 | def _get_n_jobs(self): 1924 | 
"""Gets the current n_jobs""" 1925 | self._validate_n_jobs() 1926 | return deepcopy(self.n_jobs) 1927 | 1928 | def _get_random_state(self): 1929 | """Gets the current random_state""" 1930 | self._validate_random_state() 1931 | return deepcopy(self.random_state) 1932 | 1933 | def _validate_mode(self): 1934 | """Validates mode parameter""" 1935 | valid_modes = ["Explain", "Perform", "Compete", "Optuna"] 1936 | if self.mode not in valid_modes: 1937 | raise ValueError( 1938 | f"Expected 'mode' to be {' or '.join(valid_modes)}, got '{self.mode}'" 1939 | ) 1940 | 1941 | def _validate_ml_task(self): 1942 | """Validates ml_task parameter""" 1943 | if isinstance(self.ml_task, str) and self.ml_task == "auto": 1944 | return 1945 | 1946 | if self.ml_task not in AlgorithmsRegistry.get_supported_ml_tasks(): 1947 | raise ValueError( 1948 | f"Expected 'ml_task' to be {' or '.join(AlgorithmsRegistry.get_supported_ml_tasks())}, got '{self.ml_task}''" 1949 | ) 1950 | 1951 | def _validate_results_path(self): 1952 | """Validates path parameter""" 1953 | if self.results_path is None or isinstance(self.results_path, str): 1954 | return 1955 | 1956 | raise ValueError( 1957 | f"Expected 'results_path' to be of type string, got '{type(self.results_path)}''" 1958 | ) 1959 | 1960 | def _validate_total_time_limit(self): 1961 | """Validates total_time_limit parameter""" 1962 | if self.total_time_limit is None: 1963 | return 1964 | if self.total_time_limit is not None: 1965 | check_greater_than_zero_integer(self.total_time_limit, "total_time_limit") 1966 | 1967 | def _validate_model_time_limit(self): 1968 | """Validates model_time_limit parameter""" 1969 | if self.model_time_limit is not None: 1970 | check_greater_than_zero_integer(self.model_time_limit, "model_time_limit") 1971 | 1972 | def _validate_algorithms(self): 1973 | """Validates algorithms parameter""" 1974 | if isinstance(self.algorithms, str) and self.algorithms == "auto": 1975 | return 1976 | 1977 | for algo in self.algorithms: 1978 
| if algo not in list(AlgorithmsRegistry.registry[self._ml_task].keys()): 1979 | raise ValueError( 1980 | f"The algorithm {algo} is not allowed to use for ML task: {self._ml_task}. Allowed algorithms: {list(AlgorithmsRegistry.registry[self._ml_task].keys())}" 1981 | ) 1982 | 1983 | def _validate_train_ensemble(self): 1984 | """Validates train_ensemble parameter""" 1985 | # `train_ensemble` defaults to True, no further checking required 1986 | check_bool(self.train_ensemble, "train_ensemble") 1987 | 1988 | def _validate_stack_models(self): 1989 | """Validates stack_models parameter""" 1990 | # `stack_models` defaults to "auto". If "auto" return, else check if is valid bool 1991 | if isinstance(self.stack_models, str) and self.stack_models == "auto": 1992 | return 1993 | 1994 | check_bool(self.stack_models, "stack_models") 1995 | 1996 | def _validate_eval_metric(self): 1997 | """Validates eval_metric parameter""" 1998 | if isinstance(self.eval_metric, types.FunctionType): 1999 | return 2000 | 2001 | if isinstance(self.eval_metric, str) and self.eval_metric == "auto": 2002 | return 2003 | 2004 | if (self._get_ml_task() == BINARY_CLASSIFICATION) and self.eval_metric not in [ 2005 | "logloss", 2006 | "auc", 2007 | "f1", 2008 | "average_precision", 2009 | "accuracy", 2010 | ]: 2011 | raise ValueError( 2012 | f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \ 2013 | Use 'logloss', 'auc', 'f1', 'average_precision', or 'accuracy'" 2014 | ) 2015 | 2016 | elif ( 2017 | self._get_ml_task() == MULTICLASS_CLASSIFICATION 2018 | ) and self.eval_metric not in ["logloss", "f1", "accuracy"]: 2019 | raise ValueError( 2020 | f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. 
\ 2021 | Use 'logloss', 'f1', or 'accuracy'" 2022 | ) 2023 | 2024 | elif self._get_ml_task() == REGRESSION and self.eval_metric not in [ 2025 | "rmse", 2026 | "mse", 2027 | "mae", 2028 | "r2", 2029 | "mape", 2030 | "spearman", 2031 | "pearson", 2032 | ]: 2033 | raise ValueError( 2034 | f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \ 2035 | Use 'rmse', 'mse', 'mae', 'r2', 'mape', 'spearman', or 'pearson'" 2036 | ) 2037 | 2038 | def _validate_validation_strategy(self): 2039 | """Validates validation parameter""" 2040 | if ( 2041 | isinstance(self.validation_strategy, str) 2042 | and self.validation_strategy == "auto" 2043 | ): 2044 | return 2045 | 2046 | # only validation_type is mandatory 2047 | # other parameters of validations 2048 | # have defaults set in their constructors 2049 | required_keys = ["validation_type"] 2050 | if type(self.validation_strategy) is not dict: 2051 | raise ValueError( 2052 | f"Expected 'validation_strategy' to be a dict, got '{type(self.validation_strategy)}'" 2053 | ) 2054 | if not all(key in self.validation_strategy for key in required_keys): 2055 | raise ValueError(f"Expected dict with keys: {' , '.join(required_keys)}") 2056 | 2057 | def _validate_verbose(self): 2058 | """Validates verbose parameter""" 2059 | check_positive_integer(self.verbose, "verbose") 2060 | 2061 | def _validate_explain_level(self): 2062 | """Validates explain_level parameter""" 2063 | if isinstance(self.explain_level, str) and self.explain_level == "auto": 2064 | return 2065 | valid_explain_levels = [0, 1, 2] 2066 | # Check if explain level is 0 or greater integer 2067 | if not ( 2068 | isinstance(self.explain_level, int) 2069 | and self.explain_level in valid_explain_levels 2070 | ): 2071 | raise ValueError( 2072 | f"Expected 'explain_level' to be {' or '.join([str(x) for x in valid_explain_levels])}, got '{self.explain_level}'" 2073 | ) 2074 | 2075 | def _validate_golden_features(self): 2076 | """Validates golden_features 
parameter""" 2077 | if isinstance(self.golden_features, str) and self.golden_features == "auto": 2078 | return 2079 | if isinstance(self.golden_features, int): 2080 | return 2081 | check_bool(self.golden_features, "golden_features") 2082 | 2083 | def _validate_features_selection(self): 2084 | """Validates features_selection parameter""" 2085 | if ( 2086 | isinstance(self.features_selection, str) 2087 | and self.features_selection == "auto" 2088 | ): 2089 | return 2090 | check_bool(self.features_selection, "features_selection") 2091 | 2092 | def _validate_start_random_models(self): 2093 | """Validates start_random_models parameter""" 2094 | if ( 2095 | isinstance(self.start_random_models, str) 2096 | and self.start_random_models == "auto" 2097 | ): 2098 | return 2099 | check_greater_than_zero_integer(self.start_random_models, "start_random_models") 2100 | 2101 | def _validate_hill_climbing_steps(self): 2102 | """Validates hill_climbing_steps parameter""" 2103 | if ( 2104 | isinstance(self.hill_climbing_steps, str) 2105 | and self.hill_climbing_steps == "auto" 2106 | ): 2107 | return 2108 | check_positive_integer(self.hill_climbing_steps, "hill_climbing_steps") 2109 | 2110 | def _validate_top_models_to_improve(self): 2111 | """Validates top_models_to_improve parameter""" 2112 | if ( 2113 | isinstance(self.top_models_to_improve, str) 2114 | and self.top_models_to_improve == "auto" 2115 | ): 2116 | return 2117 | check_positive_integer(self.top_models_to_improve, "top_models_to_improve") 2118 | 2119 | def _validate_boost_on_errors(self): 2120 | """Validates boost_on_errors parameter""" 2121 | if isinstance(self.boost_on_errors, str) and self.boost_on_errors == "auto": 2122 | return 2123 | check_bool(self.boost_on_errors, "boost_on_errors") 2124 | 2125 | def _validate_kmeans_features(self): 2126 | """Validates kmeans_features parameter""" 2127 | if isinstance(self.kmeans_features, str) and self.kmeans_features == "auto": 2128 | return 2129 | 
check_bool(self.kmeans_features, "kmeans_features") 2130 | 2131 | def _validate_mix_encoding(self): 2132 | """Validates mix_encoding parameter""" 2133 | if isinstance(self.mix_encoding, str) and self.mix_encoding == "auto": 2134 | return 2135 | check_bool(self.mix_encoding, "mix_encoding") 2136 | 2137 | def _validate_max_single_prediction_time(self): 2138 | """Validates max_single_prediction_time parameter""" 2139 | if self.max_single_prediction_time is None: 2140 | return 2141 | check_greater_than_zero_integer_or_float( 2142 | self.max_single_prediction_time, "max_single_prediction_time" 2143 | ) 2144 | 2145 | def _validate_optuna_time_budget(self): 2146 | """Validates optuna_time_budget parameter""" 2147 | if self.optuna_time_budget is None: 2148 | return 2149 | check_greater_than_zero_integer(self.optuna_time_budget, "optuna_time_budget") 2150 | 2151 | def _validate_optuna_init_params(self): 2152 | """Validates optuna_init_params parameter""" 2153 | if self.optuna_init_params is None: 2154 | return 2155 | if type(self.optuna_init_params) is not dict: 2156 | raise ValueError( 2157 | f"Expected 'optuna_init_params' to be a dict, got '{type(self.optuna_init_params)}'" 2158 | ) 2159 | 2160 | def _validate_optuna_verbose(self): 2161 | """Validates optuna_verbose parameter""" 2162 | if self.optuna_verbose is None: 2163 | return 2164 | check_bool(self.optuna_verbose, "optuna_verbose") 2165 | 2166 | def _validate_n_jobs(self): 2167 | """Validates mix_encoding parameter""" 2168 | check_integer(self.n_jobs, "n_jobs") 2169 | 2170 | def _validate_random_state(self): 2171 | """Validates random_state parameter""" 2172 | check_positive_integer(self.random_state, "random_state") 2173 | 2174 | def _validate_fairness_metric(self): 2175 | """Validates fariness_metric parameter""" 2176 | if isinstance(self.fairness_metric, str) and self.fairness_metric == "auto": 2177 | return 2178 | 2179 | if ( 2180 | self._get_ml_task() in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION] 2181 | 
) and self.fairness_metric not in [ 2182 | "demographic_parity_difference", 2183 | "demographic_parity_ratio", 2184 | "equalized_odds_difference", 2185 | "equalized_odds_ratio", 2186 | ]: 2187 | raise ValueError( 2188 | f"Metric {self.fairness_metric} is not allowed in ML task: {self._get_ml_task()}. \ 2189 | Use `demographic_parity_difference`, `demographic_parity_ratio`, `equalized_odds_difference` or `equalized_odds_ratio`" 2190 | ) 2191 | if (self._get_ml_task() == REGRESSION) and self.fairness_metric not in [ 2192 | "group_loss_difference", 2193 | "group_loss_ratio", 2194 | ]: 2195 | raise ValueError( 2196 | f"Metric {self.fairness_metric} is not allowed in ML task: {self._get_ml_task()}. \ 2197 | Use `group_loss`" 2198 | ) 2199 | 2200 | def _get_fairness_metric(self): 2201 | """Gets the fairness metric""" 2202 | self._validate_fairness_metric() 2203 | if self.fairness_metric == "auto": 2204 | if self._get_ml_task() == BINARY_CLASSIFICATION: 2205 | return "demographic_parity_ratio" 2206 | if self._get_ml_task() == REGRESSION: 2207 | return "group_loss_ratio" 2208 | if self._get_ml_task() == MULTICLASS_CLASSIFICATION: 2209 | return "demographic_parity_ratio" 2210 | else: 2211 | return deepcopy(self.fairness_metric) 2212 | 2213 | def _get_fairness_threshold(self): 2214 | """Gets the fairness threshold""" 2215 | if self.fairness_threshold == "auto": 2216 | if self._get_ml_task() in [ 2217 | BINARY_CLASSIFICATION, 2218 | MULTICLASS_CLASSIFICATION, 2219 | ]: 2220 | thresholds = { 2221 | "demographic_parity_difference": 0.1, 2222 | "demographic_parity_ratio": 0.8, 2223 | "equalized_odds_difference": 0.1, 2224 | "equalized_odds_ratio": 0.8, 2225 | } 2226 | return thresholds.get(self._fairness_metric, 0.8) 2227 | elif self._get_ml_task() == REGRESSION: 2228 | thresholds = { 2229 | "group_loss_ratio": 0.8, 2230 | } 2231 | if self._fairness_metric == "group_loss_difference": 2232 | raise AutoMLException( 2233 | "We can't set default fairness threshold value. 
Please set `fairness_threshold` value in AutoML constructor." 2234 | ) 2235 | return thresholds.get(self._fairness_metric, 0.8) 2236 | else: 2237 | return deepcopy(self.fairness_threshold) 2238 | 2239 | def _get_privileged_groups(self): 2240 | """Gets privileged groups for fair training""" 2241 | if self.privileged_groups == "auto": 2242 | return [] 2243 | else: 2244 | return deepcopy(self.privileged_groups) 2245 | 2246 | def _get_underprivileged_groups(self): 2247 | """Gets underprivileged groups for fair training""" 2248 | if self.underprivileged_groups == "auto": 2249 | return [] 2250 | else: 2251 | return deepcopy(self.underprivileged_groups) 2252 | 2253 | def to_json(self): 2254 | if self._best_model is None: 2255 | return None 2256 | 2257 | return { 2258 | "best_model": self._best_model.to_json(), 2259 | "threshold": self._threshold, 2260 | "ml_task": self._ml_task, 2261 | } 2262 | 2263 | def from_json(self, json_data): 2264 | if json_data["best_model"]["algorithm_short_name"] == "Ensemble": 2265 | self._best_model = Ensemble() 2266 | self._best_model.from_json(json_data["best_model"]) 2267 | else: 2268 | self._best_model = ModelFramework(json_data["best_model"].get("params")) 2269 | self._best_model.from_json(json_data["best_model"]) 2270 | self._threshold = json_data.get("threshold") 2271 | 2272 | self._ml_task = json_data.get("ml_task") 2273 | 2274 | report_style = f""" 2275 | .styled-table {{ 2276 | border-collapse: collapse; 2277 | font-size: 0.9em; 2278 | font-family: Courier New; 2279 | }} 2280 | 2281 | .styled-table td, .styled-table th {{ 2282 | border: 1px solid #ddd; 2283 | padding: 8px; 2284 | }} 2285 | 2286 | .styled-table tr:nth-child(even){{background-color: #f2f2f2;}} 2287 | 2288 | .styled-table tr:hover {{background-color: #e0ecf5;}} 2289 | 2290 | .styled-table thead {{ 2291 | padding-top: 6px; 2292 | padding-bottom: 6px; 2293 | text-align: left; 2294 | background-color: #0099cc; 2295 | color: white; 2296 | }} 2297 | 2298 | 
.mljar-automl-report {{ 2299 | font-family: ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; 2300 | background-color: rgba(236, 243, 249, 0.15); 2301 | 2302 | 2303 | h1 {{ 2304 | color: #004666; 2305 | border-bottom: 1px solid rgba(0,70,102,0.3) 2306 | }} 2307 | h2 {{ 2308 | color: #004666; 2309 | padding-bottom: 5px; 2310 | margin-bottom: 0px; 2311 | }} 2312 | 2313 | ul {{ 2314 | margin-top: 0px; 2315 | }} 2316 | 2317 | p {{ 2318 | margin-top: 5px; 2319 | }} 2320 | 2321 | h3 {{ 2322 | color: #004666; 2323 | padding-bottom: 5px; 2324 | margin-bottom: 0px; 2325 | }} 2326 | a {{ 2327 | font-weight: bold; 2328 | color: #004666; 2329 | }} 2330 | 2331 | a:hover {{ 2332 | cursor: pointer; 2333 | color: #0099CC; 2334 | }} 2335 | }} 2336 | 2337 | """ 2338 | 2339 | def _md_to_html(self, md_fname, page_type, dir_path, me=None): 2340 | import base64 2341 | 2342 | import markdown 2343 | 2344 | if not os.path.exists(md_fname): 2345 | return None 2346 | content = "" 2347 | with open(md_fname) as fin: 2348 | content = fin.read() 2349 | 2350 | content = content.replace("README.md", "README.html") 2351 | content_html = markdown.markdown( 2352 | content, extensions=["markdown.extensions.tables"] 2353 | ) 2354 | content_html = content_html.replace("<img ", '<img style="width:750px" ') 2355 | content_html = content_html.replace("<table>", '<table class="styled-table">') 2356 | content_html = content_html.replace("<tr>", '<tr style="text-align: right;">') 2357 | 2358 | # replace png figures to base64 2359 | for f in os.listdir(dir_path): 2360 | if ".png" in f: 2361 | encoded_string = "" 2362 | with open(os.path.join(dir_path, f), "rb") as image_file: 2363 | encoded_string = base64.b64encode(image_file.read()) 2364 | encoded_string = encoded_string.decode("utf-8") 2365 | encoded_figure = f"data:image/png;base64, {encoded_string}" 2366 | content_html = content_html.replace(f, encoded_figure) 2367 | 2368 | # insert svg 
figures 2369 | for f in os.listdir(dir_path): 2370 | if ".svg" in f: 2371 | with open(os.path.join(dir_path, f), "rb") as image_file: 2372 | svg_plot = image_file.read() 2373 | svg_plot = svg_plot.decode("utf-8") 2374 | 2375 | arr = content_html.split("\n") 2376 | new_content = [] 2377 | for i in arr: 2378 | if f in i: 2379 | new_content += [f"<p>{svg_plot}</p>"] 2380 | else: 2381 | new_content += [i] 2382 | content_html = "\n".join(new_content) 2383 | 2384 | # change links 2385 | if page_type == f"automl-report-main-{self._id}": 2386 | for f in os.listdir(dir_path): 2387 | if os.path.exists(os.path.join(dir_path, f, "README.md")): 2388 | old = f'href="{f}/README.html"' 2389 | new = f"onclick=\"toggleShow('{f}-{self._id}');toggleShow('automl-report-main-{self._id}')\" " 2390 | content_html = content_html.replace(old, new) 2391 | 2392 | # other links 2393 | if me is not None: 2394 | old = 'href="../README.html"' 2395 | new = f"onclick=\"toggleShow('{me}-{self._id}');toggleShow('automl-report-main-{self._id}')\" " 2396 | content_html = content_html.replace(old, new) 2397 | 2398 | beginning = "" 2399 | 2400 | if page_type == f"automl-report-main-{self._id}": 2401 | beginning += """<img src="https://raw.githubusercontent.com/mljar/visual-identity/main/media/mljar_AutomatedML.png" style="height:128px; margin-left: auto; 2402 | margin-right: auto;display: block;"/>\n\n""" 2403 | if os.path.exists(os.path.join(self._results_path, "optuna/README.md")): 2404 | beginning += f"<h2><a onclick=\"toggleShow('optuna');toggleShow('automl-report-main-{self._id}')\" >» Optuna Params Tuning Report</a></h2>" 2405 | 2406 | content_html = beginning + content_html 2407 | 2408 | return content_html 2409 | 2410 | def _show_report(self, main_readme_html, width=900, height=1200): 2411 | from IPython.display import HTML, IFrame 2412 | 2413 | if os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is None: 2414 | with open(main_readme_html) as fin: 2415 | return HTML(fin.read()) 2416 | else: 2417 | return 
IFrame(main_readme_html, width=width, height=height) 2418 | 2419 | def _report(self, width=900, height=1200): 2420 | self._results_path = self._get_results_path() 2421 | main_readme_html = os.path.join(self._results_path, "README.html") 2422 | 2423 | if os.path.exists(main_readme_html): 2424 | return self._show_report(main_readme_html, width, height) 2425 | 2426 | body = "" 2427 | fname = os.path.join(self._results_path, "README.md") 2428 | body += ( 2429 | f'<div id="automl-report-main-{self._id}">\n' 2430 | + self._md_to_html(fname, f"automl-report-main-{self._id}", self._results_path) 2431 | + "\n\n</div>\n\n" 2432 | ) 2433 | 2434 | for f in os.listdir(self._results_path): 2435 | fname = os.path.join(self._results_path, f, "README.md") 2436 | if os.path.exists(fname): 2437 | body += ( 2438 | f'<div id="{f}-{self._id}" style="display: none">\n' 2439 | + self._md_to_html( 2440 | fname, "sub", os.path.join(self._results_path, f), f 2441 | ) 2442 | + "\n\n</div>\n\n" 2443 | ) 2444 | 2445 | body += """ 2446 | <script> 2447 | function toggleShow(elementId) { 2448 | var x = document.getElementById(elementId); 2449 | if (x.style.display === "none") { 2450 | x.style.display = "block"; 2451 | } else { 2452 | x.style.display = "none"; 2453 | } 2454 | } 2455 | </script> 2456 | """ 2457 | 2458 | report_content = f""" 2459 | <!DOCTYPE html> 2460 | <html> 2461 | <head> 2462 | <style> 2463 | {self.report_style} 2464 | </style> 2465 | </head> 2466 | <body> 2467 | <div class="mljar-automl-report-{self._id}"> 2468 | {body} 2469 | <div> 2470 | </body> 2471 | </html> 2472 | """ 2473 | with open(main_readme_html, "w") as fout: 2474 | fout.write(report_content) 2475 | 2476 | return self._show_report(main_readme_html, width, height) 2477 | 2478 | def _need_retrain(self, X, y, sample_weight, decrease): 2479 | metric = self._best_model.get_metric() 2480 | 2481 | X, y, sample_weight, _ = ExcludeRowsMissingTarget.transform( 2482 | X, y, sample_weight, warn=True 2483 | ) 2484 | 2485 | if 
self._ml_task == BINARY_CLASSIFICATION: 2486 | prediction = self._predict_proba(X)[:, 1] 2487 | if self._ml_task == MULTICLASS_CLASSIFICATION: 2488 | prediction = self._predict_proba(X) 2489 | else: 2490 | prediction = self._predict(X) 2491 | 2492 | sign = -1.0 if Metric.optimize_negative(metric.name) else 1.0 2493 | 2494 | new_score = metric(y, prediction, sample_weight) 2495 | old_score = self._best_model.get_final_loss() 2496 | 2497 | change = np.abs((old_score - new_score) / old_score) 2498 | 2499 | # always minimize the score 2500 | if new_score > old_score: 2501 | self.verbose_print( 2502 | f"Model performance decreased by {np.round(change*100.0,2)}%" 2503 | ) 2504 | return change > decrease 2505 | else: 2506 | self.verbose_print( 2507 | f"Model performance increased by {np.round(change*100.0,2)}%" 2508 | ) 2509 | return False 2510 | ```