mljar/mljar-supervised # codebase.md

This is page 14 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   └── workflows
│       ├── run-tests.yml
│       ├── test-installation-with-conda.yml
│       └── test-installation-with-pip-on-windows.yml
├── .gitignore
├── CITATION
├── examples
│   ├── notebooks
│   │   ├── basic_run.ipynb
│   │   └── Titanic.ipynb
│   └── scripts
│       ├── binary_classifier_adult_fairness.py
│       ├── binary_classifier_ensemble.py
│       ├── binary_classifier_marketing.py
│       ├── binary_classifier_random.py
│       ├── binary_classifier_Titanic.py
│       ├── binary_classifier.py
│       ├── multi_class_classifier_digits.py
│       ├── multi_class_classifier_MNIST.py
│       ├── multi_class_classifier.py
│       ├── multi_class_drug_fairness.py
│       ├── regression_acs_fairness.py
│       ├── regression_crime_fairness.py
│       ├── regression_housing_fairness.py
│       ├── regression_law_school_fairness.py
│       ├── regression.py
│       └── tabular_mar_2021.py
├── LICENSE
├── MANIFEST.in
├── pytest.ini
├── README.md
├── requirements_dev.txt
├── requirements.txt
├── setup.py
├── supervised
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── algorithm.py
│   │   ├── baseline.py
│   │   ├── catboost.py
│   │   ├── decision_tree.py
│   │   ├── extra_trees.py
│   │   ├── factory.py
│   │   ├── knn.py
│   │   ├── lightgbm.py
│   │   ├── linear.py
│   │   ├── nn.py
│   │   ├── random_forest.py
│   │   ├── registry.py
│   │   ├── sklearn.py
│   │   └── xgboost.py
│   ├── automl.py
│   ├── base_automl.py
│   ├── callbacks
│   │   ├── __init__.py
│   │   ├── callback_list.py
│   │   ├── callback.py
│   │   ├── early_stopping.py
│   │   ├── learner_time_constraint.py
│   │   ├── max_iters_constraint.py
│   │   ├── metric_logger.py
│   │   ├── terminate_on_nan.py
│   │   └── total_time_constraint.py
│   ├── ensemble.py
│   ├── exceptions.py
│   ├── fairness
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── plots.py
│   │   ├── report.py
│   │   └── utils.py
│   ├── model_framework.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── datetime_transformer.py
│   │   ├── encoding_selector.py
│   │   ├── exclude_missing_target.py
│   │   ├── goldenfeatures_transformer.py
│   │   ├── kmeans_transformer.py
│   │   ├── label_binarizer.py
│   │   ├── label_encoder.py
│   │   ├── preprocessing_categorical.py
│   │   ├── preprocessing_missing.py
│   │   ├── preprocessing_utils.py
│   │   ├── preprocessing.py
│   │   ├── scale.py
│   │   └── text_transformer.py
│   ├── tuner
│   │   ├── __init__.py
│   │   ├── data_info.py
│   │   ├── hill_climbing.py
│   │   ├── mljar_tuner.py
│   │   ├── optuna
│   │   │   ├── __init__.py
│   │   │   ├── catboost.py
│   │   │   ├── extra_trees.py
│   │   │   ├── knn.py
│   │   │   ├── lightgbm.py
│   │   │   ├── nn.py
│   │   │   ├── random_forest.py
│   │   │   ├── tuner.py
│   │   │   └── xgboost.py
│   │   ├── preprocessing_tuner.py
│   │   ├── random_parameters.py
│   │   └── time_controller.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── additional_metrics.py
│   │   ├── additional_plots.py
│   │   ├── automl_plots.py
│   │   ├── common.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── data_validation.py
│   │   ├── importance.py
│   │   ├── jsonencoder.py
│   │   ├── leaderboard_plots.py
│   │   ├── learning_curves.py
│   │   ├── metric.py
│   │   ├── shap.py
│   │   ├── subsample.py
│   │   └── utils.py
│   └── validation
│       ├── __init__.py
│       ├── validation_step.py
│       ├── validator_base.py
│       ├── validator_custom.py
│       ├── validator_kfold.py
│       └── validator_split.py
└── tests
    ├── __init__.py
    ├── checks
    │   ├── __init__.py
    │   ├── check_automl_with_regression.py
    │   ├── run_ml_tests.py
    │   └── run_performance_tests.py
    ├── conftest.py
    ├── data
    │   ├── 179.csv
    │   ├── 24.csv
    │   ├── 3.csv
    │   ├── 31.csv
    │   ├── 38.csv
    │   ├── 44.csv
    │   ├── 720.csv
    │   ├── 737.csv
    │   ├── acs_income_1k.csv
    │   ├── adult_missing_values_missing_target_500rows.csv
    │   ├── boston_housing.csv
    │   ├── CrimeData
    │   │   ├── cities.json
    │   │   ├── crimedata.csv
    │   │   └── README.md
    │   ├── Drug
    │   │   ├── Drug_Consumption.csv
    │   │   └── README.md
    │   ├── housing_regression_missing_values_missing_target.csv
    │   ├── iris_classes_missing_values_missing_target.csv
    │   ├── iris_missing_values_missing_target.csv
    │   ├── LawSchool
    │   │   ├── bar_pass_prediction.csv
    │   │   └── README.md
    │   ├── PortugeseBankMarketing
    │   │   └── Data_FinalProject.csv
    │   └── Titanic
    │       ├── test_with_Survived.csv
    │       └── train.csv
    ├── README.md
    ├── tests_algorithms
    │   ├── __init__.py
    │   ├── test_baseline.py
    │   ├── test_catboost.py
    │   ├── test_decision_tree.py
    │   ├── test_extra_trees.py
    │   ├── test_factory.py
    │   ├── test_knn.py
    │   ├── test_lightgbm.py
    │   ├── test_linear.py
    │   ├── test_nn.py
    │   ├── test_random_forest.py
    │   ├── test_registry.py
    │   └── test_xgboost.py
    ├── tests_automl
    │   ├── __init__.py
    │   ├── test_adjust_validation.py
    │   ├── test_automl_init.py
    │   ├── test_automl_report.py
    │   ├── test_automl_sample_weight.py
    │   ├── test_automl_time_constraints.py
    │   ├── test_automl.py
    │   ├── test_data_types.py
    │   ├── test_dir_change.py
    │   ├── test_explain_levels.py
    │   ├── test_golden_features.py
    │   ├── test_handle_imbalance.py
    │   ├── test_integration.py
    │   ├── test_joblib_version.py
    │   ├── test_models_needed_for_predict.py
    │   ├── test_prediction_after_load.py
    │   ├── test_repeated_validation.py
    │   ├── test_restore.py
    │   ├── test_stack_models_constraints.py
    │   ├── test_targets.py
    │   └── test_update_errors_report.py
    ├── tests_callbacks
    │   ├── __init__.py
    │   └── test_total_time_constraint.py
    ├── tests_ensemble
    │   ├── __init__.py
    │   └── test_save_load.py
    ├── tests_fairness
    │   ├── __init__.py
    │   ├── test_binary_classification.py
    │   ├── test_multi_class_classification.py
    │   └── test_regression.py
    ├── tests_preprocessing
    │   ├── __init__.py
    │   ├── disable_eda.py
    │   ├── test_categorical_integers.py
    │   ├── test_datetime_transformer.py
    │   ├── test_encoding_selector.py
    │   ├── test_exclude_missing.py
    │   ├── test_goldenfeatures_transformer.py
    │   ├── test_label_binarizer.py
    │   ├── test_label_encoder.py
    │   ├── test_preprocessing_missing.py
    │   ├── test_preprocessing_utils.py
    │   ├── test_preprocessing.py
    │   ├── test_scale.py
    │   └── test_text_transformer.py
    ├── tests_tuner
    │   ├── __init__.py
    │   ├── test_hill_climbing.py
    │   ├── test_time_controller.py
    │   └── test_tuner.py
    ├── tests_utils
    │   ├── __init__.py
    │   ├── test_compute_additional_metrics.py
    │   ├── test_importance.py
    │   ├── test_learning_curves.py
    │   ├── test_metric.py
    │   ├── test_shap.py
    │   └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py
```

# Files

--------------------------------------------------------------------------------
/supervised/base_automl.py:
--------------------------------------------------------------------------------

```python
   1 | import json
   2 | import logging
   3 | import os
   4 | import shutil
   5 | import time
   6 | import types
   7 | import uuid
   8 | from abc import ABC
   9 | from copy import deepcopy
  10 | 
  11 | import joblib
  12 | import numpy as np
  13 | import pandas as pd
  14 | from sklearn.base import BaseEstimator
  15 | from sklearn.metrics import accuracy_score, r2_score
  16 | from sklearn.utils.validation import check_array
  17 | from tabulate import tabulate
  18 | 
  19 | from supervised.algorithms.registry import (
  20 |     BINARY_CLASSIFICATION,
  21 |     MULTICLASS_CLASSIFICATION,
  22 |     REGRESSION,
  23 |     AlgorithmsRegistry,
  24 | )
  25 | from supervised.callbacks.early_stopping import EarlyStopping
  26 | from supervised.callbacks.total_time_constraint import TotalTimeConstraint
  27 | from supervised.ensemble import Ensemble
  28 | from supervised.exceptions import AutoMLException, NotTrainedException
  29 | from supervised.model_framework import ModelFramework
  30 | from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget
  31 | # disable EDA
  32 | # from supervised.preprocessing.eda import EDA
  33 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
  34 | from supervised.tuner.data_info import DataInfo
  35 | from supervised.tuner.mljar_tuner import MljarTuner
  36 | from supervised.tuner.time_controller import TimeController
  37 | from supervised.utils.automl_plots import AutoMLPlots
  38 | from supervised.utils.config import LOG_LEVEL
  39 | from supervised.utils.data_validation import (
  40 |     check_bool,
  41 |     check_greater_than_zero_integer,
  42 |     check_greater_than_zero_integer_or_float,
  43 |     check_integer,
  44 |     check_positive_integer,
  45 | )
  46 | from supervised.utils.jsonencoder import MLJSONEncoder
  47 | from supervised.utils.leaderboard_plots import LeaderboardPlots
  48 | from supervised.utils.metric import Metric, UserDefinedEvalMetric
  49 | from supervised.utils.utils import dump_data, load_data
  50 | 
# Module-level logger named after this module; its level is taken from
# LOG_LEVEL, imported from supervised.utils.config.
logger = logging.getLogger(__name__)
logger.setLevel(LOG_LEVEL)
  53 | 
  54 | 
  55 | class BaseAutoML(BaseEstimator, ABC):
  56 |     """
  57 |     Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression).
  58 |     Warning: This class should not be used directly. Use derived classes instead.
  59 |     """
  60 | 
  61 |     def __init__(self):
  62 |         logger.debug("BaseAutoML.__init__")
  63 |         self._mode = None
  64 |         self._ml_task = None
  65 |         self._results_path = None
  66 |         self._total_time_limit = None
  67 |         self._model_time_limit = None
  68 |         self._algorithms = []
  69 |         self._train_ensemble = False
  70 |         self._stack_models = False
  71 |         self._eval_metric = None
  72 |         self._validation_strategy = None
  73 |         self._verbose = None
  74 |         self._explain_level = None
  75 |         self._golden_features = None
  76 |         self._features_selection = None
  77 |         self._start_random_models = None
  78 |         self._hill_climbing_steps = None
  79 |         self._top_models_to_improve = None
  80 |         self._random_state = 1234
  81 |         self._models = []  # instances of iterative learner framework or ensemble
  82 |         self._best_model = None
  83 |         self._verbose = True
  84 |         self._threshold = None  # used only in classification
  85 |         self._metrics_details = None
  86 |         self._max_metrics = None
  87 |         self._confusion_matrix = None
  88 |         self._X_path, self._y_path = None, None
  89 |         self._data_info = None
  90 |         self._model_subpaths = []
  91 |         self._stacked_models = None
  92 |         self._fit_level = None
  93 |         self._start_time = time.time()
  94 |         self._time_ctrl = None
  95 |         self._all_params = {}
  96 |         # https://scikit-learn.org/stable/developers/develop.html#universal-attributes
  97 |         self.n_features_in_ = None  # for scikit-learn api
  98 |         self.tuner = None
  99 |         self._boost_on_errors = None
 100 |         self._kmeans_features = None
 101 |         self._mix_encoding = None
 102 |         self._max_single_prediction_time = None
 103 |         self._optuna_time_budget = None
 104 |         self._optuna_init_params = {}
 105 |         self._fairness_metric = None
 106 |         self._fairness_threshold = None
 107 |         self._privileged_groups = []
 108 |         self._underprivileged_groups = []
 109 |         self._optuna_verbose = True
 110 |         self._n_jobs = -1
 111 |         self._id = str(uuid.uuid4())
 112 | 
 113 |     def _get_tuner_params(
 114 |         self, start_random_models, hill_climbing_steps, top_models_to_improve
 115 |     ):
 116 |         return {
 117 |             "start_random_models": start_random_models,
 118 |             "hill_climbing_steps": hill_climbing_steps,
 119 |             "top_models_to_improve": top_models_to_improve,
 120 |         }
 121 | 
 122 |     def _check_can_load(self):
 123 |         """Checks if AutoML can be loaded from a folder"""
 124 |         if self.results_path is not None:
 125 |             # Dir exists and can be loaded
 126 |             if os.path.exists(self.results_path) and os.path.exists(
 127 |                 os.path.join(self.results_path, "params.json")
 128 |             ):
 129 |                 self.load(self.results_path)
 130 |                 self._results_path = self.results_path
 131 | 
 132 |     def load(self, path):
 133 |         logger.info("Loading AutoML models ...")
 134 |         try:
 135 |             with open(os.path.join(path, "params.json")) as file:
 136 |                 params = json.load(file)
 137 | 
 138 |             self._model_subpaths = params["saved"]
 139 |             self._mode = params.get("mode", self._mode)
 140 |             self._ml_task = params.get("ml_task", self._ml_task)
 141 |             self._results_path = params.get("results_path", self._results_path)
 142 |             self._total_time_limit = params.get(
 143 |                 "total_time_limit", self._total_time_limit
 144 |             )
 145 |             self._model_time_limit = params.get(
 146 |                 "model_time_limit", self._model_time_limit
 147 |             )
 148 |             self._algorithms = params.get("algorithms", self._algorithms)
 149 |             self._train_ensemble = params.get("train_ensemble", self._train_ensemble)
 150 |             self._stack_models = params.get("stack_models", self._stack_models)
 151 |             self._eval_metric = params.get("eval_metric", self._eval_metric)
 152 |             self._validation_strategy = params.get(
 153 |                 "validation_strategy", self._validation_strategy
 154 |             )
 155 |             self._verbose = params.get("verbose", self._verbose)
 156 |             self._explain_level = params.get("explain_level", self._explain_level)
 157 |             self._golden_features = params.get("golden_features", self._golden_features)
 158 |             self._features_selection = params.get(
 159 |                 "features_selection", self._features_selection
 160 |             )
 161 |             self._start_random_models = params.get(
 162 |                 "start_random_models", self._start_random_models
 163 |             )
 164 |             self._hill_climbing_steps = params.get(
 165 |                 "hill_climbing_steps", self._hill_climbing_steps
 166 |             )
 167 |             self._top_models_to_improve = params.get(
 168 |                 "top_models_to_improve", self._top_models_to_improve
 169 |             )
 170 |             self._boost_on_errors = params.get("boost_on_errors", self._boost_on_errors)
 171 |             self._kmeans_features = params.get("kmeans_features", self._kmeans_features)
 172 |             self._mix_encoding = params.get("mix_encoding", self._mix_encoding)
 173 |             self._max_single_prediction_time = params.get(
 174 |                 "max_single_prediction_time", self._max_single_prediction_time
 175 |             )
 176 |             self._n_jobs = params.get("n_jobs", self._n_jobs)
 177 |             self._random_state = params.get("random_state", self._random_state)
 178 |             stacked_models = params.get("stacked")
 179 | 
 180 |             best_model_name = params.get("best_model")
 181 |             load_on_predict = params.get("load_on_predict")
 182 |             self._fit_level = params.get("fit_level")
 183 |             lazy_load = not (
 184 |                 self._fit_level is not None and self._fit_level == "finished"
 185 |             )
 186 |             load_models = self._model_subpaths
 187 |             if load_on_predict is not None and self._fit_level == "finished":
 188 |                 load_models = load_on_predict
 189 |                 # just in case there is check for which models should be loaded
 190 |                 # fix https://github.com/mljar/mljar-supervised/issues/395
 191 |                 models_needed = self.models_needed_on_predict(best_model_name)
 192 |                 # join them and return unique list
 193 |                 load_models = list(np.unique(load_models + models_needed))
 194 | 
 195 |             models_map = {}
 196 | 
 197 |             for model_subpath in load_models:
 198 |                 if model_subpath.endswith("Ensemble") or model_subpath.endswith(
 199 |                     "Ensemble_Stacked"
 200 |                 ):
 201 |                     ens = Ensemble.load(path, model_subpath, models_map)
 202 |                     self._models += [ens]
 203 |                     models_map[ens.get_name()] = ens
 204 |                 else:
 205 |                     m = ModelFramework.load(path, model_subpath, lazy_load)
 206 |                     self._models += [m]
 207 |                     models_map[m.get_name()] = m
 208 | 
 209 |             self._best_model = None
 210 |             if best_model_name is not None:
 211 |                 self._best_model = models_map.get(best_model_name)
 212 | 
 213 |             if stacked_models is not None and (
 214 |                 self._best_model._is_stacked or self._fit_level != "finished"
 215 |             ):
 216 |                 self._stacked_models = []
 217 |                 for stacked_model_name in stacked_models:
 218 |                     self._stacked_models += [models_map[stacked_model_name]]
 219 | 
 220 |             data_info_path = os.path.join(path, "data_info.json")
 221 |             with open(data_info_path, "r") as file:
 222 |                 self._data_info = json.load(file)
 223 |             self.n_features_in_ = self._data_info["n_features"]
 224 | 
 225 |             if "n_classes" in self._data_info:
 226 |                 self.n_classes = self._data_info["n_classes"]
 227 | 
 228 |         except Exception as e:
 229 |             raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")
 230 | 
 231 |     def get_leaderboard(
 232 |         self, filter_random_feature=False, original_metric_values=False
 233 |     ):
 234 |         ldb = {
 235 |             "name": [],
 236 |             "model_type": [],
 237 |             "metric_type": [],
 238 |             "metric_value": [],
 239 |             "train_time": [],
 240 |         }
 241 |         if self._max_single_prediction_time is not None:
 242 |             ldb["single_prediction_time"] = []
 243 | 
 244 |         sensitive_features_names = []
 245 |         if self._fairness_metric is not None and len(self._models):
 246 |             sensitive_features_names = self._models[0].get_sensitive_features_names()
 247 |             ldb["fairness_metric"] = []
 248 |             for sf in sensitive_features_names:
 249 |                 ldb[f"fairness_{sf}"] = []
 250 |             ldb["is_fair"] = []
 251 | 
 252 |         for m in self._models:
 253 |             # filter model with random feature
 254 |             if filter_random_feature and "RandomFeature" in m.get_name():
 255 |                 continue
 256 |             ldb["name"] += [m.get_name()]
 257 |             ldb["model_type"] += [m.get_type()]
 258 |             ldb["metric_type"] += [self._eval_metric]
 259 |             ldb["metric_value"] += [m.get_final_loss()]
 260 |             ldb["train_time"] += [np.round(m.get_train_time(), 2)]
 261 |             if self._max_single_prediction_time is not None:
 262 |                 if m._single_prediction_time is not None:
 263 |                     ldb["single_prediction_time"] += [
 264 |                         np.round(m._single_prediction_time, 4)
 265 |                     ]
 266 |                 else:
 267 |                     ldb["single_prediction_time"] += [None]
 268 |             if self._fairness_metric is not None:
 269 |                 ldb["fairness_metric"] += [self._fairness_metric]
 270 |                 for sf in sensitive_features_names:
 271 |                     ldb[f"fairness_{sf}"] += [m.get_fairness_metric(sf)]
 272 |                 ldb["is_fair"] += [m.is_fair()]
 273 | 
 274 |         ldb = pd.DataFrame(ldb)
 275 |         # need to add argument for sorting
 276 |         # minimize_direction = m.get_metric().get_minimize_direction()
 277 |         # ldb = ldb.sort_values("metric_value", ascending=minimize_direction)
 278 | 
 279 |         if original_metric_values:
 280 |             if Metric.optimize_negative(self._eval_metric):
 281 |                 ldb["metric_value"] *= -1.0
 282 | 
 283 |         return ldb
 284 | 
 285 |     def keep_model(self, model, model_subpath):
 286 |         if model is None:
 287 |             return
 288 | 
 289 |         if self._max_single_prediction_time is not None:
 290 |             # let's check the prediction time ...
 291 |             # load 2x because of model reloading during the training
 292 |             for _ in range(2):
 293 |                 start_time = time.time()
 294 |                 self._base_predict(self._one_sample, model)
 295 |                 model._single_prediction_time = (
 296 |                     time.time() - start_time
 297 |                 )  # prediction time on single sample
 298 |             # again release learners from models
 299 |             if "Ensemble" not in model.get_type():
 300 |                 model.release_learners()
 301 | 
 302 |         self._models += [model]
 303 |         self._model_subpaths += [model_subpath]
 304 |         self.select_and_save_best()
 305 | 
 306 |         sign = -1.0 if Metric.optimize_negative(self._eval_metric) else 1.0
 307 |         msg = "{} {} {} trained in {} seconds".format(
 308 |             model.get_name(),
 309 |             self._eval_metric,
 310 |             np.round(sign * model.get_final_loss(), 6),
 311 |             np.round(model.get_train_time(), 2),
 312 |         )
 313 |         if model._single_prediction_time is not None:
 314 |             msg += f" (1-sample predict time {np.round(model._single_prediction_time,4)} seconds)"
 315 |         self.verbose_print(msg)
 316 |         self._time_ctrl.log_time(
 317 |             model.get_name(), model.get_type(), self._fit_level, model.get_train_time()
 318 |         )
 319 | 
 320 |         self.tuner.add_key(model)
 321 | 
 322 |     def create_dir(self, model_path):
 323 |         if not os.path.exists(model_path):
 324 |             try:
 325 |                 os.mkdir(model_path)
 326 |             except Exception as e:
 327 |                 raise AutoMLException(f"Cannot create directory {model_path}. {str(e)}")
 328 | 
 329 |     def _expected_learners_cnt(self):
 330 |         try:
 331 |             repeats = self._validation_strategy.get("repeats", 1)
 332 |             folds = self._validation_strategy.get("k_folds", 1)
 333 |             return repeats * folds
 334 |         except Exception as e:
 335 |             pass
 336 |         return 1
 337 | 
 338 |     def train_model(self, params):
 339 |         # do we have enough time to train?
 340 |         # if not, skip
 341 |         if not self._time_ctrl.enough_time(
 342 |             params["learner"]["model_type"], self._fit_level
 343 |         ):
 344 |             logger.info(f"Cannot train {params['name']} because of the time constraint")
 345 |             return False
 346 |         # let's create directory to log all training artifacts
 347 |         results_path, model_subpath = self._results_path, params["name"]
 348 |         model_path = os.path.join(results_path, model_subpath)
 349 |         self.create_dir(model_path)
 350 | 
 351 |         # prepare callbacks
 352 |         early_stop = EarlyStopping(
 353 |             {"metric": {"name": self._eval_metric}, "log_to_dir": model_path}
 354 |         )
 355 | 
 356 |         # disable for now
 357 |         max_time_for_learner = 3600
 358 |         if self._total_time_limit is not None:
 359 |             k_folds = self._validation_strategy.get("k_folds", 1.0)
 360 |             at_least_algorithms = 10.0
 361 | 
 362 |             max_time_for_learner = max(
 363 |                 self._total_time_limit / k_folds / at_least_algorithms, 60
 364 |             )
 365 | 
 366 |         params["max_time_for_learner"] = max_time_for_learner
 367 | 
 368 |         total_time_constraint = TotalTimeConstraint(
 369 |             {
 370 |                 "total_time_limit": self._total_time_limit
 371 |                 if self._model_time_limit is None
 372 |                 else None,
 373 |                 "total_time_start": self._start_time,
 374 |                 "expected_learners_cnt": self._expected_learners_cnt(),
 375 |             }
 376 |         )
 377 | 
 378 |         # create model framework
 379 |         mf = ModelFramework(
 380 |             params,
 381 |             callbacks=[early_stop, total_time_constraint],
 382 |         )
 383 | 
 384 |         # start training
 385 |         logger.info(
 386 |             f"Train model #{len(self._models)+1} / Model name: {params['name']}"
 387 |         )
 388 |         mf.train(results_path, model_subpath)
 389 | 
 390 |         # keep info about the model
 391 |         self.keep_model(mf, model_subpath)
 392 | 
 393 |         # save the model
 394 |         mf.save(results_path, model_subpath)
 395 | 
 396 |         return True
 397 | 
 398 |     def verbose_print(self, msg):
 399 |         if self._verbose > 0:
 400 |             # self._progress_bar.write(msg)
 401 |             print(msg)
 402 | 
 403 |     def ensemble_step(self, is_stacked=False):
 404 |         if self._train_ensemble and len(self._models) > 1:
 405 |             ensemble_subpath = "Ensemble_Stacked" if is_stacked else "Ensemble"
 406 |             ensemble_path = os.path.join(self._results_path, ensemble_subpath)
 407 |             self.create_dir(ensemble_path)
 408 | 
 409 |             self.ensemble = Ensemble(
 410 |                 self._eval_metric,
 411 |                 self._ml_task,
 412 |                 is_stacked=is_stacked,
 413 |                 max_single_prediction_time=self._max_single_prediction_time,
 414 |                 fairness_metric=self._fairness_metric,
 415 |                 fairness_threshold=self._fairness_threshold,
 416 |                 privileged_groups=self._privileged_groups,
 417 |                 underprivileged_groups=self._underprivileged_groups,
 418 |             )
 419 |             (
 420 |                 oofs,
 421 |                 target,
 422 |                 sample_weight,
 423 |                 sensitive_features,
 424 |             ) = self.ensemble.get_oof_matrix(self._models)
 425 |             self.ensemble.fit(oofs, target, sample_weight, sensitive_features)
 426 |             self.keep_model(self.ensemble, ensemble_subpath)
 427 |             self.ensemble.save(self._results_path, ensemble_subpath)
 428 |             return True
 429 |         return False
 430 | 
 431 |     def can_we_stack_them(self, y):
 432 |         # if multiclass and too many classes then No
 433 |         return True
 434 | 
 435 |     def get_stacked_data(self, X, mode="training"):
 436 |         # mode can be `training` or `predict`
 437 |         if self._stacked_models is None:
 438 |             return X
 439 |         all_oofs = []
 440 |         for m in self._stacked_models:
 441 |             oof = None
 442 |             if mode == "training":
 443 |                 oof = m.get_out_of_folds()
 444 |             else:
 445 |                 oof = m.predict(X)
 446 |                 if self._ml_task == BINARY_CLASSIFICATION:
 447 |                     cols = [f for f in oof.columns if "prediction" in f]
 448 |                     if len(cols) == 2:
 449 |                         oof = pd.DataFrame({"prediction": oof[cols[1]]})
 450 | 
 451 |             cols = [f for f in oof.columns if "prediction" in f]
 452 |             oof = oof[cols]
 453 |             oof.columns = [f"{m.get_name()}_{c}" for c in cols]
 454 |             all_oofs += [oof]
 455 | 
 456 |         org_index = X.index.copy()
 457 |         X.reset_index(drop=True, inplace=True)
 458 |         X_stacked = pd.concat([X] + all_oofs, axis=1)
 459 | 
 460 |         X_stacked.index = org_index.copy()
 461 |         X.index = org_index.copy()
 462 |         return X_stacked
 463 | 
 464 |     def _perform_model_stacking(self):
 465 |         if self._stacked_models is not None:
 466 |             return
 467 | 
 468 |         ldb = self.get_leaderboard(filter_random_feature=True)
 469 |         if self._fairness_metric is not None:
 470 |             # get only fair models if we train with sensitive features
 471 |             ldb = ldb[ldb["is_fair"]]
 472 |         ldb = ldb.sort_values(by="metric_value", ascending=True)
 473 |         models_map = {m.get_name(): m for m in self._models if not m._is_stacked}
 474 |         self._stacked_models = []
 475 |         models_limit = 10
 476 | 
 477 |         for model_type in np.unique(ldb.model_type):
 478 |             if model_type in ["Baseline"]:
 479 |                 continue
 480 |             ds = ldb[ldb.model_type == model_type].copy()
 481 |             ds.sort_values(by="metric_value", inplace=True)
 482 | 
 483 |             for n in list(ds.name.iloc[:models_limit].values):
 484 |                 self._stacked_models += [models_map[n]]
 485 | 
 486 |         scores = [m.get_final_loss() for m in self._stacked_models]
 487 |         self._stacked_models = [
 488 |             self._stacked_models[i] for i in np.argsort(scores).tolist()
 489 |         ]
 490 | 
 491 |     def get_stacking_minimum_time_needed(self):
 492 |         try:
 493 |             ldb = self.get_leaderboard(filter_random_feature=True)
 494 |             ldb = ldb.sort_values(by="metric_value", ascending=True)
 495 |             return min(2.0 * ldb.iloc[0]["train_time"], 60)
 496 |         except Exception as e:
 497 |             return 60
 498 | 
 499 |     def prepare_for_stacking(self):
 500 |         # print("Stacked models ....")
 501 |         # do we have enough models?
 502 |         if len(self._models) < 5:
 503 |             return
 504 |         # do we have time?
 505 |         if self._total_time_limit is not None:
 506 |             time_left = self._total_time_limit - (time.time() - self._start_time)
 507 |             # we need some time to start stacking
 508 |             # it should be at least 60 seconds for larger data
 509 |             # but for small data it can be less
 510 |             if time_left < self.get_stacking_minimum_time_needed():
 511 |                 return
 512 |         # too many classes and models
 513 |         if self._ml_task == MULTICLASS_CLASSIFICATION:
 514 |             if self.n_classes * len(self._models) > 1000:
 515 |                 return
 516 |         # if we are training with sensitive features
 517 |         # then we will stack only fair models
 518 |         # if there are no fair models then we skip this step
 519 |         if self._fairness_metric is not None:
 520 |             if not [m for m in self._models if m.is_fair()]:
 521 |                 self.verbose_print("Skip stacking. We can stack only fair models.")
 522 |                 return
 523 | 
 524 |         self._perform_model_stacking()
 525 | 
 526 |         X_stacked_path = os.path.join(self._results_path, "X_stacked.data")
 527 |         if os.path.exists(X_stacked_path):
 528 |             return
 529 | 
 530 |         X = load_data(self._X_path)
 531 |         org_columns = X.columns.tolist()
 532 |         X_stacked = self.get_stacked_data(X)
 533 |         new_columns = X_stacked.columns.tolist()
 534 |         added_columns = [c for c in new_columns if c not in org_columns]
 535 | 
 536 |         # save stacked train data
 537 |         dump_data(X_stacked_path, X_stacked)
 538 | 
 539 |         """
 540 |         # resue old params
 541 |         for m in self._stacked_models:
 542 |             # print(m.get_type())
 543 |             # use only Xgboost, LightGBM and CatBoost as stacked models
 544 |             if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
 545 |                 continue
 546 |             params = copy.deepcopy(m.params)
 547 |             params["validation"]["X_train_path"] = X_train_stacked_path
 548 |             params["name"] = params["name"] + "_Stacked"
 549 |             params["is_stacked"] = True
 550 |             # print(params)
 551 |             if "model_architecture_json" in params["learner"]:
 552 |                 # the new model will be created with wider input size
 553 |                 del params["learner"]["model_architecture_json"]
 554 |             if self._ml_task == REGRESSION:
 555 |                 # scale added predictions in regression if the target was scaled (in the case of NN)
 556 |                 target_preprocessing = params["preprocessing"]["target_preprocessing"]
 557 |                 scale = None
 558 |                 if "scale_log_and_normal" in target_preprocessing:
 559 |                     scale = "scale_log_and_normal"
 560 |                 elif "scale_normal" in target_preprocessing:
 561 |                     scale = "scale_normal"
 562 |                 if scale is not None:
 563 |                     for col in added_columns:
 564 |                         params["preprocessing"]["columns_preprocessing"][col] = [
 565 |                             scale]
 566 |             self.train_model(params)
 567 |         """
 568 | 
 569 |     def _save_data(self, X, y, sample_weight=None, cv=None, sensitive_features=None):
 570 |         # save information about original data
 571 |         self._save_data_info(X, y, sample_weight, sensitive_features)
 572 | 
 573 |         # handle drastic imbalance
 574 |         # assure at least 20 samples of each class
 575 |         # for binary and multiclass classification
 576 |         self._handle_drastic_imbalance(X, y, sample_weight, sensitive_features)
 577 | 
 578 |         # prepare path for saving files
 579 |         self._X_path = os.path.join(self._results_path, "X.data")
 580 |         self._y_path = os.path.join(self._results_path, "y.data")
 581 |         self._sample_weight_path = None
 582 |         if sample_weight is not None:
 583 |             self._sample_weight_path = os.path.join(
 584 |                 self._results_path, "sample_weight.data"
 585 |             )
 586 |             dump_data(
 587 |                 self._sample_weight_path, pd.DataFrame({"sample_weight": sample_weight})
 588 |             )
 589 |         self._sensitive_features_path = None
 590 |         if sensitive_features is not None:
 591 |             self._sensitive_features_path = os.path.join(
 592 |                 self._results_path, "sensitive_features.data"
 593 |             )
 594 |             dump_data(self._sensitive_features_path, sensitive_features)
 595 | 
 596 |         dump_data(self._X_path, X)
 597 | 
 598 |         if self._ml_task == MULTICLASS_CLASSIFICATION:
 599 |             y = y.astype(str)
 600 | 
 601 |         dump_data(self._y_path, pd.DataFrame({"target": y}))
 602 | 
 603 |         # set paths in validation parameters
 604 |         self._validation_strategy["X_path"] = self._X_path
 605 |         self._validation_strategy["y_path"] = self._y_path
 606 |         self._validation_strategy["results_path"] = self._results_path
 607 |         if sample_weight is not None:
 608 |             self._validation_strategy["sample_weight_path"] = self._sample_weight_path
 609 |         if sensitive_features is not None:
 610 |             self._validation_strategy[
 611 |                 "sensitive_features_path"
 612 |             ] = self._sensitive_features_path
 613 | 
 614 |         if cv is not None:
 615 |             self._validation_strategy["cv_path"] = os.path.join(
 616 |                 self._results_path, "cv.data"
 617 |             )
 618 |             joblib.dump(cv, self._validation_strategy["cv_path"])
 619 | 
 620 |         if self._max_single_prediction_time is not None:
 621 |             self._one_sample = X.iloc[:1].copy(deep=True)
 622 | 
 623 |     def _handle_drastic_imbalance(
 624 |         self, X, y, sample_weight=None, sensitive_features=None
 625 |     ):
 626 |         if self._ml_task == REGRESSION:
 627 |             return
 628 |         classes, cnts = np.unique(y, return_counts=True)
 629 |         min_samples_per_class = 20
 630 |         if self._validation_strategy is not None:
 631 |             min_samples_per_class = max(
 632 |                 min_samples_per_class, self._validation_strategy.get("k_folds", 0)
 633 |             )
 634 |         for i in range(len(classes)):
 635 |             if cnts[i] < min_samples_per_class:
 636 |                 append_samples = min_samples_per_class - cnts[i]
 637 |                 new_X = (
 638 |                     X[y == classes[i]]
 639 |                     .sample(n=append_samples, replace=True, random_state=1)
 640 |                     .reset_index(drop=True)
 641 |                 )
 642 |                 if sample_weight is not None:
 643 |                     new_sample_weight = (
 644 |                         sample_weight[y == classes[i]]
 645 |                         .sample(n=append_samples, replace=True, random_state=1)
 646 |                         .reset_index(drop=True)
 647 |                     )
 648 |                 if sensitive_features is not None:
 649 |                     new_sensitive_features = (
 650 |                         sensitive_features[y == classes[i]]
 651 |                         .sample(n=append_samples, replace=True, random_state=1)
 652 |                         .reset_index(drop=True)
 653 |                     )
 654 |                 for j in range(new_X.shape[0]):
 655 |                     X.loc[X.shape[0]] = new_X.loc[j]
 656 |                     y.loc[y.shape[0]] = classes[i]
 657 |                     if sample_weight is not None:
 658 |                         sample_weight.loc[
 659 |                             sample_weight.shape[0]
 660 |                         ] = new_sample_weight.loc[j]
 661 |                     if sensitive_features is not None:
 662 |                         sensitive_features.loc[
 663 |                             sensitive_features.shape[0]
 664 |                         ] = new_sensitive_features.loc[j]
 665 | 
 666 |     def _save_data_info(self, X, y, sample_weight=None, sensitive_features=None):
 667 |         target_is_numeric = pd.api.types.is_numeric_dtype(y)
 668 |         if self._ml_task == MULTICLASS_CLASSIFICATION:
 669 |             y = y.astype(str)
 670 | 
 671 |         columns_and_target_info = DataInfo.compute(X, y, self._ml_task)
 672 | 
 673 |         self.n_features_in_ = X.shape[1]
 674 |         self.n_classes = len(np.unique(y[~pd.isnull(y)]))
 675 | 
 676 |         self._data_info = {
 677 |             "columns": X.columns.tolist(),
 678 |             "rows": y.shape[0],
 679 |             "cols": X.shape[1],
 680 |             "target_is_numeric": target_is_numeric,
 681 |             "columns_info": columns_and_target_info["columns_info"],
 682 |             "target_info": columns_and_target_info["target_info"],
 683 |             "n_features": self.n_features_in_,
 684 |             "is_sample_weighted": sample_weight is not None,
 685 |             "is_fairness_applied": sensitive_features is not None,
 686 |         }
 687 |         # Add n_classes if not regression
 688 |         if self._ml_task != REGRESSION:
 689 |             self._data_info["n_classes"] = self.n_classes
 690 | 
 691 |         if columns_and_target_info.get("num_class") is not None:
 692 |             self._data_info["num_class"] = columns_and_target_info["num_class"]
 693 |         data_info_path = os.path.join(self._results_path, "data_info.json")
 694 |         with open(data_info_path, "w") as fout:
 695 |             fout.write(json.dumps(self._data_info, indent=4, cls=MLJSONEncoder))
 696 | 
 697 |     def save_progress(self, step=None, generated_params=None):
 698 |         if step is not None and generated_params is not None:
 699 |             self._all_params[step] = generated_params
 700 | 
 701 |         state = {}
 702 | 
 703 |         state["fit_level"] = self._fit_level
 704 |         state["time_controller"] = self._time_ctrl.to_json()
 705 |         state["all_params"] = self._all_params
 706 |         state["adjust_validation"] = self._adjust_validation
 707 | 
 708 |         fname = os.path.join(self._results_path, "progress.json")
 709 |         with open(fname, "w") as fout:
 710 |             fout.write(json.dumps(state, indent=4, cls=MLJSONEncoder))
 711 | 
 712 |     def load_progress(self):
 713 |         state = {}
 714 |         fname = os.path.join(self._results_path, "progress.json")
 715 |         if not os.path.exists(fname):
 716 |             return
 717 |         with open(fname, "r") as file:
 718 |             state = json.load(file)
 719 |         self._fit_level = state.get("fit_level", self._fit_level)
 720 |         self._all_params = state.get("all_params", self._all_params)
 721 |         self._time_ctrl = TimeController.from_json(state.get("time_controller"))
 722 |         self._adjust_validation = state.get("adjust_validation", False)
 723 | 
 724 |     def _validate_X_predict(self, X):
 725 |         """Validate X whenever one tries to predict, apply, predict_proba"""
 726 |         # X = check_array(X, ensure_2d=False)
 727 |         X = np.atleast_2d(X)
 728 |         n_features = X.shape[1]
 729 |         if self.n_features_in_ != n_features:
 730 |             raise ValueError(
 731 |                 f"Number of features of the model must match the input. Model n_features_in_ is {self.n_features_in_} and input n_features is {n_features}. Reshape your data."
 732 |             )
 733 | 
 734 |     # This method builds pandas.Dataframe from input. The input can be numpy.ndarray, matrix, or pandas.Dataframe
 735 |     # This method is used to build dataframes in `fit()` and in `predict`. That's the reason y can be None (`predict()` method)
 736 |     def _build_dataframe(self, X, y=None, sample_weight=None, sensitive_features=None):
 737 |         if X is None or X.shape[0] == 0:
 738 |             raise AutoMLException("Empty input dataset")
 739 |         # If Inputs are not pandas dataframes use scikit-learn validation for X array
 740 |         if not isinstance(X, pd.DataFrame):
 741 |             # Validate X as array
 742 |             X = check_array(X, ensure_2d=False, ensure_all_finite=False)
 743 |             # Force X to be 2D
 744 |             X = np.atleast_2d(X)
 745 |             # Create Pandas dataframe from np.arrays, columns get names with the schema: feature_{index}
 746 |             X = pd.DataFrame(
 747 |                 X, columns=["feature_" + str(i) for i in range(1, len(X[0]) + 1)]
 748 |             )
 749 |         # Enforce column names
 750 |         # Enforce X_train columns to be string
 751 |         X.columns = X.columns.astype(str)
 752 | 
 753 |         X.reset_index(drop=True, inplace=True)
 754 | 
 755 |         if y is None:
 756 |             return X
 757 | 
 758 |         # Check if y is np.ndarray, transform to pd.Series
 759 |         if isinstance(y, np.ndarray):
 760 |             y = check_array(
 761 |                 y,
 762 |                 ensure_2d=False,
 763 |                 dtype="str" if PreprocessingUtils.is_categorical(y) else "numeric",
 764 |             )
 765 |             y = pd.Series(np.array(y), name="target")
 766 |         # if pd.DataFrame, slice first column
 767 |         elif isinstance(y, pd.DataFrame):
 768 |             y = np.array(y.iloc[:, 0])
 769 |             y = check_array(y, ensure_2d=False)
 770 |             y = pd.Series(np.array(y), name="target")
 771 | 
 772 |         if sample_weight is not None:
 773 |             if isinstance(sample_weight, np.ndarray):
 774 |                 sample_weight = check_array(sample_weight, ensure_2d=False)
 775 |                 sample_weight = pd.Series(np.array(sample_weight), name="sample_weight")
 776 |             elif isinstance(sample_weight, pd.DataFrame):
 777 |                 sample_weight = np.array(sample_weight.iloc[:, 0])
 778 |                 sample_weight = check_array(sample_weight, ensure_2d=False)
 779 |                 sample_weight = pd.Series(np.array(sample_weight), name="sample_weight")
 780 | 
 781 |         if sensitive_features is not None:
 782 |             if isinstance(sensitive_features, np.ndarray):
 783 |                 sensitive_features = check_array(sensitive_features, ensure_2d=False)
 784 |                 sensitive_features = pd.DataFrame(
 785 |                     sensitive_features,
 786 |                     columns=[
 787 |                         "sensitive_" + str(i)
 788 |                         for i in range(1, len(sensitive_features[0]) + 1)
 789 |                     ],
 790 |                 )
 791 |             elif isinstance(sensitive_features, pd.Series):
 792 |                 sensitive_features = pd.DataFrame(sensitive_features)
 793 | 
 794 |         X, y, sample_weight, sensitive_features = ExcludeRowsMissingTarget.transform(
 795 |             X, y, sample_weight, sensitive_features, warn=True
 796 |         )
 797 | 
 798 |         X.reset_index(drop=True, inplace=True)
 799 |         y.reset_index(drop=True, inplace=True)
 800 | 
 801 |         if sample_weight is not None:
 802 |             sample_weight.reset_index(drop=True, inplace=True)
 803 | 
 804 |         if sensitive_features is not None:
 805 |             sensitive_features.reset_index(drop=True, inplace=True)
 806 | 
 807 |             for col in sensitive_features.columns:
 808 |                 if not sensitive_features[col].dtype.name in ["category", "object"]:
 809 |                     self.verbose_print("Sensitive features should be categorical")
 810 |                     self.verbose_print(
 811 |                         f"Apply automatic binarization for feature {col}"
 812 |                     )
 813 |                     sensitive_features[col] = pd.DataFrame(
 814 |                         pd.qcut(sensitive_features[col], q=2).astype(str)
 815 |                     )
 816 |                     self.verbose_print(
 817 |                         f"New values {list(sensitive_features[col].unique())} for feature {col} are applied"
 818 |                     )
 819 | 
 820 |         return X, y, sample_weight, sensitive_features
 821 | 
 822 |     def _apply_constraints(self):
 823 |         if "Neural Network" in self._algorithms and self._n_jobs != -1:
 824 |             self._algorithms.remove("Neural Network")
 825 |             self.verbose_print(
 826 |                 "Neural Network algorithm was disabled because it doesn't support n_jobs parameter."
 827 |             )
 828 |         if "Linear" in self._algorithms and not (
 829 |             self.n_rows_in_ < 10000 and self.n_features_in_ < 1000
 830 |         ):
 831 |             self._algorithms.remove("Linear")
 832 |             self.verbose_print("Linear algorithm was disabled.")
 833 | 
 834 |         # remove algorithms in the case of multiclass
 835 |         # and too many classes and columns
 836 |         if self._ml_task == MULTICLASS_CLASSIFICATION:
 837 |             if self.n_classes >= 10 and self.n_features_in_ * self.n_classes > 500:
 838 |                 if self.algorithms == "auto":
 839 |                     for a in ["CatBoost"]:
 840 |                         if a in self._algorithms:
 841 |                             self._algorithms.remove(a)
 842 | 
 843 |             if self.n_features_in_ * self.n_classes > 1000:
 844 |                 if self.algorithms == "auto":
 845 |                     for a in ["Xgboost", "CatBoost"]:
 846 |                         if a in self._algorithms:
 847 |                             self._algorithms.remove(a)
 848 |                 if self.validation_strategy == "auto":
 849 |                     self._validation_strategy = {
 850 |                         "validation_type": "split",
 851 |                         "train_ratio": 0.9,
 852 |                         "shuffle": True,
 853 |                     }
 854 |                     if self._get_ml_task() != REGRESSION:
 855 |                         self._validation_strategy["stratify"] = True
 856 | 
 857 |             if self.n_features_in_ * self.n_classes > 10000:
 858 |                 if self.algorithms == "auto":
 859 |                     for a in ["Random Forest", "Extra Trees"]:
 860 |                         if a in self._algorithms:
 861 |                             self._algorithms.remove(a)
 862 | 
 863 |         # Adjust the validation type based on speed of Decision Tree learning
 864 |         if (
 865 |             self._get_mode() == "Compete"
 866 |             and self._total_time_limit is not None
 867 |             and self.validation_strategy == "auto"
 868 |             and self._validation_strategy["validation_type"]
 869 |             != "split"  # split is the fastest validation type, no need to change
 870 |         ):
 871 |             # the validation will be adjusted after first Decision Tree learning on
 872 |             # train/test split (1-fold)
 873 |             self._adjust_validation = True
 874 |             self._validation_strategy = self._fastest_validation()
 875 | 
 876 |     def _fastest_validation(self):
 877 |         strategy = {"validation_type": "split", "train_ratio": 0.9, "shuffle": True}
 878 |         if self._get_ml_task() != REGRESSION:
 879 |             strategy["stratify"] = True
 880 |         return strategy
 881 | 
 882 |     def _set_adjusted_validation(self):
 883 |         if self._validation_strategy["validation_type"] != "split":
 884 |             return
 885 |         train_time = self._models[-1].get_train_time()
 886 |         # the time of Decision Tree training multiply by 5.0
 887 |         # to get the rough estimation how much time is needed for
 888 |         # other algorithms
 889 |         one_fold_time = train_time * 5.0
 890 |         # it will be good to train at least 10 models
 891 |         min_model_cnt = 10.0
 892 |         # the number of folds we can afford during the training
 893 |         folds_cnt = np.round(self._total_time_limit / one_fold_time / min_model_cnt)
 894 | 
 895 |         # adjust the validation if possible
 896 |         if folds_cnt >= 5.0:
 897 |             self.verbose_print(f"Adjust validation. Remove: {self._model_subpaths[0]}")
 898 |             k_folds = 5
 899 |             if folds_cnt >= 15:
 900 |                 k_folds = 10
 901 |             # too small dataset for stacking
 902 |             if self.n_rows_in_ < 500:
 903 |                 self._stack_models = False
 904 |                 self.verbose_print(
 905 |                     "*** Disable stacking for small dataset (nrows < 500)"
 906 |                 )
 907 | 
 908 |             self._validation_strategy["validation_type"] = "kfold"
 909 |             del self._validation_strategy["train_ratio"]
 910 |             self._validation_strategy["k_folds"] = k_folds
 911 |             self.tuner._validation_strategy = self._validation_strategy
 912 |             shutil.rmtree(
 913 |                 os.path.join(self._results_path, self._model_subpaths[0]),
 914 |                 ignore_errors=True,
 915 |             )
 916 |             del self._models[0]
 917 |             del self._model_subpaths[0]
 918 |             del self.tuner._unique_params_keys[0]
 919 |             self._adjust_validation = False
 920 |             cv = []
 921 |             if self._validation_strategy.get("shuffle", False):
 922 |                 cv += ["Shuffle"]
 923 |             if self._validation_strategy.get("stratify", False):
 924 |                 cv += ["Stratify"]
 925 |             self.select_and_save_best()  # save validation strategy
 926 | 
 927 |             self.verbose_print(f"Validation strategy: {k_folds}-fold CV {','.join(cv)}")
 928 |         else:
 929 |             # cant stack models for train/test split
 930 |             self._stack_models = False
 931 |             self.verbose_print("Disable stacking for split validation")
 932 | 
 933 |         self._apply_constraints_stack_models()
 934 | 
 935 |     def _apply_constraints_stack_models(self):
 936 |         if self._validation_strategy["validation_type"] == "split":
 937 |             if self._stack_models:
 938 |                 self.verbose_print("Disable stacking for split validation")
 939 |             self._stack_models = False
 940 |             self._boost_on_errors = False
 941 |         if "repeats" in self._validation_strategy:
 942 |             if self._stack_models:
 943 |                 self.verbose_print("Disable stacking for repeated validation")
 944 |             self._stack_models = False
 945 |             self._boost_on_errors = False
 946 | 
 947 |         # update Tuner
 948 |         if self.tuner is not None:
 949 |             self.tuner._stack_models = self._stack_models
 950 |             self.tuner._boost_on_errors = self._boost_on_errors
 951 | 
 952 |         # update Time Controler
 953 |         if self._time_ctrl is not None:
 954 |             self._time_ctrl._is_stacking = self._stack_models
 955 | 
 956 |             if "stack" in self._time_ctrl._steps and not self._stack_models:
 957 |                 self._time_ctrl._steps.remove("stack")
 958 |             if (
 959 |                 "boost_on_errors" in self._time_ctrl._steps
 960 |                 and not self._boost_on_errors
 961 |             ):
 962 |                 self._time_ctrl._steps.remove("boost_on_errors")
 963 | 
 964 |     def _fit(self, X, y, sample_weight=None, cv=None, sensitive_features=None):
 965 |         """Fits the AutoML model with data"""
 966 |         if self._fit_level == "finished":
 967 |             print(
 968 |                 "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
 969 |             )
 970 |             return
 971 |         # Validate input and build dataframes
 972 |         X, y, sample_weight, sensitive_features = self._build_dataframe(
 973 |             X, y, sample_weight, sensitive_features
 974 |         )
 975 | 
 976 |         self.n_rows_in_ = X.shape[0]
 977 |         self.n_features_in_ = X.shape[1]
 978 |         self.n_classes = len(np.unique(y[~pd.isnull(y)]))
 979 | 
 980 |         # Get attributes (__init__ params)
 981 |         self._mode = self._get_mode()
 982 |         self._ml_task = self._get_ml_task()
 983 |         self._results_path = self._get_results_path()
 984 |         self._total_time_limit = self._get_total_time_limit()
 985 |         self._model_time_limit = self._get_model_time_limit()
 986 |         self._algorithms = self._get_algorithms()
 987 |         self._train_ensemble = self._get_train_ensemble()
 988 |         self._stack_models = self._get_stack_models()
 989 |         self._eval_metric = self._get_eval_metric()
 990 |         self._validation_strategy = self._get_validation_strategy()
 991 |         self._verbose = self._get_verbose()
 992 |         self._explain_level = self._get_explain_level()
 993 |         self._golden_features = self._get_golden_features()
 994 |         self._features_selection = self._get_features_selection()
 995 |         self._start_random_models = self._get_start_random_models()
 996 |         self._hill_climbing_steps = self._get_hill_climbing_steps()
 997 |         self._top_models_to_improve = self._get_top_models_to_improve()
 998 |         self._boost_on_errors = self._get_boost_on_errors()
 999 |         self._kmeans_features = self._get_kmeans_features()
1000 |         self._mix_encoding = self._get_mix_encoding()
1001 |         self._max_single_prediction_time = self._get_max_single_prediction_time()
1002 |         self._optuna_time_budget = self._get_optuna_time_budget()
1003 |         self._optuna_init_params = self._get_optuna_init_params()
1004 |         self._optuna_verbose = self._get_optuna_verbose()
1005 |         self._n_jobs = self._get_n_jobs()
1006 |         self._random_state = self._get_random_state()
1007 | 
1008 |         if sensitive_features is not None:
1009 |             self._fairness_metric = self._get_fairness_metric()
1010 |             self._fairness_threshold = self._get_fairness_threshold()
1011 |             self._privileged_groups = self._get_privileged_groups()
1012 |             self._underprivileged_groups = self._get_underprivileged_groups()
1013 | 
1014 |         self._adjust_validation = False
1015 |         self._apply_constraints()
1016 |         if not self._adjust_validation:
1017 |             # if there is no validation adjustment
1018 |             # then we can apply stack_models constraints immediately
1019 |             # if there is validation adjustment
1020 |             # then we will apply constraints after the adjustment
1021 |             self._apply_constraints_stack_models()
1022 | 
1023 |         try:
1024 |             self.load_progress()
1025 |             if self._fit_level == "finished":
1026 |                 print(
1027 |                     "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
1028 |                 )
1029 |                 return
1030 |             self._check_can_load()
1031 | 
1032 |             self.verbose_print(f"AutoML directory: {self._results_path}")
1033 |             if self._mode == "Optuna":
1034 |                 ttl = int(len(self._algorithms) * self._optuna_time_budget)
1035 |                 self.verbose_print("Expected computing time:")
1036 |                 self.verbose_print(
1037 |                     f"Time for tuning with Optuna: len(algorithms) * optuna_time_budget = {int(len(self._algorithms) * self._optuna_time_budget)} seconds"
1038 |                 )
1039 |                 self.verbose_print(
1040 |                     f"There is no time limit for ML model training after Optuna tuning (total_time_limit parameter is ignored)."
1041 |                 )
1042 | 
1043 |             self.verbose_print(
1044 |                 f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
1045 |             )
1046 |             self.verbose_print(f"AutoML will use algorithms: {self._algorithms}")
1047 |             if self._stack_models:
1048 |                 self.verbose_print("AutoML will stack models")
1049 |             if self._train_ensemble:
1050 |                 self.verbose_print("AutoML will ensemble available models")
1051 | 
1052 |             self._start_time = time.time()
1053 |             if self._time_ctrl is not None:
1054 |                 self._start_time -= self._time_ctrl.already_spend()
1055 | 
1056 |             # Automatic Exploratory Data Analysis
1057 |             # I disabled EDA, because it won't be supported
1058 |             # I recommend using pandas_profiling or Sweetviz
1059 |             # if self._explain_level == 2:
1060 |             #     EDA.compute(X, y, os.path.join(self._results_path, "EDA"))
1061 | 
1062 |             # Save data
1063 | 
1064 |             self._save_data(
1065 |                 X.copy(deep=False),
1066 |                 y.copy(deep=False),
1067 |                 None if sample_weight is None else sample_weight.copy(deep=False),
1068 |                 cv,
1069 |                 None
1070 |                 if sensitive_features is None
1071 |                 else sensitive_features.copy(deep=False),
1072 |             )
1073 | 
1074 |             tuner = MljarTuner(
1075 |                 self._get_tuner_params(
1076 |                     self._start_random_models,
1077 |                     self._hill_climbing_steps,
1078 |                     self._top_models_to_improve,
1079 |                 ),
1080 |                 self._algorithms,
1081 |                 self._ml_task,
1082 |                 self._eval_metric,
1083 |                 self._validation_strategy,
1084 |                 self._explain_level,
1085 |                 self._data_info,
1086 |                 self._golden_features,
1087 |                 self._features_selection,
1088 |                 self._train_ensemble,
1089 |                 self._stack_models,
1090 |                 self._adjust_validation,
1091 |                 self._boost_on_errors,
1092 |                 self._kmeans_features,
1093 |                 self._mix_encoding,
1094 |                 self._optuna_time_budget,
1095 |                 self._optuna_init_params,
1096 |                 self._optuna_verbose,
1097 |                 self._n_jobs,
1098 |                 self._random_state,
1099 |                 self._fairness_metric,
1100 |                 self._fairness_threshold,
1101 |                 self._privileged_groups,
1102 |                 self._underprivileged_groups,
1103 |             )
1104 |             self.tuner = tuner
1105 | 
1106 |             steps = tuner.steps()
1107 |             self.verbose_print(
1108 |                 f'AutoML steps: {[s for s in steps if "update_" not in s]}'
1109 |             )
1110 |             if self._time_ctrl is None:
1111 |                 self._time_ctrl = TimeController(
1112 |                     self._start_time,
1113 |                     self._total_time_limit,
1114 |                     self._model_time_limit,
1115 |                     steps,
1116 |                     self._algorithms,
1117 |                 )
1118 | 
1119 |             self._time_ctrl.log_time(
1120 |                 "prepare_data",
1121 |                 "prepare_data",
1122 |                 "prepare_data",
1123 |                 time.time() - self._start_time,
1124 |             )
1125 | 
1126 |             for step in steps:
1127 |                 self._fit_level = step
1128 |                 start = time.time()
1129 |                 # self._time_start[step] = start
1130 | 
1131 |                 if step in ["stack", "ensemble_stacked"] and not self._stack_models:
1132 |                     continue
1133 | 
1134 |                 if step == "stack":
1135 |                     self.prepare_for_stacking()
1136 |                 if "hill_climbing" in step or step in ["ensemble", "stack"]:
1137 |                     if len(self._models) == 0:
1138 |                         raise AutoMLException(
1139 |                             "No models produced. \nPlease check your data or"
1140 |                             " submit a Github issue at https://github.com/mljar/mljar-supervised/issues/new."
1141 |                         )
1142 | 
1143 |                 generated_params = []
1144 |                 if step in self._all_params:
1145 |                     generated_params = self._all_params[step]
1146 |                 else:
1147 |                     generated_params = tuner.generate_params(
1148 |                         step,
1149 |                         self._models,
1150 |                         self._results_path,
1151 |                         self._stacked_models,
1152 |                         self._total_time_limit,
1153 |                     )
1154 | 
1155 |                 if generated_params is None or not generated_params:
1156 |                     if "_update_" not in step:
1157 |                         self.verbose_print(
1158 |                             f"Skip {step} because no parameters were generated."
1159 |                         )
1160 |                     continue
1161 |                 if generated_params:
1162 |                     if not self._time_ctrl.enough_time_for_step(self._fit_level):
1163 |                         self.verbose_print(f"Skip {step} because of the time limit.")
1164 |                         continue
1165 |                     else:
1166 |                         model_str = "models" if len(generated_params) > 1 else "model"
1167 |                         self.verbose_print(
1168 |                             f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
1169 |                         )
1170 | 
1171 |                 for params in generated_params:
1172 |                     if params.get("status", "") in ["trained", "skipped", "error"]:
1173 |                         self.verbose_print(f"{params['name']}: {params['status']}.")
1174 |                         continue
1175 | 
1176 |                     try:
1177 |                         trained = False
1178 |                         if "ensemble" in step:
1179 |                             trained = self.ensemble_step(
1180 |                                 is_stacked=params["is_stacked"]
1181 |                             )
1182 |                         else:
1183 |                             trained = self.train_model(params)
1184 |                         params["status"] = "trained" if trained else "skipped"
1185 |                         params["final_loss"] = self._models[-1].get_final_loss()
1186 |                         params["train_time"] = self._models[-1].get_train_time()
1187 | 
1188 |                         if (
1189 |                             self._adjust_validation
1190 |                             and len(self._models) == 1
1191 |                             and step == "adjust_validation"
1192 |                         ):
1193 |                             self._set_adjusted_validation()
1194 | 
1195 |                     except NotTrainedException as e:
1196 |                         params["status"] = "error"
1197 |                         self.verbose_print(
1198 |                             params.get("name") + " not trained. " + str(e)
1199 |                         )
1200 |                     except Exception as e:
1201 |                         import traceback
1202 | 
1203 |                         self._update_errors_report(
1204 |                             params.get("name"), str(e) + "\n" + traceback.format_exc()
1205 |                         )
1206 |                         params["status"] = "error"
1207 | 
1208 |                     self.save_progress(step, generated_params)
1209 | 
1210 |             if not self._models:
1211 |                 raise AutoMLException("No models produced.")
1212 |             self._fit_level = "finished"
1213 |             self.save_progress()
1214 |             self.select_and_save_best(show_warnings=True)
1215 | 
1216 |             self.verbose_print(
1217 |                 f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
1218 |             )
1219 |             self.verbose_print(f"AutoML best model: {self._best_model.get_name()}")
1220 | 
1221 |             if self._fairness_metric is not None:
1222 |                 # check if we have fair model
1223 |                 has_fair_model = False
1224 |                 for m in self._models:
1225 |                     if m.is_fair():
1226 |                         has_fair_model = True
1227 |                         break
1228 |                 if not has_fair_model:
1229 |                     self.verbose_print(
1230 |                         "AutoML can't construct model that meets your fairness criteria."
1231 |                     )
1232 |                     self.verbose_print("What you can do?")
1233 |                     self.verbose_print(
1234 |                         "1. Please include more samples that are not biased."
1235 |                     )
1236 |                     self.verbose_print(
1237 |                         "2. Please examine the most unfairly treated samples."
1238 |                     )
1239 |                     self.verbose_print("3. Please change fairness threshold.")
1240 | 
1241 |         except Exception as e:
1242 |             raise e
1243 | 
1244 |         return self
1245 | 
1246 |     def _update_errors_report(self, model_name, error_msg):
1247 |         """Append error message to errors.md file."""
1248 |         errors_filename = os.path.join(self._get_results_path(), "errors.md")
1249 |         with open(errors_filename, "a") as fout:
1250 |             self.verbose_print(f"There was an error during {model_name} training.")
1251 |             self.verbose_print(f"Please check {errors_filename} for details.")
1252 |             fout.write(f"## Error for {model_name}\n\n")
1253 |             fout.write(error_msg)
1254 |             link = "https://github.com/mljar/mljar-supervised/issues/new"
1255 |             fout.write(
1256 |                 f"\n\nPlease set a GitHub issue with above error message at: {link}"
1257 |             )
1258 |             fout.write("\n\n")
1259 | 
1260 |     def select_and_save_best(self, show_warnings=False):
1261 |         # Select best model based on the lowest loss
1262 |         self._best_model = None
1263 | 
1264 |         if self._models:
1265 |             if self._fairness_metric is not None:
1266 |                 models = [
1267 |                     m
1268 |                     for m in self._models
1269 |                     if m.is_valid()
1270 |                     # and m.is_fast_enough(self._max_single_prediction_time)
1271 |                     and m.is_fair()
1272 |                 ]
1273 | 
1274 |                 if models:
1275 |                     # if there are fair models, we select the one with best performance
1276 |                     self._best_model = min(
1277 |                         models,
1278 |                         key=lambda x: x.get_final_loss(),
1279 |                     )
1280 |                 else:
1281 |                     # if no models are fair, we select the most fair model
1282 |                     if "ratio" in self._fairness_metric.lower():
1283 |                         self._best_model = max(
1284 |                             [m for m in self._models if m.is_valid()],
1285 |                             key=lambda x: x.get_best_fairness(),
1286 |                         )
1287 |                     else:
1288 |                         self._best_model = min(
1289 |                             [m for m in self._models if m.is_valid()],
1290 |                             key=lambda x: x.get_best_fairness(),
1291 |                         )
1292 | 
1293 |             else:
1294 |                 model_list = [
1295 |                     m
1296 |                     for m in self._models
1297 |                     if m.is_valid()
1298 |                     and m.is_fast_enough(self._max_single_prediction_time)
1299 |                 ]
1300 |                 if model_list:
1301 |                     self._best_model = min(
1302 |                         model_list,
1303 |                         key=lambda x: x.get_final_loss(),
1304 |                     )
1305 |         # if none selected please select again and warn the user
1306 |         if (
1307 |             len(self._models)
1308 |             and self._best_model is None
1309 |             and self._max_single_prediction_time is not None
1310 |         ):
1311 |             if show_warnings:
1312 |                 msg = (
1313 |                     "*" * 64
1314 |                     + "\nThere were no model with prediction time smaller than the limit.\n"
1315 |                     + "Please increase the prediction time for single sample,\n"
1316 |                     + "or please to use train/test split for validation\n"
1317 |                     + "*" * 64
1318 |                 )
1319 |                 self.verbose_print(msg)
1320 | 
1321 |             self._best_model = min(
1322 |                 [m for m in self._models if m.is_valid()],
1323 |                 key=lambda x: x.get_final_loss(),
1324 |             )
1325 | 
1326 |         with open(os.path.join(self._results_path, "params.json"), "w") as fout:
1327 |             params = {
1328 |                 "mode": self._mode,
1329 |                 "ml_task": self._ml_task,
1330 |                 "results_path": self._results_path,
1331 |                 "total_time_limit": self._total_time_limit,
1332 |                 "model_time_limit": self._model_time_limit,
1333 |                 "algorithms": self._algorithms,
1334 |                 "train_ensemble": self._train_ensemble,
1335 |                 "stack_models": self._stack_models,
1336 |                 "eval_metric": self._eval_metric,
1337 |                 "validation_strategy": self._validation_strategy,
1338 |                 "verbose": self._verbose,
1339 |                 "explain_level": self._explain_level,
1340 |                 "golden_features": self._golden_features,
1341 |                 "features_selection": self._features_selection,
1342 |                 "start_random_models": self._start_random_models,
1343 |                 "hill_climbing_steps": self._hill_climbing_steps,
1344 |                 "top_models_to_improve": self._top_models_to_improve,
1345 |                 "boost_on_errors": self._boost_on_errors,
1346 |                 "kmeans_features": self._kmeans_features,
1347 |                 "mix_encoding": self._mix_encoding,
1348 |                 "max_single_prediction_time": self._max_single_prediction_time,
1349 |                 "n_jobs": self._n_jobs,
1350 |                 "random_state": self._random_state,
1351 |                 "saved": self._model_subpaths,
1352 |                 "fit_level": self._fit_level,
1353 |             }
1354 |             if self._best_model is not None:
1355 |                 params["best_model"] = self._best_model.get_name()
1356 |                 load_on_predict = []
1357 |                 load_on_predict += self._best_model.involved_model_names()
1358 |                 if self._best_model._is_stacked and self._stacked_models is not None:
1359 |                     for m in self._stacked_models:
1360 |                         load_on_predict += m.involved_model_names()
1361 |                 params["load_on_predict"] = list(np.unique(load_on_predict))
1362 | 
1363 |             if self._stacked_models is not None:
1364 |                 params["stacked"] = [m.get_name() for m in self._stacked_models]
1365 |             fout.write(json.dumps(params, indent=4, cls=MLJSONEncoder))
1366 | 
1367 |         if self._models:
1368 |             ldb = self.get_leaderboard(original_metric_values=True)
1369 |             ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)
1370 | 
1371 |             # save report
1372 |             ldb.insert(loc=0, column="Best model", value="")
1373 |             ldb.loc[
1374 |                 ldb.name == self._best_model.get_name(), "Best model"
1375 |             ] = "**the best**"
1376 |             ldb["name"] = [f"[{m}]({m}/README.md)" for m in ldb["name"].values]
1377 | 
1378 |             with open(os.path.join(self._results_path, "README.md"), "w") as fout:
1379 |                 fout.write(f"# AutoML Leaderboard\n\n")
1380 |                 fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
1381 |                 LeaderboardPlots.compute(
1382 |                     ldb, self._results_path, fout, self._fairness_threshold
1383 |                 )
1384 | 
1385 |                 if self._fit_level == "finished":
1386 |                     AutoMLPlots.add(self._results_path, self._models, fout)
1387 | 
1388 |     def get_ensemble_models(self, ensemble_name="Ensemble"):
1389 |         try:
1390 |             with open(os.path.join(self.results_path, ensemble_name, "ensemble.json")) as file:
1391 |                 params = json.load(file)
1392 |             return [m["model"] for m in params["selected_models"]]
1393 |         except Exception as e:
1394 |             return []
1395 | 
1396 |     def models_needed_on_predict(self, required_model_name):
1397 |         with open(os.path.join(self.results_path, "params.json")) as file:
1398 |             params = json.load(file)
1399 |         saved_models = params.get("saved", [])
1400 |         stacked_models = params.get("stacked", [])
1401 | 
1402 |         if required_model_name not in saved_models:
1403 |             raise AutoMLException(
1404 |                 f"Can't load model {required_model_name}. Please check if the model's name is correct."
1405 |             )
1406 |         # single model needed
1407 |         if (
1408 |             "Stacked" not in required_model_name
1409 |             and "Ensemble" not in required_model_name
1410 |         ):
1411 |             return [required_model_name]
1412 |         ensemble_models = self.get_ensemble_models("Ensemble")
1413 |         # ensemble of single models
1414 |         if required_model_name == "Ensemble":
1415 |             return ensemble_models + [required_model_name]
1416 |         # single model on stacked data
1417 |         if required_model_name != "Stacked_Ensemble":
1418 |             return list(
1419 |                 np.unique(
1420 |                     ensemble_models
1421 |                     + ["Ensemble"]
1422 |                     + stacked_models
1423 |                     + [required_model_name]
1424 |                 )
1425 |             )
1426 |         # must be stacked ensemble
1427 |         stacked_ensemble_models = self.get_ensemble_models("Stacked_Ensemble")
1428 |         return list(
1429 |             np.unique(
1430 |                 ensemble_models
1431 |                 + ["Ensemble"]
1432 |                 + stacked_models
1433 |                 + stacked_ensemble_models
1434 |                 + [required_model_name]
1435 |             )
1436 |         )
1437 | 
1438 |     def _base_predict(self, X, model=None):
1439 |         if model is None:
1440 |             if self._best_model is None:
1441 |                 self.load(self.results_path)
1442 |             model = self._best_model
1443 | 
1444 |         if model is None:
1445 |             raise AutoMLException(
1446 |                 "This model has not been fitted yet. Please call `fit()` first."
1447 |             )
1448 | 
1449 |         X = self._build_dataframe(X)
1450 |         if not isinstance(X.columns[0], str):
1451 |             X.columns = [str(c) for c in X.columns]
1452 | 
1453 |         input_columns = X.columns.tolist()
1454 |         for column in self._data_info["columns"]:
1455 |             if column not in input_columns:
1456 |                 raise AutoMLException(
1457 |                     f"Missing column: {column} in input data. Cannot predict"
1458 |                 )
1459 | 
1460 |         X = X[self._data_info["columns"]]
1461 |         self._validate_X_predict(X)
1462 | 
1463 |         # is stacked model
1464 |         if model._is_stacked:
1465 |             self._perform_model_stacking()
1466 |             X_stacked = self.get_stacked_data(X, mode="predict")
1467 | 
1468 |             if model.get_type() == "Ensemble":
1469 |                 # Ensemble is using both original and stacked data
1470 |                 predictions = model.predict(X, X_stacked)
1471 |             else:
1472 |                 predictions = model.predict(X_stacked)
1473 |         else:
1474 |             predictions = model.predict(X)
1475 | 
1476 |         if self._ml_task == BINARY_CLASSIFICATION:
1477 |             # need to predict the label based on predictions and threshold
1478 |             neg_label, pos_label = (
1479 |                 predictions.columns[0][11:],
1480 |                 predictions.columns[1][11:],
1481 |             )
1482 | 
1483 |             if neg_label == "0" and pos_label == "1":
1484 |                 neg_label, pos_label = 0, 1
1485 |             target_is_numeric = self._data_info.get("target_is_numeric", False)
1486 |             if target_is_numeric:
1487 |                 neg_label = int(neg_label)
1488 |                 pos_label = int(pos_label)
1489 |             # assume that it is binary classification
1490 |             predictions["label"] = predictions.iloc[:, 1] > model._threshold
1491 |             predictions["label"] = predictions["label"].map(
1492 |                 {True: pos_label, False: neg_label}
1493 |             )
1494 |             return predictions
1495 |         elif self._ml_task == MULTICLASS_CLASSIFICATION:
1496 |             target_is_numeric = self._data_info.get("target_is_numeric", False)
1497 |             if target_is_numeric:
1498 |                 try:
1499 |                     predictions["label"] = predictions["label"].astype(int)
1500 |                 except Exception as e:
1501 |                     predictions["label"] = predictions["label"].astype(float)
1502 |             return predictions
1503 |         # Regression
1504 |         else:
1505 |             return predictions
1506 | 
1507 |     def _predict(self, X):
1508 |         predictions = self._base_predict(X)
1509 |         # Return predictions
1510 |         # If classification task the result is in column 'label'
1511 |         # If regression task the result is in column 'prediction'
1512 |         return (
1513 |             predictions["label"].to_numpy()
1514 |             if self._ml_task != REGRESSION
1515 |             else predictions["prediction"].to_numpy()
1516 |         )
1517 | 
1518 |     def _predict_proba(self, X):
1519 |         # Check is task type is correct
1520 |         if self._ml_task == REGRESSION:
1521 |             raise AutoMLException(
1522 |                 f"Method `predict_proba()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
1523 |             )
1524 | 
1525 |         # Make and return predictions
1526 |         # If classification task the result is in column 'label'
1527 |         # Need to drop `label` column.
1528 |         return self._base_predict(X).drop(["label"], axis=1).to_numpy()
1529 | 
1530 |     def _predict_all(self, X):
1531 |         # Make and return predictions
1532 |         return self._base_predict(X)
1533 | 
1534 |     def _score(self, X, y=None, sample_weight=None):
1535 |         # y default must be None for scikit-learn compatibility
1536 | 
1537 |         # Check if y is None
1538 |         if y is None:
1539 |             raise AutoMLException("y must be specified.")
1540 | 
1541 |         predictions = self._predict(X)
1542 |         return (
1543 |             r2_score(y, predictions, sample_weight=sample_weight)
1544 |             if self._ml_task == REGRESSION
1545 |             else accuracy_score(y, predictions, sample_weight=sample_weight)
1546 |         )
1547 | 
1548 |     def _get_mode(self):
1549 |         """Gets the current mode"""
1550 |         self._validate_mode()
1551 |         return deepcopy(self.mode)
1552 | 
1553 |     def _get_ml_task(self):
1554 |         """Gets the current ml_task. If "auto" it is determined"""
1555 |         self._validate_ml_task()
1556 |         if self.ml_task == "auto":
1557 |             classes_number = self.n_classes
1558 |             if classes_number == 2:
1559 |                 self._estimator_type = "classifier"  # for sk-learn api
1560 |                 return BINARY_CLASSIFICATION
1561 |             elif classes_number <= 20:
1562 |                 self._estimator_type = "classifier"  # for sk-learn api
1563 |                 return MULTICLASS_CLASSIFICATION
1564 |             else:
1565 |                 self._estimator_type = "regressor"  # for sk-learn api
1566 |                 return REGRESSION
1567 |         else:
1568 |             return deepcopy(self.ml_task)
1569 | 
1570 |     def _get_results_path(self):
1571 |         """Gets the current results_path"""
1572 |         # if we already have the results path set, please return it
1573 |         if self._results_path is not None:
1574 |             return self._results_path
1575 | 
1576 |         self._validate_results_path()
1577 | 
1578 |         path = self.results_path
1579 | 
1580 |         if path is None:
1581 |             for i in range(1, 10001):
1582 |                 name = f"AutoML_{i}"
1583 |                 if not os.path.exists(name):
1584 |                     self.create_dir(name)
1585 |                     self._results_path = name
1586 |                     return name
1587 |             # If it got here, could not create, raise expection
1588 |             raise AutoMLException("Cannot create directory for AutoML results")
1589 |         elif os.path.exists(self.results_path) and os.path.exists(
1590 |             os.path.join(self.results_path, "params.json")
1591 |         ):  # AutoML already loaded, return path
1592 |             self._results_path = path
1593 |             return path
1594 |         # Dir does not exist, create it
1595 |         elif not os.path.exists(path):
1596 |             self.create_dir(path)
1597 |             self._results_path = path
1598 |             return path
1599 |         # Dir exists and is empty, use it
1600 |         elif os.path.exists(path) and not len(os.listdir(path)):
1601 |             self._results_path = path
1602 |             return path
1603 |         elif os.path.exists(path) and len(os.listdir(path)):
1604 |             raise AutoMLException(
1605 |                 f"Cannot set directory for AutoML. Directory '{path}' is not empty."
1606 |             )
1607 | 
1608 |         raise AutoMLException("Cannot set directory for AutoML results")
1609 | 
1610 |     def _get_total_time_limit(self):
1611 |         """Gets the current total_time_limit"""
1612 |         self._validate_total_time_limit()
1613 |         if self._get_mode() == "Optuna":
1614 |             return None  # there no training limit for model in the Optuna mode
1615 |             # just train and be happy with super models :)
1616 |         return deepcopy(self.total_time_limit)
1617 | 
1618 |     def _get_model_time_limit(self):
1619 |         """Gets the current model_time_limit"""
1620 |         self._validate_model_time_limit()
1621 |         return deepcopy(self.model_time_limit)
1622 | 
1623 |     def _get_algorithms(self):
1624 |         """Gets the current algorithms. If "auto" it is determined"""
1625 |         self._validate_algorithms()
1626 |         if self.algorithms == "auto":
1627 |             if self._get_mode() == "Explain":
1628 |                 return [
1629 |                     "Baseline",
1630 |                     "Linear",
1631 |                     "Decision Tree",
1632 |                     "Random Forest",
1633 |                     "Xgboost",
1634 |                     "Neural Network",
1635 |                 ]
1636 |             if self._get_mode() == "Perform":
1637 |                 return [
1638 |                     "Linear",
1639 |                     "Random Forest",
1640 |                     "LightGBM",
1641 |                     "Xgboost",
1642 |                     "CatBoost",
1643 |                     "Neural Network",
1644 |                 ]
1645 |             if self._get_mode() == "Compete":
1646 |                 return [
1647 |                     "Decision Tree",
1648 |                     "Linear",
1649 |                     "Random Forest",
1650 |                     "Extra Trees",
1651 |                     "LightGBM",
1652 |                     "Xgboost",
1653 |                     "CatBoost",
1654 |                     "Neural Network",
1655 |                     "Nearest Neighbors",
1656 |                 ]
1657 |             if self._get_mode() == "Optuna":
1658 |                 return [
1659 |                     "Random Forest",
1660 |                     "Extra Trees",
1661 |                     "LightGBM",
1662 |                     "Xgboost",
1663 |                     "CatBoost",
1664 |                     "Neural Network",
1665 |                 ]
1666 |         else:
1667 |             return deepcopy(self.algorithms)
1668 | 
1669 |     def _get_train_ensemble(self):
1670 |         """Gets the current train_ensemble"""
1671 |         self._validate_train_ensemble()
1672 |         return deepcopy(self.train_ensemble)
1673 | 
1674 |     def _get_stack_models(self):
1675 |         """Gets the current stack_models"""
1676 |         self._validate_stack_models()
1677 |         if self.stack_models == "auto":
1678 |             val = self._get_validation_strategy()
1679 |             if val.get("validation_type", "") == "custom":
1680 |                 return False
1681 |             return True if self.mode in ["Compete", "Optuna"] else False
1682 |         else:
1683 |             return deepcopy(self.stack_models)
1684 | 
1685 |     def _get_eval_metric(self):
1686 |         """Gets the current eval_metric"""
1687 |         self._validate_eval_metric()
1688 |         if isinstance(self.eval_metric, types.FunctionType):
1689 |             UserDefinedEvalMetric().set_metric(self.eval_metric)
1690 |             return "user_defined_metric"
1691 | 
1692 |         if self.eval_metric == "auto":
1693 |             if self._get_ml_task() == BINARY_CLASSIFICATION:
1694 |                 return "logloss"
1695 |             elif self._get_ml_task() == MULTICLASS_CLASSIFICATION:
1696 |                 return "logloss"
1697 |             elif self._get_ml_task() == REGRESSION:
1698 |                 return "rmse"
1699 |         else:
1700 |             return deepcopy(self.eval_metric)
1701 | 
1702 |     def _get_validation_strategy(self):
1703 |         """Gets the current validation_strategy"""
1704 |         strat = {}
1705 |         self._validate_validation_strategy()
1706 |         if self.validation_strategy == "auto":
1707 |             if self._get_mode() == "Explain":
1708 |                 strat = {
1709 |                     "validation_type": "split",
1710 |                     "train_ratio": 0.75,
1711 |                     "shuffle": True,
1712 |                     "stratify": True,
1713 |                 }
1714 |             elif self._get_mode() == "Perform":
1715 |                 strat = {
1716 |                     "validation_type": "kfold",
1717 |                     "k_folds": 5,
1718 |                     "shuffle": True,
1719 |                     "stratify": True,
1720 |                 }
1721 |             elif self._get_mode() in ["Compete", "Optuna"]:
1722 |                 strat = {
1723 |                     "validation_type": "kfold",
1724 |                     "k_folds": 10,
1725 |                     "shuffle": True,
1726 |                     "stratify": True,
1727 |                 }
1728 |             if self._get_ml_task() == REGRESSION:
1729 |                 if "stratify" in strat:
1730 |                     # it's better to always check
1731 |                     # before delete (trust me)
1732 |                     del strat["stratify"]
1733 |             return strat
1734 |         else:
1735 |             strat = deepcopy(self.validation_strategy)
1736 |             if self._get_ml_task() == REGRESSION:
1737 |                 if "stratify" in strat:
1738 |                     del strat["stratify"]
1739 |             return strat
1740 | 
1741 |     def _get_verbose(self):
1742 |         """Gets the current verbose"""
1743 |         self._validate_verbose()
1744 |         return deepcopy(self.verbose)
1745 | 
1746 |     def _get_explain_level(self):
1747 |         """Gets the current explain_level"""
1748 |         self._validate_explain_level()
1749 |         if self.explain_level == "auto":
1750 |             if self._get_mode() == "Explain":
1751 |                 return 2
1752 |             if self._get_mode() == "Perform":
1753 |                 return 1
1754 |             if self._get_mode() == "Compete":
1755 |                 return 0
1756 |             if self._get_mode() == "Optuna":
1757 |                 return 0
1758 |         else:
1759 |             return deepcopy(self.explain_level)
1760 | 
1761 |     def _get_golden_features(self):
1762 |         self._validate_golden_features()
1763 |         if self.golden_features == "auto":
1764 |             if self._get_mode() == "Explain":
1765 |                 return False
1766 |             if self._get_mode() == "Perform":
1767 |                 return True
1768 |             if self._get_mode() == "Compete":
1769 |                 return True
1770 |             if self._get_mode() == "Optuna":
1771 |                 return False
1772 |         else:
1773 |             return deepcopy(self.golden_features)
1774 | 
1775 |     def _get_features_selection(self):
1776 |         """Gets the current features_selection"""
1777 |         self._validate_features_selection()
1778 |         if self.features_selection == "auto":
1779 |             if self._get_mode() == "Explain":
1780 |                 return False
1781 |             if self._get_mode() == "Perform":
1782 |                 return True
1783 |             if self._get_mode() == "Compete":
1784 |                 return True
1785 |             if self._get_mode() == "Optuna":
1786 |                 return False
1787 |         else:
1788 |             return deepcopy(self.features_selection)
1789 | 
1790 |     def _get_start_random_models(self):
1791 |         """Gets the current start_random_models"""
1792 |         self._validate_start_random_models()
1793 |         if self.start_random_models == "auto":
1794 |             if self._get_mode() == "Explain":
1795 |                 return 1
1796 |             if self._get_mode() == "Perform":
1797 |                 return 5
1798 |             if self._get_mode() == "Compete":
1799 |                 return 10
1800 |             if self._get_mode() == "Optuna":
1801 |                 return 1  # just 1, because it will be tuned by Optuna
1802 |         else:
1803 |             return deepcopy(self.start_random_models)
1804 | 
1805 |     def _get_hill_climbing_steps(self):
1806 |         """Gets the current hill_climbing_steps"""
1807 |         self._validate_hill_climbing_steps()
1808 |         if self.hill_climbing_steps == "auto":
1809 |             if self._get_mode() == "Explain":
1810 |                 return 0
1811 |             if self._get_mode() == "Perform":
1812 |                 return 2
1813 |             if self._get_mode() == "Compete":
1814 |                 return 2
1815 |             if self._get_mode() == "Optuna":
1816 |                 return 0  # all tuning is done in Optuna
1817 |         else:
1818 |             return deepcopy(self.hill_climbing_steps)
1819 | 
1820 |     def _get_top_models_to_improve(self):
1821 |         """Gets the current top_models_to_improve"""
1822 |         self._validate_top_models_to_improve()
1823 |         if self.top_models_to_improve == "auto":
1824 |             if self._get_mode() == "Explain":
1825 |                 return 0
1826 |             if self._get_mode() == "Perform":
1827 |                 return 2
1828 |             if self._get_mode() == "Compete":
1829 |                 return 3
1830 |             if self._get_mode() == "Optuna":
1831 |                 return 0
1832 |         else:
1833 |             return deepcopy(self.top_models_to_improve)
1834 | 
1835 |     def _get_boost_on_errors(self):
1836 |         """Gets the current boost_on_errors"""
1837 |         self._validate_boost_on_errors()
1838 |         if self.boost_on_errors == "auto":
1839 |             val = self._get_validation_strategy()
1840 |             if val.get("validation_type", "") == "custom":
1841 |                 return False
1842 |             if self._get_mode() == "Explain":
1843 |                 return False
1844 |             if self._get_mode() == "Perform":
1845 |                 return False
1846 |             if self._get_mode() == "Compete":
1847 |                 return True
1848 |             if self._get_mode() == "Optuna":
1849 |                 return False
1850 |         else:
1851 |             return deepcopy(self.boost_on_errors)
1852 | 
1853 |     def _get_kmeans_features(self):
1854 |         """Gets the current kmeans_features"""
1855 |         self._validate_kmeans_features()
1856 |         if self.kmeans_features == "auto":
1857 |             if self._get_mode() == "Explain":
1858 |                 return False
1859 |             if self._get_mode() == "Perform":
1860 |                 return False
1861 |             if self._get_mode() == "Compete":
1862 |                 return True
1863 |             if self._get_mode() == "Optuna":
1864 |                 return False
1865 |         else:
1866 |             return deepcopy(self.kmeans_features)
1867 | 
1868 |     def _get_mix_encoding(self):
1869 |         """Gets the current mix_encoding"""
1870 |         self._validate_mix_encoding()
1871 |         if self.mix_encoding == "auto":
1872 |             if self._get_mode() == "Explain":
1873 |                 return False
1874 |             if self._get_mode() == "Perform":
1875 |                 return False
1876 |             if self._get_mode() == "Compete":
1877 |                 return True
1878 |             if self._get_mode() == "Optuna":
1879 |                 return False
1880 |         else:
1881 |             return deepcopy(self.mix_encoding)
1882 | 
1883 |     def _get_max_single_prediction_time(self):
1884 |         """Gets the current max_single_prediction_time"""
1885 |         self._validate_max_single_prediction_time()
1886 |         if self.max_single_prediction_time is None:
1887 |             if self._get_mode() == "Perform":
1888 |                 return 0.5  # prediction time should be under 0.5 second
1889 |             return None
1890 |         else:
1891 |             return deepcopy(self.max_single_prediction_time)
1892 | 
1893 |     def _get_optuna_time_budget(self):
1894 |         """Gets the current optuna_time_budget"""
1895 |         self._validate_optuna_time_budget()
1896 | 
1897 |         if self.optuna_time_budget is None:
1898 |             if self._get_mode() == "Optuna":
1899 |                 return 3600
1900 |             return None
1901 |         else:
1902 |             if self._get_mode() != "Optuna":
1903 |                 # use only for mode Optuna
1904 |                 return None
1905 |             return deepcopy(self.optuna_time_budget)
1906 | 
1907 |     def _get_optuna_init_params(self):
1908 |         """Gets the current optuna_init_params"""
1909 |         self._validate_optuna_init_params()
1910 |         if self._get_mode() != "Optuna":
1911 |             # use only for mode Optuna
1912 |             return {}
1913 |         return deepcopy(self.optuna_init_params)
1914 | 
1915 |     def _get_optuna_verbose(self):
1916 |         """Gets the current optuna_verbose"""
1917 |         self._validate_optuna_verbose()
1918 |         # use only for mode Optuna
1919 |         if self._get_mode() != "Optuna":
1920 |             return True
1921 |         return deepcopy(self.optuna_verbose)
1922 | 
1923 |     def _get_n_jobs(self):
1924 |         """Gets the current n_jobs"""
1925 |         self._validate_n_jobs()
1926 |         return deepcopy(self.n_jobs)
1927 | 
1928 |     def _get_random_state(self):
1929 |         """Gets the current random_state"""
1930 |         self._validate_random_state()
1931 |         return deepcopy(self.random_state)
1932 | 
1933 |     def _validate_mode(self):
1934 |         """Validates mode parameter"""
1935 |         valid_modes = ["Explain", "Perform", "Compete", "Optuna"]
1936 |         if self.mode not in valid_modes:
1937 |             raise ValueError(
1938 |                 f"Expected 'mode' to be {' or '.join(valid_modes)}, got '{self.mode}'"
1939 |             )
1940 | 
1941 |     def _validate_ml_task(self):
1942 |         """Validates ml_task parameter"""
1943 |         if isinstance(self.ml_task, str) and self.ml_task == "auto":
1944 |             return
1945 | 
1946 |         if self.ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
1947 |             raise ValueError(
1948 |                 f"Expected 'ml_task' to be {' or '.join(AlgorithmsRegistry.get_supported_ml_tasks())}, got '{self.ml_task}''"
1949 |             )
1950 | 
1951 |     def _validate_results_path(self):
1952 |         """Validates path parameter"""
1953 |         if self.results_path is None or isinstance(self.results_path, str):
1954 |             return
1955 | 
1956 |         raise ValueError(
1957 |             f"Expected 'results_path' to be of type string, got '{type(self.results_path)}''"
1958 |         )
1959 | 
1960 |     def _validate_total_time_limit(self):
1961 |         """Validates total_time_limit parameter"""
1962 |         if self.total_time_limit is None:
1963 |             return
1964 |         if self.total_time_limit is not None:
1965 |             check_greater_than_zero_integer(self.total_time_limit, "total_time_limit")
1966 | 
1967 |     def _validate_model_time_limit(self):
1968 |         """Validates model_time_limit parameter"""
1969 |         if self.model_time_limit is not None:
1970 |             check_greater_than_zero_integer(self.model_time_limit, "model_time_limit")
1971 | 
1972 |     def _validate_algorithms(self):
1973 |         """Validates algorithms parameter"""
1974 |         if isinstance(self.algorithms, str) and self.algorithms == "auto":
1975 |             return
1976 | 
1977 |         for algo in self.algorithms:
1978 |             if algo not in list(AlgorithmsRegistry.registry[self._ml_task].keys()):
1979 |                 raise ValueError(
1980 |                     f"The algorithm {algo} is not allowed to use for ML task: {self._ml_task}. Allowed algorithms: {list(AlgorithmsRegistry.registry[self._ml_task].keys())}"
1981 |                 )
1982 | 
1983 |     def _validate_train_ensemble(self):
1984 |         """Validates train_ensemble parameter"""
1985 |         # `train_ensemble` defaults to True, no further checking required
1986 |         check_bool(self.train_ensemble, "train_ensemble")
1987 | 
1988 |     def _validate_stack_models(self):
1989 |         """Validates stack_models parameter"""
1990 |         # `stack_models` defaults to "auto". If "auto" return, else check if is valid bool
1991 |         if isinstance(self.stack_models, str) and self.stack_models == "auto":
1992 |             return
1993 | 
1994 |         check_bool(self.stack_models, "stack_models")
1995 | 
1996 |     def _validate_eval_metric(self):
1997 |         """Validates eval_metric parameter"""
1998 |         if isinstance(self.eval_metric, types.FunctionType):
1999 |             return
2000 | 
2001 |         if isinstance(self.eval_metric, str) and self.eval_metric == "auto":
2002 |             return
2003 | 
2004 |         if (self._get_ml_task() == BINARY_CLASSIFICATION) and self.eval_metric not in [
2005 |             "logloss",
2006 |             "auc",
2007 |             "f1",
2008 |             "average_precision",
2009 |             "accuracy",
2010 |         ]:
2011 |             raise ValueError(
2012 |                 f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
2013 |                     Use 'logloss', 'auc', 'f1', 'average_precision', or 'accuracy'"
2014 |             )
2015 | 
2016 |         elif (
2017 |             self._get_ml_task() == MULTICLASS_CLASSIFICATION
2018 |         ) and self.eval_metric not in ["logloss", "f1", "accuracy"]:
2019 |             raise ValueError(
2020 |                 f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
2021 |                     Use 'logloss', 'f1', or 'accuracy'"
2022 |             )
2023 | 
2024 |         elif self._get_ml_task() == REGRESSION and self.eval_metric not in [
2025 |             "rmse",
2026 |             "mse",
2027 |             "mae",
2028 |             "r2",
2029 |             "mape",
2030 |             "spearman",
2031 |             "pearson",
2032 |         ]:
2033 |             raise ValueError(
2034 |                 f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
2035 |                 Use 'rmse', 'mse', 'mae', 'r2', 'mape', 'spearman', or 'pearson'"
2036 |             )
2037 | 
2038 |     def _validate_validation_strategy(self):
2039 |         """Validates validation parameter"""
2040 |         if (
2041 |             isinstance(self.validation_strategy, str)
2042 |             and self.validation_strategy == "auto"
2043 |         ):
2044 |             return
2045 | 
2046 |         # only validation_type is mandatory
2047 |         # other parameters of validations
2048 |         # have defaults set in their constructors
2049 |         required_keys = ["validation_type"]
2050 |         if type(self.validation_strategy) is not dict:
2051 |             raise ValueError(
2052 |                 f"Expected 'validation_strategy' to be a dict, got '{type(self.validation_strategy)}'"
2053 |             )
2054 |         if not all(key in self.validation_strategy for key in required_keys):
2055 |             raise ValueError(f"Expected dict with keys: {' , '.join(required_keys)}")
2056 | 
2057 |     def _validate_verbose(self):
2058 |         """Validates verbose parameter"""
2059 |         check_positive_integer(self.verbose, "verbose")
2060 | 
2061 |     def _validate_explain_level(self):
2062 |         """Validates explain_level parameter"""
2063 |         if isinstance(self.explain_level, str) and self.explain_level == "auto":
2064 |             return
2065 |         valid_explain_levels = [0, 1, 2]
2066 |         # Check if explain level is 0 or greater integer
2067 |         if not (
2068 |             isinstance(self.explain_level, int)
2069 |             and self.explain_level in valid_explain_levels
2070 |         ):
2071 |             raise ValueError(
2072 |                 f"Expected 'explain_level' to be {' or '.join([str(x) for x in valid_explain_levels])}, got '{self.explain_level}'"
2073 |             )
2074 | 
2075 |     def _validate_golden_features(self):
2076 |         """Validates golden_features parameter"""
2077 |         if isinstance(self.golden_features, str) and self.golden_features == "auto":
2078 |             return
2079 |         if isinstance(self.golden_features, int):
2080 |             return
2081 |         check_bool(self.golden_features, "golden_features")
2082 | 
2083 |     def _validate_features_selection(self):
2084 |         """Validates features_selection parameter"""
2085 |         if (
2086 |             isinstance(self.features_selection, str)
2087 |             and self.features_selection == "auto"
2088 |         ):
2089 |             return
2090 |         check_bool(self.features_selection, "features_selection")
2091 | 
2092 |     def _validate_start_random_models(self):
2093 |         """Validates start_random_models parameter"""
2094 |         if (
2095 |             isinstance(self.start_random_models, str)
2096 |             and self.start_random_models == "auto"
2097 |         ):
2098 |             return
2099 |         check_greater_than_zero_integer(self.start_random_models, "start_random_models")
2100 | 
2101 |     def _validate_hill_climbing_steps(self):
2102 |         """Validates hill_climbing_steps parameter"""
2103 |         if (
2104 |             isinstance(self.hill_climbing_steps, str)
2105 |             and self.hill_climbing_steps == "auto"
2106 |         ):
2107 |             return
2108 |         check_positive_integer(self.hill_climbing_steps, "hill_climbing_steps")
2109 | 
2110 |     def _validate_top_models_to_improve(self):
2111 |         """Validates top_models_to_improve parameter"""
2112 |         if (
2113 |             isinstance(self.top_models_to_improve, str)
2114 |             and self.top_models_to_improve == "auto"
2115 |         ):
2116 |             return
2117 |         check_positive_integer(self.top_models_to_improve, "top_models_to_improve")
2118 | 
2119 |     def _validate_boost_on_errors(self):
2120 |         """Validates boost_on_errors parameter"""
2121 |         if isinstance(self.boost_on_errors, str) and self.boost_on_errors == "auto":
2122 |             return
2123 |         check_bool(self.boost_on_errors, "boost_on_errors")
2124 | 
2125 |     def _validate_kmeans_features(self):
2126 |         """Validates kmeans_features parameter"""
2127 |         if isinstance(self.kmeans_features, str) and self.kmeans_features == "auto":
2128 |             return
2129 |         check_bool(self.kmeans_features, "kmeans_features")
2130 | 
2131 |     def _validate_mix_encoding(self):
2132 |         """Validates mix_encoding parameter"""
2133 |         if isinstance(self.mix_encoding, str) and self.mix_encoding == "auto":
2134 |             return
2135 |         check_bool(self.mix_encoding, "mix_encoding")
2136 | 
2137 |     def _validate_max_single_prediction_time(self):
2138 |         """Validates max_single_prediction_time parameter"""
2139 |         if self.max_single_prediction_time is None:
2140 |             return
2141 |         check_greater_than_zero_integer_or_float(
2142 |             self.max_single_prediction_time, "max_single_prediction_time"
2143 |         )
2144 | 
2145 |     def _validate_optuna_time_budget(self):
2146 |         """Validates optuna_time_budget parameter"""
2147 |         if self.optuna_time_budget is None:
2148 |             return
2149 |         check_greater_than_zero_integer(self.optuna_time_budget, "optuna_time_budget")
2150 | 
2151 |     def _validate_optuna_init_params(self):
2152 |         """Validates optuna_init_params parameter"""
2153 |         if self.optuna_init_params is None:
2154 |             return
2155 |         if type(self.optuna_init_params) is not dict:
2156 |             raise ValueError(
2157 |                 f"Expected 'optuna_init_params' to be a dict, got '{type(self.optuna_init_params)}'"
2158 |             )
2159 | 
2160 |     def _validate_optuna_verbose(self):
2161 |         """Validates optuna_verbose parameter"""
2162 |         if self.optuna_verbose is None:
2163 |             return
2164 |         check_bool(self.optuna_verbose, "optuna_verbose")
2165 | 
2166 |     def _validate_n_jobs(self):
2167 |         """Validates mix_encoding parameter"""
2168 |         check_integer(self.n_jobs, "n_jobs")
2169 | 
2170 |     def _validate_random_state(self):
2171 |         """Validates random_state parameter"""
2172 |         check_positive_integer(self.random_state, "random_state")
2173 | 
2174 |     def _validate_fairness_metric(self):
2175 |         """Validates fariness_metric parameter"""
2176 |         if isinstance(self.fairness_metric, str) and self.fairness_metric == "auto":
2177 |             return
2178 | 
2179 |         if (
2180 |             self._get_ml_task() in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]
2181 |         ) and self.fairness_metric not in [
2182 |             "demographic_parity_difference",
2183 |             "demographic_parity_ratio",
2184 |             "equalized_odds_difference",
2185 |             "equalized_odds_ratio",
2186 |         ]:
2187 |             raise ValueError(
2188 |                 f"Metric {self.fairness_metric} is not allowed in ML task: {self._get_ml_task()}. \
2189 |                     Use `demographic_parity_difference`, `demographic_parity_ratio`, `equalized_odds_difference` or `equalized_odds_ratio`"
2190 |             )
2191 |         if (self._get_ml_task() == REGRESSION) and self.fairness_metric not in [
2192 |             "group_loss_difference",
2193 |             "group_loss_ratio",
2194 |         ]:
2195 |             raise ValueError(
2196 |                 f"Metric {self.fairness_metric} is not allowed in ML task: {self._get_ml_task()}. \
2197 |                     Use `group_loss`"
2198 |             )
2199 | 
2200 |     def _get_fairness_metric(self):
2201 |         """Gets the fairness metric"""
2202 |         self._validate_fairness_metric()
2203 |         if self.fairness_metric == "auto":
2204 |             if self._get_ml_task() == BINARY_CLASSIFICATION:
2205 |                 return "demographic_parity_ratio"
2206 |             if self._get_ml_task() == REGRESSION:
2207 |                 return "group_loss_ratio"
2208 |             if self._get_ml_task() == MULTICLASS_CLASSIFICATION:
2209 |                 return "demographic_parity_ratio"
2210 |         else:
2211 |             return deepcopy(self.fairness_metric)
2212 | 
2213 |     def _get_fairness_threshold(self):
2214 |         """Gets the fairness threshold"""
2215 |         if self.fairness_threshold == "auto":
2216 |             if self._get_ml_task() in [
2217 |                 BINARY_CLASSIFICATION,
2218 |                 MULTICLASS_CLASSIFICATION,
2219 |             ]:
2220 |                 thresholds = {
2221 |                     "demographic_parity_difference": 0.1,
2222 |                     "demographic_parity_ratio": 0.8,
2223 |                     "equalized_odds_difference": 0.1,
2224 |                     "equalized_odds_ratio": 0.8,
2225 |                 }
2226 |                 return thresholds.get(self._fairness_metric, 0.8)
2227 |             elif self._get_ml_task() == REGRESSION:
2228 |                 thresholds = {
2229 |                     "group_loss_ratio": 0.8,
2230 |                 }
2231 |                 if self._fairness_metric == "group_loss_difference":
2232 |                     raise AutoMLException(
2233 |                         "We can't set default fairness threshold value. Please set `fairness_threshold` value in AutoML constructor."
2234 |                     )
2235 |                 return thresholds.get(self._fairness_metric, 0.8)
2236 |         else:
2237 |             return deepcopy(self.fairness_threshold)
2238 | 
2239 |     def _get_privileged_groups(self):
2240 |         """Gets privileged groups for fair training"""
2241 |         if self.privileged_groups == "auto":
2242 |             return []
2243 |         else:
2244 |             return deepcopy(self.privileged_groups)
2245 | 
2246 |     def _get_underprivileged_groups(self):
2247 |         """Gets underprivileged groups for fair training"""
2248 |         if self.underprivileged_groups == "auto":
2249 |             return []
2250 |         else:
2251 |             return deepcopy(self.underprivileged_groups)
2252 | 
2253 |     def to_json(self):
2254 |         if self._best_model is None:
2255 |             return None
2256 | 
2257 |         return {
2258 |             "best_model": self._best_model.to_json(),
2259 |             "threshold": self._threshold,
2260 |             "ml_task": self._ml_task,
2261 |         }
2262 | 
2263 |     def from_json(self, json_data):
2264 |         if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
2265 |             self._best_model = Ensemble()
2266 |             self._best_model.from_json(json_data["best_model"])
2267 |         else:
2268 |             self._best_model = ModelFramework(json_data["best_model"].get("params"))
2269 |             self._best_model.from_json(json_data["best_model"])
2270 |         self._threshold = json_data.get("threshold")
2271 | 
2272 |         self._ml_task = json_data.get("ml_task")
2273 | 
    # CSS embedded into the <style> element of the generated HTML report.
    # The literal is an f-string without placeholders, so every CSS brace is
    # written doubled ({{ / }}) and comes out as a single brace at runtime.
    # The `.mljar-automl-report` rule is closed only after the `a:hover` rule,
    # i.e. the element rules between are nested inside it.
    # NOTE(review): the nested rules rely on CSS nesting support in the
    # rendering browser - confirm this is intended.
    report_style = f"""
.styled-table {{
    border-collapse: collapse;
    font-size: 0.9em;
    font-family: Courier New;
}}

.styled-table td, .styled-table th {{
    border: 1px solid #ddd;
    padding: 8px;
}}

.styled-table tr:nth-child(even){{background-color: #f2f2f2;}}

.styled-table tr:hover {{background-color: #e0ecf5;}}

.styled-table thead {{
    padding-top: 6px;
    padding-bottom: 6px;
    text-align: left;
    background-color: #0099cc;
    color: white;
}}

.mljar-automl-report {{
    font-family: ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
    background-color: rgba(236, 243, 249, 0.15);


h1 {{
    color: #004666;
    border-bottom: 1px solid rgba(0,70,102,0.3)
}}
h2 {{
    color: #004666;
    padding-bottom: 5px;
    margin-bottom: 0px;
}}

ul {{
    margin-top: 0px;
}}

p {{
    margin-top: 5px;
}}

h3 {{
    color: #004666;
    padding-bottom: 5px;
    margin-bottom: 0px;
}}
a {{
    font-weight: bold;
    color: #004666;
}}

a:hover {{
    cursor: pointer;
    color: #0099CC;
}}
}}

"""
2338 | 
2339 |     def _md_to_html(self, md_fname, page_type, dir_path, me=None):
2340 |         import base64
2341 | 
2342 |         import markdown
2343 | 
2344 |         if not os.path.exists(md_fname):
2345 |             return None
2346 |         content = ""
2347 |         with open(md_fname) as fin:
2348 |             content = fin.read()
2349 | 
2350 |         content = content.replace("README.md", "README.html")
2351 |         content_html = markdown.markdown(
2352 |             content, extensions=["markdown.extensions.tables"]
2353 |         )
2354 |         content_html = content_html.replace("<img ", '<img style="width:750px" ')
2355 |         content_html = content_html.replace("<table>", '<table class="styled-table">')
2356 |         content_html = content_html.replace("<tr>", '<tr style="text-align: right;">')
2357 | 
2358 |         # replace png figures to base64
2359 |         for f in os.listdir(dir_path):
2360 |             if ".png" in f:
2361 |                 encoded_string = ""
2362 |                 with open(os.path.join(dir_path, f), "rb") as image_file:
2363 |                     encoded_string = base64.b64encode(image_file.read())
2364 |                     encoded_string = encoded_string.decode("utf-8")
2365 |                 encoded_figure = f"data:image/png;base64, {encoded_string}"
2366 |                 content_html = content_html.replace(f, encoded_figure)
2367 | 
2368 |         # insert svg figures
2369 |         for f in os.listdir(dir_path):
2370 |             if ".svg" in f:
2371 |                 with open(os.path.join(dir_path, f), "rb") as image_file:
2372 |                     svg_plot = image_file.read()
2373 |                     svg_plot = svg_plot.decode("utf-8")
2374 | 
2375 |                 arr = content_html.split("\n")
2376 |                 new_content = []
2377 |                 for i in arr:
2378 |                     if f in i:
2379 |                         new_content += [f"<p>{svg_plot}</p>"]
2380 |                     else:
2381 |                         new_content += [i]
2382 |                 content_html = "\n".join(new_content)
2383 | 
2384 |         # change links
2385 |         if page_type == f"automl-report-main-{self._id}":
2386 |             for f in os.listdir(dir_path):
2387 |                 if os.path.exists(os.path.join(dir_path, f, "README.md")):
2388 |                     old = f'href="{f}/README.html"'
2389 |                     new = f"onclick=\"toggleShow('{f}-{self._id}');toggleShow('automl-report-main-{self._id}')\" "
2390 |                     content_html = content_html.replace(old, new)
2391 | 
2392 |         # other links
2393 |         if me is not None:
2394 |             old = 'href="../README.html"'
2395 |             new = f"onclick=\"toggleShow('{me}-{self._id}');toggleShow('automl-report-main-{self._id}')\" "
2396 |             content_html = content_html.replace(old, new)
2397 | 
2398 |         beginning = ""
2399 | 
2400 |         if page_type == f"automl-report-main-{self._id}":
2401 |             beginning += """<img src="https://raw.githubusercontent.com/mljar/visual-identity/main/media/mljar_AutomatedML.png" style="height:128px; margin-left: auto;
2402 | margin-right: auto;display: block;"/>\n\n"""
2403 |             if os.path.exists(os.path.join(self._results_path, "optuna/README.md")):
2404 |                 beginning += f"<h2><a onclick=\"toggleShow('optuna');toggleShow('automl-report-main-{self._id}')\" >&#187; Optuna Params Tuning Report</a></h2>"
2405 | 
2406 |         content_html = beginning + content_html
2407 | 
2408 |         return content_html
2409 | 
2410 |     def _show_report(self, main_readme_html, width=900, height=1200):
2411 |         from IPython.display import HTML, IFrame
2412 | 
2413 |         if os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is None:
2414 |             with open(main_readme_html) as fin:
2415 |                 return HTML(fin.read())
2416 |         else:
2417 |             return IFrame(main_readme_html, width=width, height=height)
2418 | 
2419 |     def _report(self, width=900, height=1200):
2420 |         self._results_path = self._get_results_path()
2421 |         main_readme_html = os.path.join(self._results_path, "README.html")
2422 | 
2423 |         if os.path.exists(main_readme_html):
2424 |             return self._show_report(main_readme_html, width, height)
2425 | 
2426 |         body = ""
2427 |         fname = os.path.join(self._results_path, "README.md")
2428 |         body += (
2429 |             f'<div id="automl-report-main-{self._id}">\n'
2430 |             + self._md_to_html(fname, f"automl-report-main-{self._id}", self._results_path)
2431 |             + "\n\n</div>\n\n"
2432 |         )
2433 | 
2434 |         for f in os.listdir(self._results_path):
2435 |             fname = os.path.join(self._results_path, f, "README.md")
2436 |             if os.path.exists(fname):
2437 |                 body += (
2438 |                     f'<div id="{f}-{self._id}" style="display: none">\n'
2439 |                     + self._md_to_html(
2440 |                         fname, "sub", os.path.join(self._results_path, f), f
2441 |                     )
2442 |                     + "\n\n</div>\n\n"
2443 |                 )
2444 | 
2445 |         body += """
2446 |     <script>
2447 |         function toggleShow(elementId) {
2448 |             var x = document.getElementById(elementId);
2449 |             if (x.style.display === "none") {
2450 |                 x.style.display = "block";
2451 |             } else {
2452 |                 x.style.display = "none";
2453 |             }
2454 |         }
2455 |     </script>
2456 |         """
2457 | 
2458 |         report_content = f"""
2459 | <!DOCTYPE html>
2460 | <html>
2461 | <head>
2462 |     <style>
2463 |     {self.report_style}
2464 |     </style>
2465 | </head>
2466 | <body>
2467 |     <div class="mljar-automl-report-{self._id}">
2468 |     {body}
2469 |     <div>
2470 | </body>
2471 | </html>
2472 | """
2473 |         with open(main_readme_html, "w") as fout:
2474 |             fout.write(report_content)
2475 | 
2476 |         return self._show_report(main_readme_html, width, height)
2477 | 
2478 |     def _need_retrain(self, X, y, sample_weight, decrease):
2479 |         metric = self._best_model.get_metric()
2480 | 
2481 |         X, y, sample_weight, _ = ExcludeRowsMissingTarget.transform(
2482 |             X, y, sample_weight, warn=True
2483 |         )
2484 | 
2485 |         if self._ml_task == BINARY_CLASSIFICATION:
2486 |             prediction = self._predict_proba(X)[:, 1]
2487 |         if self._ml_task == MULTICLASS_CLASSIFICATION:
2488 |             prediction = self._predict_proba(X)
2489 |         else:
2490 |             prediction = self._predict(X)
2491 | 
2492 |         sign = -1.0 if Metric.optimize_negative(metric.name) else 1.0
2493 | 
2494 |         new_score = metric(y, prediction, sample_weight)
2495 |         old_score = self._best_model.get_final_loss()
2496 | 
2497 |         change = np.abs((old_score - new_score) / old_score)
2498 | 
2499 |         # always minimize the score
2500 |         if new_score > old_score:
2501 |             self.verbose_print(
2502 |                 f"Model performance decreased by {np.round(change*100.0,2)}%"
2503 |             )
2504 |             return change > decrease
2505 |         else:
2506 |             self.verbose_print(
2507 |                 f"Model performance increased by {np.round(change*100.0,2)}%"
2508 |             )
2509 |             return False
2510 | 
```