mljar/mljar-supervised # codebase.md

This is page 3 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   └── workflows
│       ├── run-tests.yml
│       ├── test-installation-with-conda.yml
│       └── test-installation-with-pip-on-windows.yml
├── .gitignore
├── CITATION
├── examples
│   ├── notebooks
│   │   ├── basic_run.ipynb
│   │   └── Titanic.ipynb
│   └── scripts
│       ├── binary_classifier_adult_fairness.py
│       ├── binary_classifier_ensemble.py
│       ├── binary_classifier_marketing.py
│       ├── binary_classifier_random.py
│       ├── binary_classifier_Titanic.py
│       ├── binary_classifier.py
│       ├── multi_class_classifier_digits.py
│       ├── multi_class_classifier_MNIST.py
│       ├── multi_class_classifier.py
│       ├── multi_class_drug_fairness.py
│       ├── regression_acs_fairness.py
│       ├── regression_crime_fairness.py
│       ├── regression_housing_fairness.py
│       ├── regression_law_school_fairness.py
│       ├── regression.py
│       └── tabular_mar_2021.py
├── LICENSE
├── MANIFEST.in
├── pytest.ini
├── README.md
├── requirements_dev.txt
├── requirements.txt
├── setup.py
├── supervised
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── algorithm.py
│   │   ├── baseline.py
│   │   ├── catboost.py
│   │   ├── decision_tree.py
│   │   ├── extra_trees.py
│   │   ├── factory.py
│   │   ├── knn.py
│   │   ├── lightgbm.py
│   │   ├── linear.py
│   │   ├── nn.py
│   │   ├── random_forest.py
│   │   ├── registry.py
│   │   ├── sklearn.py
│   │   └── xgboost.py
│   ├── automl.py
│   ├── base_automl.py
│   ├── callbacks
│   │   ├── __init__.py
│   │   ├── callback_list.py
│   │   ├── callback.py
│   │   ├── early_stopping.py
│   │   ├── learner_time_constraint.py
│   │   ├── max_iters_constraint.py
│   │   ├── metric_logger.py
│   │   ├── terminate_on_nan.py
│   │   └── total_time_constraint.py
│   ├── ensemble.py
│   ├── exceptions.py
│   ├── fairness
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── plots.py
│   │   ├── report.py
│   │   └── utils.py
│   ├── model_framework.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── datetime_transformer.py
│   │   ├── encoding_selector.py
│   │   ├── exclude_missing_target.py
│   │   ├── goldenfeatures_transformer.py
│   │   ├── kmeans_transformer.py
│   │   ├── label_binarizer.py
│   │   ├── label_encoder.py
│   │   ├── preprocessing_categorical.py
│   │   ├── preprocessing_missing.py
│   │   ├── preprocessing_utils.py
│   │   ├── preprocessing.py
│   │   ├── scale.py
│   │   └── text_transformer.py
│   ├── tuner
│   │   ├── __init__.py
│   │   ├── data_info.py
│   │   ├── hill_climbing.py
│   │   ├── mljar_tuner.py
│   │   ├── optuna
│   │   │   ├── __init__.py
│   │   │   ├── catboost.py
│   │   │   ├── extra_trees.py
│   │   │   ├── knn.py
│   │   │   ├── lightgbm.py
│   │   │   ├── nn.py
│   │   │   ├── random_forest.py
│   │   │   ├── tuner.py
│   │   │   └── xgboost.py
│   │   ├── preprocessing_tuner.py
│   │   ├── random_parameters.py
│   │   └── time_controller.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── additional_metrics.py
│   │   ├── additional_plots.py
│   │   ├── automl_plots.py
│   │   ├── common.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── data_validation.py
│   │   ├── importance.py
│   │   ├── jsonencoder.py
│   │   ├── leaderboard_plots.py
│   │   ├── learning_curves.py
│   │   ├── metric.py
│   │   ├── shap.py
│   │   ├── subsample.py
│   │   └── utils.py
│   └── validation
│       ├── __init__.py
│       ├── validation_step.py
│       ├── validator_base.py
│       ├── validator_custom.py
│       ├── validator_kfold.py
│       └── validator_split.py
└── tests
    ├── __init__.py
    ├── checks
    │   ├── __init__.py
    │   ├── check_automl_with_regression.py
    │   ├── run_ml_tests.py
    │   └── run_performance_tests.py
    ├── conftest.py
    ├── data
    │   ├── 179.csv
    │   ├── 24.csv
    │   ├── 3.csv
    │   ├── 31.csv
    │   ├── 38.csv
    │   ├── 44.csv
    │   ├── 720.csv
    │   ├── 737.csv
    │   ├── acs_income_1k.csv
    │   ├── adult_missing_values_missing_target_500rows.csv
    │   ├── boston_housing.csv
    │   ├── CrimeData
    │   │   ├── cities.json
    │   │   ├── crimedata.csv
    │   │   └── README.md
    │   ├── Drug
    │   │   ├── Drug_Consumption.csv
    │   │   └── README.md
    │   ├── housing_regression_missing_values_missing_target.csv
    │   ├── iris_classes_missing_values_missing_target.csv
    │   ├── iris_missing_values_missing_target.csv
    │   ├── LawSchool
    │   │   ├── bar_pass_prediction.csv
    │   │   └── README.md
    │   ├── PortugeseBankMarketing
    │   │   └── Data_FinalProject.csv
    │   └── Titanic
    │       ├── test_with_Survived.csv
    │       └── train.csv
    ├── README.md
    ├── tests_algorithms
    │   ├── __init__.py
    │   ├── test_baseline.py
    │   ├── test_catboost.py
    │   ├── test_decision_tree.py
    │   ├── test_extra_trees.py
    │   ├── test_factory.py
    │   ├── test_knn.py
    │   ├── test_lightgbm.py
    │   ├── test_linear.py
    │   ├── test_nn.py
    │   ├── test_random_forest.py
    │   ├── test_registry.py
    │   └── test_xgboost.py
    ├── tests_automl
    │   ├── __init__.py
    │   ├── test_adjust_validation.py
    │   ├── test_automl_init.py
    │   ├── test_automl_report.py
    │   ├── test_automl_sample_weight.py
    │   ├── test_automl_time_constraints.py
    │   ├── test_automl.py
    │   ├── test_data_types.py
    │   ├── test_dir_change.py
    │   ├── test_explain_levels.py
    │   ├── test_golden_features.py
    │   ├── test_handle_imbalance.py
    │   ├── test_integration.py
    │   ├── test_joblib_version.py
    │   ├── test_models_needed_for_predict.py
    │   ├── test_prediction_after_load.py
    │   ├── test_repeated_validation.py
    │   ├── test_restore.py
    │   ├── test_stack_models_constraints.py
    │   ├── test_targets.py
    │   └── test_update_errors_report.py
    ├── tests_callbacks
    │   ├── __init__.py
    │   └── test_total_time_constraint.py
    ├── tests_ensemble
    │   ├── __init__.py
    │   └── test_save_load.py
    ├── tests_fairness
    │   ├── __init__.py
    │   ├── test_binary_classification.py
    │   ├── test_multi_class_classification.py
    │   └── test_regression.py
    ├── tests_preprocessing
    │   ├── __init__.py
    │   ├── disable_eda.py
    │   ├── test_categorical_integers.py
    │   ├── test_datetime_transformer.py
    │   ├── test_encoding_selector.py
    │   ├── test_exclude_missing.py
    │   ├── test_goldenfeatures_transformer.py
    │   ├── test_label_binarizer.py
    │   ├── test_label_encoder.py
    │   ├── test_preprocessing_missing.py
    │   ├── test_preprocessing_utils.py
    │   ├── test_preprocessing.py
    │   ├── test_scale.py
    │   └── test_text_transformer.py
    ├── tests_tuner
    │   ├── __init__.py
    │   ├── test_hill_climbing.py
    │   ├── test_time_controller.py
    │   └── test_tuner.py
    ├── tests_utils
    │   ├── __init__.py
    │   ├── test_compute_additional_metrics.py
    │   ├── test_importance.py
    │   ├── test_learning_curves.py
    │   ├── test_metric.py
    │   ├── test_shap.py
    │   └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py
```

# Files

--------------------------------------------------------------------------------
/supervised/utils/learning_curves.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | import os
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | logger = logging.getLogger(__name__)
  8 | from supervised.utils.common import learner_name_to_fold_repeat
  9 | from supervised.utils.config import LOG_LEVEL
 10 | from supervised.utils.metric import Metric
 11 | 
 12 | logger.setLevel(LOG_LEVEL)
 13 | 
 14 | import matplotlib.colors as mcolors
 15 | import matplotlib.pyplot as plt
 16 | 
# Color values taken from matplotlib's TABLEAU_COLORS mapping, kept in the
# mapping's own order. LearningCurves.plot uses this list as its base palette
# and indexes it by fold number.
MY_COLORS = list(mcolors.TABLEAU_COLORS.values())
 18 | 
 19 | 
 20 | class LearningCurves:
 21 |     output_file_name = "learning_curves.png"
 22 | 
 23 |     @staticmethod
 24 |     def single_iteration(learner_names, model_path):
 25 |         for ln in learner_names:
 26 |             df = pd.read_csv(
 27 |                 os.path.join(model_path, f"{ln}_training.log"),
 28 |                 names=["iteration", "train", "test"],
 29 |             )
 30 |             if df.shape[0] > 1:
 31 |                 return False
 32 |         return True
 33 | 
 34 |     @staticmethod
 35 |     def plot(learner_names, metric_name, model_path, trees_in_iteration=None):
 36 |         colors = MY_COLORS
 37 |         if len(learner_names) > len(colors):
 38 |             repeat_colors = int(np.ceil(len(learner_names) / len(colors)))
 39 |             colors = colors * repeat_colors
 40 | 
 41 |         if LearningCurves.single_iteration(learner_names, model_path):
 42 |             LearningCurves.plot_single_iter(
 43 |                 learner_names, metric_name, model_path, colors
 44 |             )
 45 |         else:
 46 |             LearningCurves.plot_iterations(
 47 |                 learner_names, metric_name, model_path, colors, trees_in_iteration
 48 |             )
 49 | 
 50 |     @staticmethod
 51 |     def plot_single_iter(learner_names, metric_name, model_path, colors):
 52 |         plt.figure(figsize=(10, 7))
 53 |         for ln in learner_names:
 54 |             df = pd.read_csv(
 55 |                 os.path.join(model_path, f"{ln}_training.log"),
 56 |                 names=["iteration", "train", "test"],
 57 |             )
 58 | 
 59 |             fold, repeat = learner_name_to_fold_repeat(ln)
 60 |             repeat_str = f" Reapeat {repeat+1}," if repeat is not None else ""
 61 |             plt.bar(
 62 |                 f"Fold {fold+1},{repeat_str} train",
 63 |                 df.train[0],
 64 |                 color="white",
 65 |                 edgecolor=colors[fold],
 66 |             )
 67 |             plt.bar(f"Fold {fold+1},{repeat_str} test", df.test[0], color=colors[fold])
 68 | 
 69 |         plt.ylabel(metric_name)
 70 |         plt.xticks(rotation=90)
 71 |         plt.tight_layout(pad=2.0)
 72 |         plot_path = os.path.join(model_path, LearningCurves.output_file_name)
 73 |         plt.savefig(plot_path)
 74 |         plt.close("all")
 75 | 
 76 |     @staticmethod
 77 |     def plot_iterations(
 78 |         learner_names, metric_name, model_path, colors, trees_in_iteration=None
 79 |     ):
 80 |         plt.figure(figsize=(10, 7))
 81 |         for ln in learner_names:
 82 |             df = pd.read_csv(
 83 |                 os.path.join(model_path, f"{ln}_training.log"),
 84 |                 names=["iteration", "train", "test"],
 85 |             )
 86 | 
 87 |             fold, repeat = learner_name_to_fold_repeat(ln)
 88 |             repeat_str = f" Reapeat {repeat+1}," if repeat is not None else ""
 89 |             # if trees_in_iteration is not None:
 90 |             #    df.iteration = df.iteration * trees_in_iteration
 91 |             any_none = np.sum(pd.isnull(df.train))
 92 |             if any_none == 0:
 93 |                 plt.plot(
 94 |                     df.iteration,
 95 |                     df.train,
 96 |                     "--",
 97 |                     color=colors[fold],
 98 |                     label=f"Fold {fold+1},{repeat_str} train",
 99 |                 )
100 |             any_none = np.sum(pd.isnull(df.test))
101 |             if any_none == 0:
102 |                 plt.plot(
103 |                     df.iteration,
104 |                     df.test,
105 |                     color=colors[fold],
106 |                     label=f"Fold {fold+1},{repeat_str} test",
107 |                 )
108 | 
109 |             
110 |             if not df.test.isnull().values.any():
111 |                 best_iter = None
112 |                 if Metric.optimize_negative(metric_name):
113 |                     best_iter = df.test.argmax()
114 |                 else:
115 |                     best_iter = df.test.argmin()
116 | 
117 |                 if best_iter is not None and best_iter != -1:
118 |                     plt.axvline(best_iter, color=colors[fold], alpha=0.3)
119 | 
120 |         if trees_in_iteration is not None:
121 |             plt.xlabel("#Trees")
122 |         else:
123 |             plt.xlabel("#Iteration")
124 |         plt.ylabel(metric_name)
125 | 
126 |         # limit number of learners in the legend
127 |         # too many will raise warnings
128 |         if len(learner_names) <= 15:
129 |             plt.legend(loc="best")
130 | 
131 |         plt.tight_layout(pad=2.0)
132 |         plot_path = os.path.join(model_path, LearningCurves.output_file_name)
133 |         plt.savefig(plot_path)
134 |         plt.close("all")
135 | 
136 |     @staticmethod
137 |     def plot_for_ensemble(scores, metric_name, model_path):
138 |         plt.figure(figsize=(10, 7))
139 |         plt.plot(range(1, len(scores) + 1), scores, label=f"Ensemble")
140 |         plt.xlabel("#Iteration")
141 |         plt.ylabel(metric_name)
142 |         plt.legend(loc="best")
143 |         plot_path = os.path.join(model_path, LearningCurves.output_file_name)
144 |         plt.savefig(plot_path)
145 |         plt.close("all")
146 | 
```

--------------------------------------------------------------------------------
/supervised/fairness/report.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | 
  3 | 
  4 | class FairnessReport:
  5 |     """Saves information about fairness in the report."""
  6 | 
  7 |     @staticmethod
  8 |     def save_classification(fairness_metrics, fout, model_path, is_multi=False):
  9 |         for k, v in fairness_metrics.items():
 10 |             if k == "fairness_optimization":
 11 |                 continue
 12 | 
 13 |             if is_multi:
 14 |                 a = k.split("__", maxsplit=1)
 15 |                 feature, class_name = a
 16 | 
 17 |             if is_multi:
 18 |                 fout.write(
 19 |                     f"\n\n## Fairness metrics for {feature} feature and {class_name} class\n\n"
 20 |                 )
 21 |             else:
 22 |                 fout.write(f"\n\n## Fairness metrics for {k} feature\n\n")
 23 | 
 24 |             fout.write(v["metrics"].to_markdown())
 25 |             fout.write("\n\n")
 26 |             fout.write(v["stats"].to_markdown())
 27 |             fout.write("\n\n")
 28 | 
 29 |             if is_multi:
 30 |                 fout.write(
 31 |                     f"\n\n## Is model fair for {feature} feature and {class_name} class?\n"
 32 |                 )
 33 |             else:
 34 |                 fout.write(f"\n\n## Is model fair for {k} feature?\n")
 35 |             fair_str = "fair" if v["is_fair"] else "unfair"
 36 |             fairness_threshold = fairness_metrics.get("fairness_optimization", {}).get(
 37 |                 "fairness_threshold"
 38 |             )
 39 |             fairness_threshold_str = ""
 40 |             if fairness_threshold is not None:
 41 |                 if "ratio" in v["fairness_metric_name"].lower():
 42 |                     fairness_threshold_str = (
 43 |                         f"It should be higher than {fairness_threshold}."
 44 |                     )
 45 |                 else:
 46 |                     fairness_threshold_str = (
 47 |                         f"It should be lower than {fairness_threshold}."
 48 |                     )
 49 | 
 50 |             if is_multi:
 51 |                 fout.write(
 52 |                     f"Model is {fair_str} for {feature} feature and {class_name} class.\n"
 53 |                 )
 54 |             else:
 55 |                 fout.write(f"Model is {fair_str} for {k} feature.\n")
 56 |             fout.write(
 57 |                 f'The {v["fairness_metric_name"]} is {v["fairness_metric_value"]}. {fairness_threshold_str}\n'
 58 |             )
 59 |             if not v["is_fair"]:
 60 |                 # display information about privileged and underprivileged groups
 61 |                 # for unfair models
 62 |                 if v.get("underprivileged_value") is not None:
 63 |                     fout.write(
 64 |                         f'Underprivileged value is {v["underprivileged_value"]}.\n'
 65 |                     )
 66 |                 if v.get("privileged_value") is not None:
 67 |                     fout.write(f'Privileged value is {v["privileged_value"]}.\n')
 68 | 
 69 |             for figure in v["figures"]:
 70 |                 fout.write(f"\n\n### {figure['title']}\n\n")
 71 |                 figure["figure"].savefig(os.path.join(model_path, figure["fname"]))
 72 |                 fout.write(f"\n![]({figure['fname']})\n\n")
 73 | 
 74 |     @staticmethod
 75 |     def regression(fairness_metrics, fout, model_path):
 76 |         for k, v in fairness_metrics.items():
 77 |             if k == "fairness_optimization":
 78 |                 continue
 79 |             fout.write(f"\n\n## Fairness metrics for {k} feature\n\n")
 80 | 
 81 |             fout.write(v["metrics"].to_markdown())
 82 |             fout.write("\n\n")
 83 | 
 84 |             fout.write(f'Privileged value: {v["privileged_value"]}\n\n')
 85 |             fout.write(f'Underprivileged value: {v["underprivileged_value"]}\n\n\n')
 86 |             fout.write(f'Fairness metric: {v["fairness_metric_name"]}\n\n')
 87 |             fout.write(f'{v["metric_name"]} Difference: {v["diff"]}\n\n')
 88 |             fout.write(f'{v["metric_name"]} Ratio: {v["ratio"]}\n\n')
 89 | 
 90 |             # add sentence about model fairness
 91 |             if v["is_fair"]:
 92 |                 fout.write(f"Model is fair for {k} feature.\n")
 93 |                 if "ratio" in v["fairness_metric_name"].lower():
 94 |                     fout.write(
 95 |                         f"The {v['fairness_metric_name']} value is above threshold {v['fairness_threshold']}.\n\n"
 96 |                     )
 97 |                 else:
 98 |                     fout.write(
 99 |                         f"The {v['fairness_metric_name']} value is below threshold {v['fairness_threshold']}.\n\n"
100 |                     )
101 |             else:
102 |                 # model is not fair
103 |                 fout.write(f"Model is unfair for {k} feature.\n")
104 |                 if "ratio" in v["fairness_metric_name"].lower():
105 |                     fout.write(
106 |                         f"The {v['fairness_metric_name']} value is below threshold {v['fairness_threshold']}.\n\n"
107 |                     )
108 |                 else:
109 |                     fout.write(
110 |                         f"The {v['fairness_metric_name']} value is above threshold {v['fairness_threshold']}.\n\n"
111 |                     )
112 | 
113 |             for figure in v["figures"]:
114 |                 fout.write(f"\n\n### {figure['title']}\n\n")
115 |                 figure["figure"].savefig(os.path.join(model_path, figure["fname"]))
116 |                 fout.write(f"\n![]({figure['fname']})\n\n")
117 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/test_catboost.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | import pandas as pd
  6 | from numpy.testing import assert_almost_equal
  7 | from sklearn import datasets
  8 | 
  9 | from supervised.algorithms.catboost import CatBoostAlgorithm, additional
 10 | from supervised.utils.metric import Metric
 11 | 
# Override the "max_rounds" entry of the `additional` dict imported from
# supervised.algorithms.catboost for every test in this module.
# NOTE(review): presumably this caps training at one round to keep the tests
# fast — confirm how CatBoostAlgorithm consumes "max_rounds".
additional["max_rounds"] = 1
 13 | 
 14 | 
 15 | class CatBoostRegressorAlgorithmTest(unittest.TestCase):
 16 |     @classmethod
 17 |     def setUpClass(cls):
 18 |         cls.X, cls.y = datasets.make_regression(
 19 |             n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
 20 |         )
 21 |         cls.X = pd.DataFrame(cls.X, columns=[f"f_{i}" for i in range(cls.X.shape[1])])
 22 |         cls.params = {
 23 |             "learning_rate": 0.1,
 24 |             "depth": 4,
 25 |             "rsm": 0.5,
 26 |             "l2_leaf_reg": 1,
 27 |             "seed": 1,
 28 |             "ml_task": "regression",
 29 |             "loss_function": "RMSE",
 30 |             "eval_metric": "RMSE",
 31 |         }
 32 | 
 33 |     def test_reproduce_fit(self):
 34 |         metric = Metric({"name": "mse"})
 35 |         prev_loss = None
 36 |         for _ in range(2):
 37 |             model = CatBoostAlgorithm(self.params)
 38 |             model.fit(self.X, self.y)
 39 |             y_predicted = model.predict(self.X)
 40 |             loss = metric(self.y, y_predicted)
 41 |             if prev_loss is not None:
 42 |                 assert_almost_equal(prev_loss, loss, decimal=3)
 43 |             prev_loss = loss
 44 | 
 45 |     def test_get_metric_name(self):
 46 |         model = CatBoostAlgorithm(self.params)
 47 |         self.assertEqual(model.get_metric_name(), "rmse")
 48 | 
 49 | 
 50 | class CatBoostAlgorithmTest(unittest.TestCase):
 51 |     @classmethod
 52 |     def setUpClass(cls):
 53 |         cls.X, cls.y = datasets.make_classification(
 54 |             n_samples=100,
 55 |             n_features=5,
 56 |             n_informative=4,
 57 |             n_redundant=1,
 58 |             n_classes=2,
 59 |             n_clusters_per_class=3,
 60 |             n_repeated=0,
 61 |             shuffle=False,
 62 |             random_state=0,
 63 |         )
 64 |         cls.X = pd.DataFrame(cls.X, columns=[f"f_{i}" for i in range(cls.X.shape[1])])
 65 |         cls.params = {
 66 |             "learning_rate": 0.1,
 67 |             "depth": 4,
 68 |             "rsm": 0.5,
 69 |             "l2_leaf_reg": 1,
 70 |             "seed": 1,
 71 |             "ml_task": "binary_classification",
 72 |             "loss_function": "Logloss",
 73 |             "eval_metric": "Logloss",
 74 |         }
 75 | 
 76 |     def test_reproduce_fit(self):
 77 |         metric = Metric({"name": "logloss"})
 78 |         prev_loss = None
 79 |         for _ in range(2):
 80 |             model = CatBoostAlgorithm(self.params)
 81 |             model.fit(self.X, self.y)
 82 |             y_predicted = model.predict(self.X)
 83 |             loss = metric(self.y, y_predicted)
 84 |             if prev_loss is not None:
 85 |                 assert_almost_equal(prev_loss, loss, decimal=3)
 86 |             prev_loss = loss
 87 | 
 88 |     def test_fit_predict(self):
 89 |         metric = Metric({"name": "logloss"})
 90 |         loss_prev = None
 91 |         for _ in range(2):
 92 |             cat = CatBoostAlgorithm(self.params)
 93 |             cat.fit(self.X, self.y)
 94 |             y_predicted = cat.predict(self.X)
 95 |             loss = metric(self.y, y_predicted)
 96 |             if loss_prev is not None:
 97 |                 assert_almost_equal(loss, loss_prev, decimal=3)
 98 |             loss_prev = loss
 99 | 
100 |     def test_copy(self):
101 |         # train model #1
102 |         metric = Metric({"name": "logloss"})
103 |         cat = CatBoostAlgorithm(self.params)
104 |         cat.fit(self.X, self.y)
105 |         y_predicted = cat.predict(self.X)
106 |         loss = metric(self.y, y_predicted)
107 |         # create model #2
108 |         cat2 = CatBoostAlgorithm(self.params)
109 |         # model #2 is initialized in constructor
110 |         self.assertTrue(cat2.model is not None)
111 |         # do a copy and use it for predictions
112 |         cat2 = cat.copy()
113 |         self.assertEqual(type(cat), type(cat2))
114 |         y_predicted = cat2.predict(self.X)
115 |         loss2 = metric(self.y, y_predicted)
116 |         self.assertEqual(loss, loss2)
117 | 
118 |     def test_save_and_load(self):
119 |         metric = Metric({"name": "logloss"})
120 |         cat = CatBoostAlgorithm(self.params)
121 |         cat.fit(self.X, self.y)
122 |         y_predicted = cat.predict(self.X)
123 |         loss = metric(self.y, y_predicted)
124 | 
125 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
126 | 
127 |         cat.save(filename)
128 |         cat2 = CatBoostAlgorithm(self.params)
129 |         self.assertTrue(cat.uid != cat2.uid)
130 |         self.assertTrue(cat2.model is not None)
131 |         cat2.load(filename)
132 |         # Finished with the file, delete it
133 |         os.remove(filename)
134 | 
135 |         y_predicted = cat2.predict(self.X)
136 |         loss2 = metric(self.y, y_predicted)
137 |         assert_almost_equal(loss, loss2, decimal=3)
138 | 
139 |     def test_get_metric_name(self):
140 |         model = CatBoostAlgorithm(self.params)
141 |         self.assertEqual(model.get_metric_name(), "logloss")
142 |         params = dict(self.params)
143 |         params["loss_function"] = "MultiClass"
144 |         params["eval_metric"] = "MultiClass"
145 |         model = CatBoostAlgorithm(params)
146 |         self.assertEqual(model.get_metric_name(), "logloss")
147 | 
148 |     def test_is_fitted(self):
149 |         cat = CatBoostAlgorithm(self.params)
150 |         self.assertFalse(cat.is_fitted())
151 |         cat.fit(self.X, self.y)
152 |         self.assertTrue(cat.is_fitted())
153 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/extra_trees.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | 
  3 | import sklearn
  4 | from sklearn.base import ClassifierMixin, RegressorMixin
  5 | from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
  6 | 
  7 | from supervised.algorithms.registry import (
  8 |     BINARY_CLASSIFICATION,
  9 |     MULTICLASS_CLASSIFICATION,
 10 |     REGRESSION,
 11 |     AlgorithmsRegistry,
 12 | )
 13 | from supervised.algorithms.sklearn import (
 14 |     SklearnTreesEnsembleClassifierAlgorithm,
 15 |     SklearnTreesEnsembleRegressorAlgorithm,
 16 | )
 17 | from supervised.utils.config import LOG_LEVEL
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | logger.setLevel(LOG_LEVEL)
 21 | 
 22 | 
 23 | class ExtraTreesAlgorithm(ClassifierMixin, SklearnTreesEnsembleClassifierAlgorithm):
 24 |     algorithm_name = "Extra Trees Classifier"
 25 |     algorithm_short_name = "Extra Trees"
 26 | 
 27 |     def __init__(self, params):
 28 |         super(ExtraTreesAlgorithm, self).__init__(params)
 29 |         logger.debug("ExtraTreesAlgorithm.__init__")
 30 | 
 31 |         self.library_version = sklearn.__version__
 32 |         self.trees_in_step = additional.get("trees_in_step", 100)
 33 |         self.max_steps = additional.get("max_steps", 50)
 34 |         self.early_stopping_rounds = additional.get("early_stopping_rounds", 50)
 35 |         self.model = ExtraTreesClassifier(
 36 |             n_estimators=self.trees_in_step,
 37 |             criterion=params.get("criterion", "gini"),
 38 |             max_features=params.get("max_features", 0.8),
 39 |             max_depth=params.get("max_depth", 6),
 40 |             min_samples_split=params.get("min_samples_split", 4),
 41 |             min_samples_leaf=params.get("min_samples_leaf", 1),
 42 |             warm_start=True,
 43 |             n_jobs=params.get("n_jobs", -1),
 44 |             random_state=params.get("seed", 1),
 45 |         )
 46 |         self.max_steps = self.params.get("max_steps", self.max_steps)
 47 | 
 48 |     def file_extension(self):
 49 |         return "extra_trees"
 50 | 
 51 | 
 52 | class ExtraTreesRegressorAlgorithm(
 53 |     RegressorMixin, SklearnTreesEnsembleRegressorAlgorithm
 54 | ):
 55 |     algorithm_name = "Extra Trees Regressor"
 56 |     algorithm_short_name = "Extra Trees"
 57 | 
 58 |     def __init__(self, params):
 59 |         super(ExtraTreesRegressorAlgorithm, self).__init__(params)
 60 |         logger.debug("ExtraTreesRegressorAlgorithm.__init__")
 61 | 
 62 |         self.library_version = sklearn.__version__
 63 |         self.trees_in_step = regression_additional.get("trees_in_step", 100)
 64 |         self.max_steps = regression_additional.get("max_steps", 50)
 65 |         self.early_stopping_rounds = regression_additional.get(
 66 |             "early_stopping_rounds", 50
 67 |         )
 68 |         self.model = ExtraTreesRegressor(
 69 |             n_estimators=self.trees_in_step,
 70 |             criterion=params.get("criterion", "squared_error"),
 71 |             max_features=params.get("max_features", 0.6),
 72 |             max_depth=params.get("max_depth", 6),
 73 |             min_samples_split=params.get("min_samples_split", 30),
 74 |             min_samples_leaf=params.get("min_samples_leaf", 1),
 75 |             warm_start=True,
 76 |             n_jobs=params.get("n_jobs", -1),
 77 |             random_state=params.get("seed", 1),
 78 |         )
 79 |         self.max_steps = self.params.get("max_steps", self.max_steps)
 80 | 
 81 |     def file_extension(self):
 82 |         return "extra_trees"
 83 | 
 84 | 
#
# CLASSIFICATION
#
# For binary classification target should be 0, 1. There should be no NaNs in target.

# Candidate values for the hyperparameters that ExtraTreesAlgorithm.__init__
# reads from `params`.
# NOTE(review): presumably the tuner's search space — confirm against
# AlgorithmsRegistry.add.
et_params = {
    "criterion": ["gini", "entropy"],
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": [10, 20, 30, 40, 50],
    "max_depth": [3, 4, 5, 6, 7],
}

# One value per hyperparameter, registered as the defaults for classification.
# Note: these differ from the `params.get(...)` fallbacks hard-coded in
# ExtraTreesAlgorithm.__init__ (e.g. max_features 0.9 here vs 0.8 there).
classification_default_params = {
    "criterion": "gini",
    "max_features": 0.9,
    "min_samples_split": 30,
    "max_depth": 4,
}

# Settings shared by all classifier instances. ExtraTreesAlgorithm.__init__
# reads "trees_in_step", "max_steps" and "early_stopping_rounds" from this dict;
# the two "*_limit" entries are not read in this module.
additional = {
    "trees_in_step": 100,
    "max_steps": 50,
    "early_stopping_rounds": 50,
    "max_rows_limit": None,
    "max_cols_limit": None,
}
# Names of preprocessing steps handed to the registry together with the
# algorithm; the steps themselves are implemented outside this module.
required_preprocessing = [
    "missing_values_inputation",
    "convert_categorical",
    "datetime_transform",
    "text_transform",
    "target_as_integer",
]

# The same class and settings are registered for both classification tasks.
AlgorithmsRegistry.add(
    BINARY_CLASSIFICATION,
    ExtraTreesAlgorithm,
    et_params,
    required_preprocessing,
    additional,
    classification_default_params,
)

AlgorithmsRegistry.add(
    MULTICLASS_CLASSIFICATION,
    ExtraTreesAlgorithm,
    et_params,
    required_preprocessing,
    additional,
    classification_default_params,
)
132 | 
133 | 
#
# REGRESSION
#

# Candidate values for the hyperparameters that
# ExtraTreesRegressorAlgorithm.__init__ reads from `params`.
# NOTE(review): presumably the tuner's search space — confirm against
# AlgorithmsRegistry.add.
regression_et_params = {
    "criterion": [
        "squared_error"
    ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": [10, 20, 30, 40, 50],
    "max_depth": [3, 4, 5, 6, 7],
}

# One value per hyperparameter, registered as the defaults for regression.
# Note: these differ from the `params.get(...)` fallbacks hard-coded in
# ExtraTreesRegressorAlgorithm.__init__ (e.g. max_features 0.9 here vs 0.6 there).
regression_default_params = {
    "criterion": "squared_error",
    "max_features": 0.9,
    "min_samples_split": 30,
    "max_depth": 4,
}

# Settings shared by all regressor instances. ExtraTreesRegressorAlgorithm.__init__
# reads "trees_in_step", "max_steps" and "early_stopping_rounds" from this dict;
# the two "*_limit" entries are not read in this module.
regression_additional = {
    "trees_in_step": 100,
    "max_steps": 50,
    "early_stopping_rounds": 50,
    "max_rows_limit": None,
    "max_cols_limit": None,
}
# Names of preprocessing steps handed to the registry; the list ends with
# "target_scale" where the classification list has "target_as_integer".
regression_required_preprocessing = [
    "missing_values_inputation",
    "convert_categorical",
    "datetime_transform",
    "text_transform",
    "target_scale",
]

AlgorithmsRegistry.add(
    REGRESSION,
    ExtraTreesRegressorAlgorithm,
    regression_et_params,
    regression_required_preprocessing,
    regression_additional,
    regression_default_params,
)
177 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/random_forest.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | 
  3 | import sklearn
  4 | from sklearn.base import ClassifierMixin, RegressorMixin
  5 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
  6 | 
  7 | from supervised.algorithms.registry import (
  8 |     BINARY_CLASSIFICATION,
  9 |     MULTICLASS_CLASSIFICATION,
 10 |     REGRESSION,
 11 |     AlgorithmsRegistry,
 12 | )
 13 | from supervised.algorithms.sklearn import (
 14 |     SklearnTreesEnsembleClassifierAlgorithm,
 15 |     SklearnTreesEnsembleRegressorAlgorithm,
 16 | )
 17 | from supervised.utils.config import LOG_LEVEL
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | logger.setLevel(LOG_LEVEL)
 21 | 
 22 | 
 23 | class RandomForestAlgorithm(ClassifierMixin, SklearnTreesEnsembleClassifierAlgorithm):
 24 |     algorithm_name = "Random Forest"
 25 |     algorithm_short_name = "Random Forest"
 26 | 
 27 |     def __init__(self, params):
 28 |         super(RandomForestAlgorithm, self).__init__(params)
 29 |         logger.debug("RandomForestAlgorithm.__init__")
 30 | 
 31 |         self.library_version = sklearn.__version__
 32 |         self.trees_in_step = additional.get("trees_in_step", 5)
 33 |         self.max_steps = additional.get("max_steps", 3)
 34 |         self.early_stopping_rounds = additional.get("early_stopping_rounds", 50)
 35 |         self.model = RandomForestClassifier(
 36 |             n_estimators=self.trees_in_step,
 37 |             criterion=params.get("criterion", "gini"),
 38 |             max_features=params.get("max_features", 0.8),
 39 |             max_depth=params.get("max_depth", 6),
 40 |             min_samples_split=params.get("min_samples_split", 4),
 41 |             min_samples_leaf=params.get("min_samples_leaf", 1),
 42 |             warm_start=True,
 43 |             n_jobs=params.get("n_jobs", -1),
 44 |             random_state=params.get("seed", 1),
 45 |         )
 46 |         self.max_steps = self.params.get("max_steps", self.max_steps)
 47 | 
 48 |     def file_extension(self):
 49 |         return "random_forest"
 50 | 
 51 | 
 52 | class RandomForestRegressorAlgorithm(
 53 |     RegressorMixin, SklearnTreesEnsembleRegressorAlgorithm
 54 | ):
 55 |     algorithm_name = "Random Forest"
 56 |     algorithm_short_name = "Random Forest"
 57 | 
 58 |     def __init__(self, params):
 59 |         super(RandomForestRegressorAlgorithm, self).__init__(params)
 60 |         logger.debug("RandomForestRegressorAlgorithm.__init__")
 61 | 
 62 |         self.library_version = sklearn.__version__
 63 |         self.trees_in_step = regression_additional.get("trees_in_step", 5)
 64 |         self.max_steps = regression_additional.get("max_steps", 3)
 65 |         self.early_stopping_rounds = regression_additional.get(
 66 |             "early_stopping_rounds", 50
 67 |         )
 68 |         self.model = RandomForestRegressor(
 69 |             n_estimators=self.trees_in_step,
 70 |             criterion=params.get("criterion", "squared_error"),
 71 |             max_features=params.get("max_features", 0.8),
 72 |             max_depth=params.get("max_depth", 6),
 73 |             min_samples_split=params.get("min_samples_split", 4),
 74 |             min_samples_leaf=params.get("min_samples_leaf", 1),
 75 |             warm_start=True,
 76 |             n_jobs=params.get("n_jobs", -1),
 77 |             random_state=params.get("seed", 1),
 78 |         )
 79 |         self.max_steps = self.params.get("max_steps", self.max_steps)
 80 | 
 81 |     def file_extension(self):
 82 |         return "random_forest"
 83 | 
 84 | 
 85 | # For binary classification target should be 0, 1. There should be no NaNs in target.
 86 | rf_params = {
 87 |     "criterion": ["gini", "entropy"],
 88 |     "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
 89 |     "min_samples_split": [10, 20, 30, 40, 50],
 90 |     "max_depth": [3, 4, 5, 6, 7],
 91 | }
 92 | 
 93 | classification_default_params = {
 94 |     "criterion": "gini",
 95 |     "max_features": 0.9,
 96 |     "min_samples_split": 30,
 97 |     "max_depth": 4,
 98 | }
 99 | 
100 | 
101 | additional = {
102 |     "trees_in_step": 100,
103 |     "train_cant_improve_limit": 1,
104 |     "min_steps": 1,
105 |     "max_steps": 50,
106 |     "early_stopping_rounds": 50,
107 |     "max_rows_limit": None,
108 |     "max_cols_limit": None,
109 | }
110 | required_preprocessing = [
111 |     "missing_values_inputation",
112 |     "convert_categorical",
113 |     "datetime_transform",
114 |     "text_transform",
115 |     "target_as_integer",
116 | ]
117 | 
118 | AlgorithmsRegistry.add(
119 |     BINARY_CLASSIFICATION,
120 |     RandomForestAlgorithm,
121 |     rf_params,
122 |     required_preprocessing,
123 |     additional,
124 |     classification_default_params,
125 | )
126 | 
127 | AlgorithmsRegistry.add(
128 |     MULTICLASS_CLASSIFICATION,
129 |     RandomForestAlgorithm,
130 |     rf_params,
131 |     required_preprocessing,
132 |     additional,
133 |     classification_default_params,
134 | )
135 | 
136 | 
137 | #
138 | # REGRESSION
139 | #
140 | 
141 | regression_rf_params = {
142 |     "criterion": [
143 |         "squared_error"
144 |     ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
145 |     "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
146 |     "min_samples_split": [10, 20, 30, 40, 50],
147 |     "max_depth": [3, 4, 5, 6, 7],
148 | }
149 | 
150 | regression_default_params = {
151 |     "criterion": "squared_error",
152 |     "max_features": 0.9,
153 |     "min_samples_split": 30,
154 |     "max_depth": 4,
155 | }
156 | 
157 | regression_additional = {
158 |     "trees_in_step": 100,
159 |     "train_cant_improve_limit": 1,
160 |     "min_steps": 1,
161 |     "max_steps": 50,
162 |     "early_stopping_rounds": 50,
163 |     "max_rows_limit": None,
164 |     "max_cols_limit": None,
165 | }
166 | regression_required_preprocessing = [
167 |     "missing_values_inputation",
168 |     "convert_categorical",
169 |     "datetime_transform",
170 |     "text_transform",
171 |     "target_scale",
172 | ]
173 | 
174 | AlgorithmsRegistry.add(
175 |     REGRESSION,
176 |     RandomForestRegressorAlgorithm,
177 |     regression_rf_params,
178 |     regression_required_preprocessing,
179 |     regression_additional,
180 |     regression_default_params,
181 | )
182 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/test_xgboost.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from numpy.testing import assert_almost_equal
  8 | from sklearn import datasets
  9 | 
 10 | from supervised.algorithms.xgboost import XgbAlgorithm, additional
 11 | from supervised.utils.constants import BINARY_CLASSIFICATION
 12 | from supervised.utils.metric import Metric
 13 | 
 14 | additional["max_rounds"] = 1
 15 | 
 16 | 
 17 | class XgboostAlgorithmTest(unittest.TestCase):
 18 |     @classmethod
 19 |     def setUpClass(cls):
 20 |         cls.X, cls.y = datasets.make_classification(
 21 |             n_samples=100,
 22 |             n_features=5,
 23 |             n_informative=4,
 24 |             n_redundant=1,
 25 |             n_classes=2,
 26 |             n_clusters_per_class=3,
 27 |             n_repeated=0,
 28 |             shuffle=False,
 29 |             random_state=0,
 30 |         )
 31 | 
 32 |     def test_reproduce_fit(self):
 33 |         metric = Metric({"name": "logloss"})
 34 |         params = {
 35 |             "objective": "binary:logistic",
 36 |             "eval_metric": "logloss",
 37 |             "seed": 1,
 38 |             "ml_task": BINARY_CLASSIFICATION,
 39 |         }
 40 |         prev_loss = None
 41 |         for _ in range(3):
 42 |             xgb = XgbAlgorithm(params)
 43 |             xgb.fit(self.X, self.y)
 44 |             y_predicted = xgb.predict(self.X)
 45 |             loss = metric(self.y, y_predicted)
 46 |             if prev_loss is not None:
 47 |                 assert_almost_equal(prev_loss, loss)
 48 |             prev_loss = loss
 49 | 
 50 |     def test_copy(self):
 51 |         metric = Metric({"name": "logloss"})
 52 |         params = {
 53 |             "objective": "binary:logistic",
 54 |             "eval_metric": "logloss",
 55 |             "ml_task": BINARY_CLASSIFICATION,
 56 |         }
 57 |         xgb = XgbAlgorithm(params)
 58 |         xgb.fit(self.X, self.y)
 59 |         y_predicted = xgb.predict(self.X)
 60 |         loss = metric(self.y, y_predicted)
 61 | 
 62 |         xgb2 = XgbAlgorithm(params)
 63 |         self.assertTrue(xgb2.model is None)  # model is set to None, while initialized
 64 |         xgb2 = xgb.copy()
 65 |         self.assertEqual(type(xgb), type(xgb2))
 66 |         y_predicted = xgb2.predict(self.X)
 67 |         loss2 = metric(self.y, y_predicted)
 68 |         self.assertEqual(loss, loss2)
 69 |         self.assertNotEqual(id(xgb), id(xgb2))
 70 | 
 71 |     def test_save_and_load(self):
 72 |         metric = Metric({"name": "logloss"})
 73 |         params = {
 74 |             "objective": "binary:logistic",
 75 |             "eval_metric": "logloss",
 76 |             "ml_task": BINARY_CLASSIFICATION,
 77 |         }
 78 |         xgb = XgbAlgorithm(params)
 79 |         xgb.fit(self.X, self.y)
 80 |         y_predicted = xgb.predict(self.X)
 81 |         loss = metric(self.y, y_predicted)
 82 | 
 83 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
 84 | 
 85 |         xgb.save(filename)
 86 | 
 87 |         xgb2 = XgbAlgorithm(params)
 88 |         self.assertTrue(xgb2.model is None)
 89 |         xgb2.load(filename)
 90 |         # Finished with the file, delete it
 91 |         os.remove(filename)
 92 | 
 93 |         y_predicted = xgb2.predict(self.X)
 94 |         loss2 = metric(self.y, y_predicted)
 95 |         assert_almost_equal(loss, loss2)
 96 | 
 97 |     def test_save_and_load_with_early_stopping(self):
 98 |         metric = Metric({"name": "logloss"})
 99 |         params = {
100 |             "objective": "binary:logistic",
101 |             "eval_metric": "logloss",
102 |             "ml_task": BINARY_CLASSIFICATION,
103 |         }
104 |         xgb = XgbAlgorithm(params)
105 |         xgb.fit(self.X, self.y, X_validation=self.X, y_validation=self.y)
106 |         y_predicted = xgb.predict(self.X)
107 |         loss = metric(self.y, y_predicted)
108 | 
109 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
110 |         prev_best_iteration = xgb.model.best_iteration
111 |         xgb.save(filename)
112 | 
113 |         xgb2 = XgbAlgorithm(params)
114 |         self.assertTrue(xgb2.model is None)
115 |         xgb2.load(filename)
116 |         # Finished with the file, delete it
117 |         os.remove(filename)
118 | 
119 |         y_predicted = xgb2.predict(self.X)
120 |         loss2 = metric(self.y, y_predicted)
121 |         assert_almost_equal(loss, loss2)
122 |         self.assertEqual(prev_best_iteration, xgb2.model.best_iteration)
123 | 
124 |     def test_restricted_characters_in_feature_name(self):
125 |         df = pd.DataFrame(
126 |             {
127 |                 "y": np.random.randint(0, 2, size=100),
128 |                 "[test1]": np.random.uniform(0, 1, size=100),
129 |                 "test2 < 1": np.random.uniform(0, 1, size=100),
130 |             }
131 |         )
132 | 
133 |         y = df.iloc[:, 0]
134 |         X = df.iloc[:, 1:]
135 | 
136 |         metric = Metric({"name": "logloss"})
137 |         params = {
138 |             "objective": "binary:logistic",
139 |             "eval_metric": "logloss",
140 |             "ml_task": BINARY_CLASSIFICATION,
141 |         }
142 |         xgb = XgbAlgorithm(params)
143 |         xgb.fit(X, y)
144 |         xgb.predict(X)
145 | 
146 |     def test_get_metric_name(self):
147 |         params = {
148 |             "objective": "binary:logistic",
149 |             "eval_metric": "logloss",
150 |             "ml_task": BINARY_CLASSIFICATION,
151 |         }
152 |         model = XgbAlgorithm(params)
153 |         self.assertEqual(model.get_metric_name(), "logloss")
154 | 
155 |         params = {"eval_metric": "rmse"}
156 |         model = XgbAlgorithm(params)
157 |         self.assertEqual(model.get_metric_name(), "rmse")
158 | 
159 |     def test_is_fitted(self):
160 |         params = {
161 |             "objective": "binary:logistic",
162 |             "eval_metric": "logloss",
163 |             "ml_task": BINARY_CLASSIFICATION,
164 |         }
165 |         model = XgbAlgorithm(params)
166 |         self.assertFalse(model.is_fitted())
167 |         model.fit(self.X, self.y)
168 |         self.assertTrue(model.is_fitted())
169 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_goldenfeatures_transformer.py:
--------------------------------------------------------------------------------

```python
  1 | import shutil
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from sklearn import datasets
  8 | 
  9 | from supervised.algorithms.registry import (
 10 |     BINARY_CLASSIFICATION,
 11 |     MULTICLASS_CLASSIFICATION,
 12 |     REGRESSION,
 13 | )
 14 | from supervised.preprocessing.goldenfeatures_transformer import (
 15 |     GoldenFeaturesTransformer,
 16 | )
 17 | 
 18 | 
 19 | class GoldenFeaturesTransformerTest(unittest.TestCase):
 20 |     automl_dir = "automl_testing"
 21 | 
 22 |     def tearDown(self):
 23 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
 24 | 
 25 |     def test_transformer(self):
 26 |         X, y = datasets.make_classification(
 27 |             n_samples=100,
 28 |             n_features=10,
 29 |             n_informative=6,
 30 |             n_redundant=1,
 31 |             n_classes=2,
 32 |             n_clusters_per_class=3,
 33 |             n_repeated=0,
 34 |             shuffle=False,
 35 |             random_state=0,
 36 |         )
 37 | 
 38 |         df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
 39 | 
 40 |         with tempfile.TemporaryDirectory() as tmpdir:
 41 |             gft = GoldenFeaturesTransformer(tmpdir, "binary_classification")
 42 |             gft.fit(df, y)
 43 | 
 44 |             df = gft.transform(df)
 45 | 
 46 |             gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification")
 47 |             gft3.from_json(gft.to_json(), tmpdir)
 48 | 
 49 |     def test_subsample_regression_10k(self):
 50 |         rows = 10000
 51 |         X = np.random.rand(rows, 3)
 52 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
 53 |         y = pd.Series(np.random.rand(rows), name="target")
 54 | 
 55 |         gft3 = GoldenFeaturesTransformer(self.automl_dir, REGRESSION)
 56 |         X_train, X_test, y_train, y_test = gft3._subsample(X, y)
 57 | 
 58 |         self.assertTrue(X_train.shape[0], 2500)
 59 |         self.assertTrue(X_test.shape[0], 2500)
 60 |         self.assertTrue(y_train.shape[0], 2500)
 61 |         self.assertTrue(y_test.shape[0], 2500)
 62 | 
 63 |     def test_subsample_regression_4k(self):
 64 |         rows = 4000
 65 |         X = np.random.rand(rows, 3)
 66 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
 67 |         y = pd.Series(np.random.rand(rows), name="target")
 68 | 
 69 |         gft3 = GoldenFeaturesTransformer(self.automl_dir, REGRESSION)
 70 |         X_train, X_test, y_train, y_test = gft3._subsample(X, y)
 71 | 
 72 |         self.assertTrue(X_train.shape[0], 2000)
 73 |         self.assertTrue(X_test.shape[0], 2000)
 74 |         self.assertTrue(y_train.shape[0], 2000)
 75 |         self.assertTrue(y_test.shape[0], 2000)
 76 | 
 77 |     def test_subsample_multiclass_10k(self):
 78 |         rows = 10000
 79 |         X = np.random.rand(rows, 3)
 80 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
 81 |         y = pd.Series(np.random.randint(0, 4, rows), name="target")
 82 | 
 83 |         gft3 = GoldenFeaturesTransformer(self.automl_dir, MULTICLASS_CLASSIFICATION)
 84 |         X_train, X_test, y_train, y_test = gft3._subsample(X, y)
 85 | 
 86 |         self.assertTrue(X_train.shape[0], 2500)
 87 |         self.assertTrue(X_test.shape[0], 2500)
 88 |         self.assertTrue(y_train.shape[0], 2500)
 89 |         self.assertTrue(y_test.shape[0], 2500)
 90 | 
 91 |         for uni in [np.unique(y_train), np.unique(y_test)]:
 92 |             for i in range(4):
 93 |                 self.assertTrue(i in uni)
 94 | 
 95 |     def test_subsample_multiclass_4k(self):
 96 |         rows = 4000
 97 |         X = np.random.rand(rows, 3)
 98 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
 99 |         y = pd.Series(np.random.randint(0, 4, rows), name="target")
100 | 
101 |         gft3 = GoldenFeaturesTransformer(self.automl_dir, MULTICLASS_CLASSIFICATION)
102 |         X_train, X_test, y_train, y_test = gft3._subsample(X, y)
103 | 
104 |         self.assertTrue(X_train.shape[0], 2000)
105 |         self.assertTrue(X_test.shape[0], 2000)
106 |         self.assertTrue(y_train.shape[0], 2000)
107 |         self.assertTrue(y_test.shape[0], 2000)
108 | 
109 |         for uni in [np.unique(y_train), np.unique(y_test)]:
110 |             for i in range(4):
111 |                 self.assertTrue(i in uni)
112 | 
113 |     def test_subsample_binclass_4k(self):
114 |         rows = 4000
115 |         X = np.random.rand(rows, 3)
116 |         X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)])
117 |         y = pd.Series(np.random.randint(0, 2, rows), name="target")
118 | 
119 |         gft3 = GoldenFeaturesTransformer(self.automl_dir, BINARY_CLASSIFICATION)
120 |         X_train, X_test, y_train, y_test = gft3._subsample(X, y)
121 | 
122 |         self.assertTrue(X_train.shape[0], 2000)
123 |         self.assertTrue(X_test.shape[0], 2000)
124 |         self.assertTrue(y_train.shape[0], 2000)
125 |         self.assertTrue(y_test.shape[0], 2000)
126 | 
127 |         for uni in [np.unique(y_train), np.unique(y_test)]:
128 |             for i in range(2):
129 |                 self.assertTrue(i in uni)
130 | 
131 |     def test_features_count(self):
132 |         N_COLS = 10
133 |         X, y = datasets.make_classification(
134 |             n_samples=100,
135 |             n_features=N_COLS,
136 |             n_informative=6,
137 |             n_redundant=1,
138 |             n_classes=2,
139 |             n_clusters_per_class=3,
140 |             n_repeated=0,
141 |             shuffle=False,
142 |             random_state=0,
143 |         )
144 | 
145 |         df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])
146 | 
147 |         with tempfile.TemporaryDirectory() as tmpdir:
148 |             FEATURES_COUNT = 42
149 |             gft = GoldenFeaturesTransformer(
150 |                 tmpdir, "binary_classification", features_count=FEATURES_COUNT
151 |             )
152 |             gft.fit(df, y)
153 | 
154 |             self.assertEqual(len(gft._new_features), FEATURES_COUNT)
155 | 
156 |             gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification")
157 |             gft3.from_json(gft.to_json(), tmpdir)
158 | 
159 |             df = gft3.transform(df)
160 |             self.assertEqual(df.shape[1], N_COLS + FEATURES_COUNT)
161 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/catboost.py:
--------------------------------------------------------------------------------

```python
  1 | import optuna
  2 | from catboost import CatBoostClassifier, CatBoostRegressor, Pool
  3 | 
  4 | from supervised.algorithms.catboost import catboost_eval_metric, catboost_objective
  5 | from supervised.algorithms.registry import (
  6 |     BINARY_CLASSIFICATION,
  7 |     MULTICLASS_CLASSIFICATION,
  8 |     REGRESSION,
  9 | )
 10 | from supervised.utils.metric import (
 11 |     CatBoostEvalMetricAveragePrecision,
 12 |     CatBoostEvalMetricMSE,
 13 |     CatBoostEvalMetricPearson,
 14 |     CatBoostEvalMetricSpearman,
 15 |     CatBoostEvalMetricUserDefined,
 16 |     Metric,
 17 | )
 18 | 
 19 | EPS = 1e-8
 20 | 
 21 | 
 22 | class CatBoostObjective:
 23 |     def __init__(
 24 |         self,
 25 |         ml_task,
 26 |         X_train,
 27 |         y_train,
 28 |         sample_weight,
 29 |         X_validation,
 30 |         y_validation,
 31 |         sample_weight_validation,
 32 |         eval_metric,
 33 |         cat_features_indices,
 34 |         n_jobs,
 35 |         random_state,
 36 |     ):
 37 |         self.ml_task = ml_task
 38 |         self.X_train = X_train
 39 |         self.y_train = y_train
 40 |         self.sample_weight = sample_weight
 41 |         self.X_validation = X_validation
 42 |         self.y_validation = y_validation
 43 |         self.eval_metric = eval_metric
 44 |         self.cat_features = cat_features_indices
 45 |         self.eval_set = Pool(
 46 |             data=X_validation,
 47 |             label=y_validation,
 48 |             cat_features=self.cat_features,
 49 |             weight=sample_weight_validation,
 50 |         )
 51 |         self.n_jobs = n_jobs
 52 |         self.rounds = 1000
 53 |         self.learning_rate = 0.0125
 54 |         self.early_stopping_rounds = 50
 55 |         self.seed = random_state
 56 | 
 57 |         self.objective = catboost_objective(ml_task, self.eval_metric.name)
 58 |         self.eval_metric_name = catboost_eval_metric(ml_task, self.eval_metric.name)
 59 |         self.custom_eval_metric = None
 60 |         if self.eval_metric_name == "spearman":
 61 |             self.custom_eval_metric = CatBoostEvalMetricSpearman()
 62 |         elif self.eval_metric_name == "pearson":
 63 |             self.custom_eval_metric = CatBoostEvalMetricPearson()
 64 |         elif self.eval_metric_name == "average_precision":
 65 |             self.custom_eval_metric = CatBoostEvalMetricAveragePrecision()
 66 |         elif self.eval_metric_name == "mse":
 67 |             self.custom_eval_metric = CatBoostEvalMetricMSE()
 68 |         elif self.eval_metric_name == "user_defined_metric":
 69 |             self.custom_eval_metric = CatBoostEvalMetricUserDefined()
 70 | 
 71 |     def __call__(self, trial):
 72 |         try:
 73 |             params = {
 74 |                 "iterations": self.rounds,
 75 |                 "learning_rate": trial.suggest_categorical(
 76 |                     "learning_rate", [0.05, 0.1, 0.2]
 77 |                 ),
 78 |                 "depth": trial.suggest_int("depth", 2, 9),
 79 |                 "l2_leaf_reg": trial.suggest_float(
 80 |                     "l2_leaf_reg", 0.0001, 10.0, log=False
 81 |                 ),
 82 |                 "random_strength": trial.suggest_float(
 83 |                     "random_strength", EPS, 10.0, log=False
 84 |                 ),
 85 |                 "rsm": trial.suggest_float("rsm", 0.1, 1),  # colsample_bylevel=rsm
 86 |                 "loss_function": self.objective,
 87 |                 "eval_metric": self.eval_metric_name,
 88 |                 "verbose": False,
 89 |                 "allow_writing_files": False,
 90 |                 "thread_count": self.n_jobs,
 91 |                 "random_seed": self.seed,
 92 |                 # "border_count": trial.suggest_int("border_count", 16, 2048),
 93 |                 "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
 94 |                 # "bootstrap_type": "Bernoulli"
 95 |                 # trial.suggest_categorical(
 96 |                 #    "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
 97 |                 # ),
 98 |             }
 99 |             # if params["bootstrap_type"] == "Bayesian":
100 |             #    params["bagging_temperature"] = trial.suggest_float(
101 |             #        "bagging_temperature", 0, 10
102 |             #    )
103 |             # elif params["bootstrap_type"] in ["Bernoulli", "MVS"]:
104 |             # params["subsample"] = trial.suggest_float("subsample", 0.1, 1)
105 | 
106 |             Algorithm = (
107 |                 CatBoostRegressor if self.ml_task == REGRESSION else CatBoostClassifier
108 |             )
109 |             if self.custom_eval_metric is not None:
110 |                 params["eval_metric"] = self.custom_eval_metric
111 |             model = Algorithm(**params)
112 | 
113 |             model.fit(
114 |                 self.X_train,
115 |                 self.y_train,
116 |                 sample_weight=self.sample_weight,
117 |                 early_stopping_rounds=self.early_stopping_rounds,
118 |                 eval_set=self.eval_set,
119 |                 verbose_eval=False,
120 |                 cat_features=self.cat_features,
121 |             )
122 | 
123 |             if self.ml_task == BINARY_CLASSIFICATION:
124 |                 preds = model.predict_proba(
125 |                     self.X_validation, ntree_end=model.best_iteration_ + 1
126 |                 )[:, 1]
127 |             elif self.ml_task == MULTICLASS_CLASSIFICATION:
128 |                 preds = model.predict_proba(
129 |                     self.X_validation, ntree_end=model.best_iteration_ + 1
130 |                 )
131 |             else:  # REGRESSION
132 |                 preds = model.predict(
133 |                     self.X_validation, ntree_end=model.best_iteration_ + 1
134 |                 )
135 | 
136 |             score = self.eval_metric(self.y_validation, preds)
137 |             if Metric.optimize_negative(self.eval_metric.name):
138 |                 score *= -1.0
139 | 
140 |         except optuna.exceptions.TrialPruned as e:
141 |             raise e
142 |         except Exception as e:
143 |             print("Exception in CatBoostObjective", str(e))
144 |             # import traceback
145 |             # print(traceback.format_exc())
146 |             return None
147 | 
148 |         return score
149 | 
```

--------------------------------------------------------------------------------
/supervised/validation/validator_kfold.py:
--------------------------------------------------------------------------------

```python
  1 | import gc
  2 | import logging
  3 | import os
  4 | import warnings
  5 | 
  6 | import numpy as np
  7 | 
  8 | log = logging.getLogger(__name__)
  9 | 
 10 | from sklearn.model_selection import KFold, StratifiedKFold
 11 | 
 12 | from supervised.exceptions import AutoMLException
 13 | from supervised.utils.utils import load_data
 14 | from supervised.validation.validator_base import BaseValidator
 15 | 
 16 | 
 17 | class KFoldValidator(BaseValidator):
 18 |     def __init__(self, params):
 19 |         BaseValidator.__init__(self, params)
 20 | 
 21 |         self.k_folds = self.params.get("k_folds", 5)
 22 |         self.shuffle = self.params.get("shuffle", True)
 23 |         self.stratify = self.params.get("stratify", False)
 24 |         self.random_seed = self.params.get("random_seed", 1906)
 25 |         self.repeats = self.params.get("repeats", 1)
 26 | 
 27 |         if not self.shuffle and self.repeats > 1:
 28 |             warnings.warn(
 29 |                 "Disable repeats in validation because shuffle is disabled", UserWarning
 30 |             )
 31 |             self.repeats = 1
 32 | 
 33 |         self.skf = []
 34 | 
 35 |         for r in range(self.repeats):
 36 |             random_seed = self.random_seed + r if self.shuffle else None
 37 |             if self.stratify:
 38 |                 if self.shuffle:
 39 |                     self.skf += [
 40 |                         StratifiedKFold(
 41 |                             n_splits=self.k_folds,
 42 |                             shuffle=self.shuffle,
 43 |                             random_state=random_seed,
 44 |                         )
 45 |                     ]
 46 |                 else:
 47 |                     self.skf += [
 48 |                         StratifiedKFold(
 49 |                             n_splits=self.k_folds,
 50 |                             shuffle=self.shuffle,
 51 |                             random_state=random_seed,
 52 |                         )
 53 |                     ]
 54 |             else:
 55 |                 self.skf += [
 56 |                     KFold(
 57 |                         n_splits=self.k_folds,
 58 |                         shuffle=self.shuffle,
 59 |                         random_state=random_seed,
 60 |                     )
 61 |                 ]
 62 | 
 63 |         self._results_path = self.params.get("results_path")
 64 |         self._X_path = self.params.get("X_path")
 65 |         self._y_path = self.params.get("y_path")
 66 |         self._sample_weight_path = self.params.get("sample_weight_path")
 67 |         self._sensitive_features_path = self.params.get("sensitive_features_path")
 68 | 
 69 |         if self._X_path is None or self._y_path is None:
 70 |             raise AutoMLException("No data path set in KFoldValidator params")
 71 | 
 72 |         folds_path = os.path.join(self._results_path, "folds")
 73 | 
 74 |         if not os.path.exists(folds_path):
 75 |             os.mkdir(folds_path)
 76 |             X = load_data(self._X_path)
 77 |             y = load_data(self._y_path)
 78 |             y = y["target"]
 79 | 
 80 |             if isinstance(y[0], bytes):
 81 |                 # see https://github.com/scikit-learn/scikit-learn/issues/16980
 82 |                 y = y.astype(str)
 83 | 
 84 |             for repeat_cnt, skf in enumerate(self.skf):
 85 |                 for fold_cnt, (train_index, validation_index) in enumerate(
 86 |                     skf.split(X, y)
 87 |                 ):
 88 |                     repeat_str = f"_repeat_{repeat_cnt}" if len(self.skf) > 1 else ""
 89 |                     train_index_file = os.path.join(
 90 |                         self._results_path,
 91 |                         "folds",
 92 |                         f"fold_{fold_cnt}{repeat_str}_train_indices.npy",
 93 |                     )
 94 |                     validation_index_file = os.path.join(
 95 |                         self._results_path,
 96 |                         "folds",
 97 |                         f"fold_{fold_cnt}{repeat_str}_validation_indices.npy",
 98 |                     )
 99 | 
100 |                     np.save(train_index_file, train_index)
101 |                     np.save(validation_index_file, validation_index)
102 |             del X
103 |             del y
104 |             gc.collect()
105 | 
106 |         else:
107 |             log.debug("Folds split already done, reuse it")
108 | 
109 |     def get_split(self, k, repeat=0):
110 |         repeat_str = f"_repeat_{repeat}" if self.repeats > 1 else ""
111 | 
112 |         train_index_file = os.path.join(
113 |             self._results_path, "folds", f"fold_{k}{repeat_str}_train_indices.npy"
114 |         )
115 |         validation_index_file = os.path.join(
116 |             self._results_path, "folds", f"fold_{k}{repeat_str}_validation_indices.npy"
117 |         )
118 | 
119 |         train_index = np.load(train_index_file)
120 |         validation_index = np.load(validation_index_file)
121 | 
122 |         X = load_data(self._X_path)
123 |         y = load_data(self._y_path)
124 |         y = y["target"]
125 | 
126 |         sample_weight = None
127 |         if self._sample_weight_path is not None:
128 |             sample_weight = load_data(self._sample_weight_path)
129 |             sample_weight = sample_weight["sample_weight"]
130 | 
131 |         sensitive_features = None
132 |         if self._sensitive_features_path is not None:
133 |             sensitive_features = load_data(self._sensitive_features_path)
134 | 
135 |         train_data = {"X": X.loc[train_index], "y": y.loc[train_index]}
136 |         validation_data = {"X": X.loc[validation_index], "y": y.loc[validation_index]}
137 |         if sample_weight is not None:
138 |             train_data["sample_weight"] = sample_weight.loc[train_index]
139 |             validation_data["sample_weight"] = sample_weight.loc[validation_index]
140 | 
141 |         if sensitive_features is not None:
142 |             train_data["sensitive_features"] = sensitive_features.loc[train_index]
143 |             validation_data["sensitive_features"] = sensitive_features.loc[
144 |                 validation_index
145 |             ]
146 | 
147 |         return (train_data, validation_data)
148 | 
149 |     def get_n_splits(self):
150 |         return self.k_folds
151 | 
152 |     def get_repeats(self):
153 |         return self.repeats
154 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/disable_eda.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import shutil
  3 | import unittest
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from sklearn import datasets
  8 | 
  9 | from supervised import AutoML
 10 | from supervised.preprocessing.eda import EDA
 11 | 
 12 | 
 13 | class EDATest(unittest.TestCase):
 14 |     automl_dir = "automl_tests"
 15 | 
 16 |     def tearDown(self):
 17 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
 18 | 
 19 |     def test_explain_default(self):
 20 |         a = AutoML(
 21 |             results_path=self.automl_dir,
 22 |             total_time_limit=5,
 23 |             algorithms=["Baseline"],
 24 |             train_ensemble=False,
 25 |             explain_level=2,
 26 |         )
 27 | 
 28 |         X, y = datasets.make_classification(n_samples=100, n_features=5)
 29 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
 30 |         y = pd.Series(y, name="class")
 31 | 
 32 |         a.fit(X, y)
 33 | 
 34 |         result_files = os.listdir(os.path.join(a._results_path, "EDA"))
 35 | 
 36 |         for col in X.columns:
 37 |             self.assertTrue(f"{col}.png" in result_files)
 38 |         self.assertTrue("target.png" in result_files)
 39 |         self.assertTrue("README.md" in result_files)
 40 | 
 41 |     def test_column_name_to_filename(self):
 42 |         """Valid feature name should be untouched"""
 43 |         col = "feature_1"
 44 |         self.assertEqual(EDA.prepare(col), col)
 45 | 
 46 |         self.tearDown()
 47 | 
 48 |     def test_extensive_eda(self):
 49 |         """
 50 |         Test for extensive_eda feature
 51 |         """
 52 | 
 53 |         X, y = datasets.make_regression(n_samples=100, n_features=5)
 54 | 
 55 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
 56 |         y = pd.Series(y, name="class")
 57 | 
 58 |         results_path = self.automl_dir
 59 |         EDA.extensive_eda(X, y, results_path)
 60 |         result_files = os.listdir(results_path)
 61 | 
 62 |         for col in X.columns:
 63 |             self.assertTrue(f"{col}_target.png" in result_files)
 64 |         self.assertTrue("heatmap.png" in result_files)
 65 |         self.assertTrue("Extensive_EDA.md" in result_files)
 66 | 
 67 |         X, y = datasets.make_classification(n_samples=100, n_features=5)
 68 | 
 69 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
 70 |         y = pd.Series(y, name="class")
 71 | 
 72 |         results_path = self.automl_dir
 73 |         EDA.extensive_eda(X, y, results_path)
 74 |         result_files = os.listdir(results_path)
 75 | 
 76 |         for col in X.columns:
 77 |             self.assertTrue(f"{col}_target.png" in result_files)
 78 |         self.assertTrue("heatmap.png" in result_files)
 79 |         self.assertTrue("Extensive_EDA.md" in result_files)
 80 | 
 81 |         self.tearDown()
 82 | 
 83 |     def test_extensive_eda_missing(self):
 84 |         """
 85 |         Test for dataframe with missing values
 86 |         """
 87 | 
 88 |         X, y = datasets.make_regression(n_samples=100, n_features=5)
 89 | 
 90 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
 91 |         y = pd.Series(y, name="class")
 92 | 
 93 |         ##add some nan values
 94 |         X.loc[np.random.randint(0, 100, 20), "f_0"] = np.nan
 95 | 
 96 |         results_path = self.automl_dir
 97 |         EDA.extensive_eda(X, y, results_path)
 98 |         result_files = os.listdir(results_path)
 99 | 
100 |         for col in X.columns:
101 |             self.assertTrue(f"{col}_target.png" in result_files)
102 |         self.assertTrue("heatmap.png" in result_files)
103 |         self.assertTrue("Extensive_EDA.md" in result_files)
104 | 
105 |         X, y = datasets.make_regression(n_samples=100, n_features=5)
106 | 
107 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
108 |         y = pd.Series(y, name="class")
109 | 
110 |         ##add some nan values
111 |         X.loc[np.random.randint(0, 100, 20), "f_0"] = np.nan
112 | 
113 |         results_path = self.automl_dir
114 |         EDA.extensive_eda(X, y, results_path)
115 |         result_files = os.listdir(results_path)
116 | 
117 |         for col in X.columns:
118 |             self.assertTrue(f"{col}_target.png" in result_files)
119 |         self.assertTrue("heatmap.png" in result_files)
120 |         self.assertTrue("Extensive_EDA.md" in result_files)
121 | 
122 |         self.tearDown()
123 | 
124 |     def test_symbol_feature(self):
125 |         """
126 |         Test for columns with forbidden filenames
127 |         """
128 | 
129 |         X, y = datasets.make_regression(n_samples=100, n_features=5)
130 | 
131 |         X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
132 |         X.rename({"f_0": "ff*", "f_1": "fg/"}, axis=1, inplace=True)
133 |         y = pd.Series(y, name="class")
134 | 
135 |         results_path = self.automl_dir
136 |         EDA.extensive_eda(X, y, results_path)
137 |         result_files = os.listdir(results_path)
138 | 
139 |         for col in X.columns:
140 |             self.assertTrue(EDA.plot_fname(f"{col}_target") in result_files)
141 |         self.assertTrue("heatmap.png" in result_files)
142 |         self.assertTrue("Extensive_EDA.md" in result_files)
143 | 
144 |         self.tearDown()
145 | 
146 |     def test_naughty_column_name_to_filename(self):
147 |         """Test with naughty strings.
148 |         String from https://github.com/minimaxir/big-list-of-naughty-strings"""
149 |         os.mkdir(self.automl_dir)
150 |         naughty_columns = [
151 |             "feature_1",
152 |             "*",
153 |             "😍",
154 |             "¯\_(ツ)_/¯",
155 |             "表",
156 |             "𠜎𠜱𠝹𠱓",
157 |             "عاملة بولندا",
158 |             "Ṱ̺̺̕o͞ ̷" "🇸🇦🇫🇦🇲",
159 |             "⁰⁴⁵",
160 |             "∆˚¬…æ",
161 |             "!@#$%^&*()`~",
162 |             "onfocus=JaVaSCript:alert(123) autofocus",
163 |             "`\"'><img src=xxx:x \x20onerror=javascript:alert(1)>",
164 |             'System("ls -al /")',
165 |             'Kernel.exec("ls -al /")',
166 |             "لُلُصّبُلُل" "{% print 'x' * 64 * 1024**3 %}",
167 |             '{{ "".__class__.__mro__[2].__subclasses__()[40]("/etc/passwd").read() }}',
168 |             "ÜBER Über German Umlaut",
169 |             "影師嗎",
170 |             "C'est déjà l'été." "Nín hǎo. Wǒ shì zhōng guó rén",
171 |             "Компьютер",
172 |             "jaja---lol-méméméoo--a",
173 |         ]
174 |         for col in naughty_columns:
175 |             fname = EDA.plot_path(self.automl_dir, col)
176 |             with open(fname, "w") as fout:
177 |                 fout.write("ok")
178 | 
179 |         self.tearDown()
180 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/linear.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | import os
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import sklearn
  7 | from sklearn.base import ClassifierMixin, RegressorMixin
  8 | from sklearn.linear_model import LinearRegression, LogisticRegression
  9 | 
 10 | from supervised.algorithms.registry import (
 11 |     BINARY_CLASSIFICATION,
 12 |     MULTICLASS_CLASSIFICATION,
 13 |     REGRESSION,
 14 |     AlgorithmsRegistry,
 15 | )
 16 | from supervised.algorithms.sklearn import SklearnAlgorithm
 17 | from supervised.utils.config import LOG_LEVEL
 18 | 
 19 | logger = logging.getLogger(__name__)  # module-level logger
 20 | logger.setLevel(LOG_LEVEL)  # level comes from supervised.utils.config
 21 | 
 22 | 
 23 | class LinearAlgorithm(ClassifierMixin, SklearnAlgorithm):
 24 |     algorithm_name = "Logistic Regression"
 25 |     algorithm_short_name = "Linear"
 26 | 
 27 |     def __init__(self, params):
 28 |         super(LinearAlgorithm, self).__init__(params)
 29 |         logger.debug("LinearAlgorithm.__init__")
 30 |         self.max_iters = 1
 31 |         self.library_version = sklearn.__version__
 32 |         self.model = LogisticRegression(
 33 |             max_iter=500, tol=5e-4, n_jobs=self.params.get("n_jobs", -1)
 34 |         )
 35 | 
 36 |     def is_fitted(self):
 37 |         return (
 38 |             hasattr(self.model, "coef_")
 39 |             and self.model.coef_ is not None
 40 |             and self.model.coef_.shape[0] > 0
 41 |         )
 42 | 
 43 |     def file_extension(self):
 44 |         return "linear"
 45 | 
 46 |     def interpret(
 47 |         self,
 48 |         X_train,
 49 |         y_train,
 50 |         X_validation,
 51 |         y_validation,
 52 |         model_file_path,
 53 |         learner_name,
 54 |         target_name=None,
 55 |         class_names=None,
 56 |         metric_name=None,
 57 |         ml_task=None,
 58 |         explain_level=2,
 59 |     ):
 60 |         super(LinearAlgorithm, self).interpret(
 61 |             X_train,
 62 |             y_train,
 63 |             X_validation,
 64 |             y_validation,
 65 |             model_file_path,
 66 |             learner_name,
 67 |             target_name,
 68 |             class_names,
 69 |             metric_name,
 70 |             ml_task,
 71 |             explain_level,
 72 |         )
 73 |         if explain_level == 0:
 74 |             return
 75 |         if X_train.shape[1] > 100:
 76 |             # if too many columns, skip this step
 77 |             return
 78 |         coefs = self.model.coef_
 79 |         intercept = self.model.intercept_
 80 |         if self.params["ml_task"] == BINARY_CLASSIFICATION:
 81 |             df = pd.DataFrame(
 82 |                 {
 83 |                     "feature": ["intercept"] + X_train.columns.tolist(),
 84 |                     "weight": [intercept[0]] + list(coefs[0, :]),
 85 |                 }
 86 |             )
 87 |             df.to_csv(
 88 |                 os.path.join(model_file_path, f"{learner_name}_coefs.csv"), index=False
 89 |             )
 90 |         elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION:
 91 |             classes = list(class_names)
 92 |             if isinstance(class_names, dict):
 93 |                 classes = class_names.values()
 94 |             if len(classes) > 20:
 95 |                 # if there are too many classes, skip this step
 96 |                 return
 97 |             df = pd.DataFrame(
 98 |                 np.transpose(np.column_stack((intercept, coefs))),
 99 |                 index=["intercept"] + X_train.columns.tolist(),
100 |                 columns=classes,
101 |             )
102 |             df.to_csv(
103 |                 os.path.join(model_file_path, f"{learner_name}_coefs.csv"), index=True
104 |             )
105 | 
106 | 
107 | class LinearRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
108 |     algorithm_name = "Linear Regression"
109 |     algorithm_short_name = "Linear"
110 | 
111 |     def __init__(self, params):
112 |         super(LinearRegressorAlgorithm, self).__init__(params)
113 |         logger.debug("LinearRegressorAlgorithm.__init__")
114 |         self.max_iters = 1
115 |         self.library_version = sklearn.__version__
116 |         self.model = LinearRegression(n_jobs=self.params.get("n_jobs", -1))
117 | 
118 |     def is_fitted(self):
119 |         return (
120 |             hasattr(self.model, "coef_")
121 |             and self.model.coef_ is not None
122 |             and self.model.coef_.shape[0] > 0
123 |         )
124 | 
125 |     def file_extension(self):
126 |         return "linear"
127 | 
128 |     def interpret(
129 |         self,
130 |         X_train,
131 |         y_train,
132 |         X_validation,
133 |         y_validation,
134 |         model_file_path,
135 |         learner_name,
136 |         target_name=None,
137 |         class_names=None,
138 |         metric_name=None,
139 |         ml_task=None,
140 |         explain_level=2,
141 |     ):
142 |         super(LinearRegressorAlgorithm, self).interpret(
143 |             X_train,
144 |             y_train,
145 |             X_validation,
146 |             y_validation,
147 |             model_file_path,
148 |             learner_name,
149 |             target_name,
150 |             class_names,
151 |             metric_name,
152 |             ml_task,
153 |             explain_level,
154 |         )
155 |         if explain_level == 0:
156 |             return
157 |         if X_train.shape[1] > 100:
158 |             # if too many columns, skip this step
159 |             return
160 |         coefs = self.model.coef_
161 |         intercept = self.model.intercept_
162 |         df = pd.DataFrame(
163 |             {
164 |                 "feature": ["intercept"] + X_train.columns.tolist(),
165 |                 "weight": [intercept] + list(coefs),
166 |             }
167 |         )
168 |         df.to_csv(
169 |             os.path.join(model_file_path, f"{learner_name}_coefs.csv"), index=False
170 |         )
171 | 
172 | 
173 | additional = {"max_steps": 1, "max_rows_limit": None, "max_cols_limit": None}
174 | required_preprocessing = [
175 |     "missing_values_inputation",
176 |     "convert_categorical",
177 |     "datetime_transform",
178 |     "text_transform",
179 |     "scale",
180 |     "target_as_integer",
181 | ]
182 | 
183 | AlgorithmsRegistry.add(
184 |     BINARY_CLASSIFICATION, LinearAlgorithm, {}, required_preprocessing, additional, {}
185 | )
186 | AlgorithmsRegistry.add(
187 |     MULTICLASS_CLASSIFICATION,
188 |     LinearAlgorithm,
189 |     {},
190 |     required_preprocessing,
191 |     additional,
192 |     {},
193 | )
194 | 
195 | regression_required_preprocessing = [
196 |     "missing_values_inputation",
197 |     "convert_categorical",
198 |     "datetime_transform",
199 |     "text_transform",
200 |     "scale",
201 |     "target_scale",
202 | ]
203 | 
204 | AlgorithmsRegistry.add(
205 |     REGRESSION,
206 |     LinearRegressorAlgorithm,
207 |     {},
208 |     regression_required_preprocessing,
209 |     additional,
210 |     {},
211 | )
212 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/lightgbm.py:
--------------------------------------------------------------------------------

```python
  1 | import lightgbm as lgb
  2 | import numpy as np
  3 | import optuna
  4 | import optuna_integration
  5 | import pandas as pd
  6 | 
  7 | from supervised.algorithms.lightgbm import lightgbm_eval_metric, lightgbm_objective
  8 | from supervised.algorithms.registry import (
  9 |     MULTICLASS_CLASSIFICATION,
 10 | )
 11 | from supervised.utils.metric import (
 12 |     Metric,
 13 |     lightgbm_eval_metric_accuracy,
 14 |     lightgbm_eval_metric_average_precision,
 15 |     lightgbm_eval_metric_f1,
 16 |     lightgbm_eval_metric_pearson,
 17 |     lightgbm_eval_metric_r2,
 18 |     lightgbm_eval_metric_spearman,
 19 |     lightgbm_eval_metric_user_defined,
 20 | )
 21 | 
 22 | EPS = 1e-8  # small offset: added to the upper bound of the fraction ranges, lower bound of cat_l2/cat_smooth
 23 | 
 24 | 
 25 | class LightgbmObjective:
 26 |     def __init__(
 27 |         self,
 28 |         ml_task,
 29 |         X_train,
 30 |         y_train,
 31 |         sample_weight,
 32 |         X_validation,
 33 |         y_validation,
 34 |         sample_weight_validation,
 35 |         eval_metric,
 36 |         cat_features_indices,
 37 |         n_jobs,
 38 |         random_state,
 39 |     ):
 40 |         self.X_train = X_train
 41 |         self.y_train = y_train
 42 |         self.sample_weight = sample_weight
 43 |         self.X_validation = X_validation
 44 |         self.y_validation = y_validation
 45 |         self.sample_weight_validation = sample_weight_validation
 46 |         self.dtrain = lgb.Dataset(
 47 |             self.X_train.to_numpy()
 48 |             if isinstance(self.X_train, pd.DataFrame)
 49 |             else self.X_train,
 50 |             label=self.y_train,
 51 |             weight=self.sample_weight,
 52 |         )
 53 |         self.dvalid = lgb.Dataset(
 54 |             self.X_validation.to_numpy()
 55 |             if isinstance(self.X_validation, pd.DataFrame)
 56 |             else self.X_validation,
 57 |             label=self.y_validation,
 58 |             weight=self.sample_weight_validation,
 59 |         )
 60 | 
 61 |         self.cat_features_indices = cat_features_indices
 62 |         self.eval_metric = eval_metric
 63 |         self.learning_rate = 0.025
 64 |         self.rounds = 1000
 65 |         self.early_stopping_rounds = 50
 66 |         self.seed = random_state
 67 | 
 68 |         self.n_jobs = n_jobs
 69 |         if n_jobs == -1:
 70 |             self.n_jobs = 0
 71 | 
 72 |         self.objective = ""
 73 |         self.eval_metric_name = ""
 74 | 
 75 |         self.eval_metric_name, self.custom_eval_metric_name = lightgbm_eval_metric(
 76 |             ml_task, eval_metric.name
 77 |         )
 78 | 
 79 |         self.custom_eval_metric = None
 80 |         if self.eval_metric.name == "r2":
 81 |             self.custom_eval_metric = lightgbm_eval_metric_r2
 82 |         elif self.eval_metric.name == "spearman":
 83 |             self.custom_eval_metric = lightgbm_eval_metric_spearman
 84 |         elif self.eval_metric.name == "pearson":
 85 |             self.custom_eval_metric = lightgbm_eval_metric_pearson
 86 |         elif self.eval_metric.name == "f1":
 87 |             self.custom_eval_metric = lightgbm_eval_metric_f1
 88 |         elif self.eval_metric.name == "average_precision":
 89 |             self.custom_eval_metric = lightgbm_eval_metric_average_precision
 90 |         elif self.eval_metric.name == "accuracy":
 91 |             self.custom_eval_metric = lightgbm_eval_metric_accuracy
 92 |         elif self.eval_metric.name == "user_defined_metric":
 93 |             self.custom_eval_metric = lightgbm_eval_metric_user_defined
 94 | 
 95 |         self.num_class = (
 96 |             len(np.unique(y_train)) if ml_task == MULTICLASS_CLASSIFICATION else None
 97 |         )
 98 |         self.objective = lightgbm_objective(ml_task, eval_metric.name)
 99 | 
100 |     def __call__(self, trial):
101 |         param = {
102 |             "objective": self.objective,
103 |             "metric": self.eval_metric_name,
104 |             "verbosity": -1,
105 |             "boosting_type": "gbdt",
106 |             "learning_rate": trial.suggest_categorical(
107 |                 "learning_rate", [0.0125, 0.025, 0.05, 0.1]
108 |             ),
109 |             "num_leaves": trial.suggest_int("num_leaves", 2, 2048),
110 |             "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
111 |             "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
112 |             "feature_fraction": min(
113 |                 trial.suggest_float("feature_fraction", 0.3, 1.0 + EPS), 1.0
114 |             ),
115 |             "bagging_fraction": min(
116 |                 trial.suggest_float("bagging_fraction", 0.3, 1.0 + EPS), 1.0
117 |             ),
118 |             "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
119 |             "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
120 |             "feature_pre_filter": False,
121 |             "seed": self.seed,
122 |             "num_threads": self.n_jobs,
123 |             "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
124 |         }
125 | 
126 |         if self.cat_features_indices:
127 |             param["cat_feature"] = self.cat_features_indices
128 |             param["cat_l2"] = trial.suggest_float("cat_l2", EPS, 100.0)
129 |             param["cat_smooth"] = trial.suggest_float("cat_smooth", EPS, 100.0)
130 | 
131 |         if self.num_class is not None:
132 |             param["num_class"] = self.num_class
133 | 
134 |         try:
135 |             metric_name = self.eval_metric_name
136 |             if metric_name == "custom":
137 |                 metric_name = self.custom_eval_metric_name
138 |             pruning_callback = optuna_integration.LightGBMPruningCallback(
139 |                 trial, metric_name, "validation"
140 |             )
141 |             early_stopping_callback = lgb.early_stopping(
142 |                 self.early_stopping_rounds, verbose=False
143 |             )
144 | 
145 |             gbm = lgb.train(
146 |                 param,
147 |                 self.dtrain,
148 |                 valid_sets=[self.dvalid],
149 |                 valid_names=["validation"],
150 |                 callbacks=[pruning_callback, early_stopping_callback],
151 |                 num_boost_round=self.rounds,
152 |                 feval=self.custom_eval_metric,
153 |             )
154 | 
155 |             preds = gbm.predict(self.X_validation)
156 |             score = self.eval_metric(self.y_validation, preds)
157 |             if Metric.optimize_negative(self.eval_metric.name):
158 |                 score *= -1.0
159 |         except optuna.exceptions.TrialPruned as e:
160 |             raise e
161 |         except Exception as e:
162 |             print("Exception in LightgbmObjective", str(e))
163 |             return None
164 | 
165 |         return score
166 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/preprocessing_tuner.py:
--------------------------------------------------------------------------------

```python
  1 | from supervised.algorithms.registry import (
  2 |     BINARY_CLASSIFICATION,
  3 |     MULTICLASS_CLASSIFICATION,
  4 |     REGRESSION,
  5 | )
  6 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
  7 | from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues
  8 | from supervised.preprocessing.scale import Scale
  9 | 
 10 | 
 11 | class PreprocessingTuner:
 12 | 
 13 |     """
 14 |     This class prepare configuration for data preprocessing
 15 |     """
 16 | 
 17 |     CATEGORICALS_MIX = "categorical_mix"  # mix int and one-hot
 18 |     CATEGORICALS_ALL_INT = "categoricals_all_integers"
 19 | 
 20 |     @staticmethod
 21 |     def get(
 22 |         required_preprocessing,
 23 |         data_info,
 24 |         machinelearning_task,
 25 |         categorical_strategy=CATEGORICALS_ALL_INT,
 26 |     ):
 27 |         columns_preprocessing = {}
 28 |         columns_info = data_info["columns_info"]
 29 | 
 30 |         for col, preprocessing_needed in columns_info.items():
 31 |             preprocessing_to_apply = []
 32 | 
 33 |             # remove empty columns and columns with only one variable
 34 |             if (
 35 |                 "empty_column" in preprocessing_needed
 36 |                 or "constant_column" in preprocessing_needed
 37 |             ):
 38 |                 preprocessing_to_apply += ["remove_column"]
 39 |                 columns_preprocessing[col] = preprocessing_to_apply
 40 |                 continue
 41 | 
 42 |             # always check for missing values
 43 |             if (
 44 |                 "missing_values_inputation" in required_preprocessing
 45 |                 and "missing_values" in preprocessing_needed
 46 |             ):
 47 |                 preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
 48 |             # convert to categorical only for categorical types
 49 |             convert_to_integer_will_be_applied = False
 50 |             if (
 51 |                 "convert_categorical"
 52 |                 in required_preprocessing  # the algorithm needs converted categoricals
 53 |                 and "categorical" in preprocessing_needed  # the feature is categorical
 54 |             ):
 55 |                 if categorical_strategy == PreprocessingTuner.CATEGORICALS_MIX:
 56 |                     if PreprocessingCategorical.MANY_CATEGORIES in preprocessing_needed:
 57 |                         preprocessing_to_apply += [
 58 |                             PreprocessingCategorical.CONVERT_INTEGER
 59 |                         ]
 60 |                         convert_to_integer_will_be_applied = True  # maybe scale needed
 61 |                     else:
 62 |                         preprocessing_to_apply += [
 63 |                             PreprocessingCategorical.CONVERT_ONE_HOT
 64 |                         ]
 65 |                 else:  # all integers
 66 |                     preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
 67 |                     convert_to_integer_will_be_applied = True  # maybe scale needed
 68 | 
 69 |                 """
 70 |                 if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
 71 |                     preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
 72 |                 elif PreprocessingCategorical.CONVERT_LOO in preprocessing_needed:
 73 |                     preprocessing_to_apply += [PreprocessingCategorical.CONVERT_LOO]
 74 |                     convert_to_integer_will_be_applied = True  # maybe scale needed
 75 |                 else:
 76 |                     preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
 77 |                     convert_to_integer_will_be_applied = True  # maybe scale needed
 78 |                 """
 79 | 
 80 |             if (
 81 |                 "datetime_transform" in required_preprocessing
 82 |                 and "datetime_transform" in preprocessing_needed
 83 |             ):
 84 |                 preprocessing_to_apply += ["datetime_transform"]
 85 |             if (
 86 |                 "text_transform" in required_preprocessing
 87 |                 and "text_transform" in preprocessing_needed
 88 |             ):
 89 |                 preprocessing_to_apply += ["text_transform"]
 90 | 
 91 |             if "scale" in required_preprocessing:
 92 |                 if (
 93 |                     convert_to_integer_will_be_applied
 94 |                     or "scale" in preprocessing_needed
 95 |                 ):
 96 |                     preprocessing_to_apply += [Scale.SCALE_NORMAL]
 97 | 
 98 |             # remeber which preprocessing we need to apply
 99 |             if preprocessing_to_apply:
100 |                 columns_preprocessing[col] = preprocessing_to_apply
101 | 
102 |         target_info = data_info["target_info"]
103 |         target_preprocessing = []
104 |         # always remove missing values from target,
105 |         # target with missing values might be in the train and in the validation datasets
106 |         target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]
107 | 
108 |         if "target_as_integer" in required_preprocessing:
109 |             if machinelearning_task == BINARY_CLASSIFICATION:
110 |                 if "convert_0_1" in target_info:
111 |                     target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
112 | 
113 |             if machinelearning_task == MULTICLASS_CLASSIFICATION:
114 |                 # if PreprocessingUtils.is_categorical(y):
115 |                 # always convert to integer, there can be many situations that can break
116 |                 # for example, classes starting from 1, ...
117 |                 # or classes not for every number, for example 0,2,3,4
118 |                 # just always convert
119 |                 target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]
120 | 
121 |         elif "target_as_one_hot" in required_preprocessing:
122 |             target_preprocessing += [PreprocessingCategorical.CONVERT_ONE_HOT]
123 | 
124 |         if (
125 |             machinelearning_task == REGRESSION
126 |             and "target_scale" in required_preprocessing
127 |         ):
128 |             if "scale_log" in target_info:
129 |                 target_preprocessing += [Scale.SCALE_LOG_AND_NORMAL]
130 |             elif "scale" in target_info:
131 |                 target_preprocessing += [Scale.SCALE_NORMAL]
132 | 
133 |         return {
134 |             "columns_preprocessing": columns_preprocessing,
135 |             "target_preprocessing": target_preprocessing,
136 |             "ml_task": machinelearning_task,
137 |         }
138 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/sklearn.py:
--------------------------------------------------------------------------------

```python
  1 | import copy
  2 | import logging
  3 | import time
  4 | import warnings
  5 | 
  6 | import joblib
  7 | import numpy as np
  8 | import pandas as pd
  9 | 
 10 | from supervised.algorithms.algorithm import BaseAlgorithm
 11 | from supervised.algorithms.registry import (
 12 |     BINARY_CLASSIFICATION,
 13 |     MULTICLASS_CLASSIFICATION,
 14 |     REGRESSION,
 15 | )
 16 | from supervised.utils.config import LOG_LEVEL
 17 | 
 18 | logger = logging.getLogger(__name__)  # module-level logger
 19 | logger.setLevel(LOG_LEVEL)  # level comes from supervised.utils.config
 20 | 
 21 | 
 22 | class SklearnAlgorithm(BaseAlgorithm):  # shared fit/save/load/predict wrapper around the estimator held in self.model
 23 |     def __init__(self, params):  # params is forwarded unchanged to BaseAlgorithm
 24 |         super(SklearnAlgorithm, self).__init__(params)
 25 | 
 26 |     def fit(  # fit self.model on (X, y); the validation/logging/time arguments are accepted but unused here
 27 |         self,
 28 |         X,
 29 |         y,
 30 |         sample_weight=None,
 31 |         X_validation=None,
 32 |         y_validation=None,
 33 |         sample_weight_validation=None,
 34 |         log_to_file=None,
 35 |         max_time=None,
 36 |     ):
 37 |         with warnings.catch_warnings():  # warning filters are restored when the block exits
 38 |             warnings.simplefilter(action="ignore")  # silence every warning raised while fitting
 39 |             self.model.fit(X, y, sample_weight=sample_weight)
 40 |             if self.params["ml_task"] != REGRESSION:  # classification tasks only
 41 |                 self.classes_ = np.unique(y)  # sorted distinct labels seen in y
 42 | 
 43 |     def copy(self):  # independent deep copy of the whole wrapper
 44 |         return copy.deepcopy(self)
 45 | 
 46 |     def save(self, model_file_path):  # persist only the estimator, not the wrapper
 47 |         logger.debug("SklearnAlgorithm save to {0}".format(model_file_path))
 48 |         joblib.dump(self.model, model_file_path, compress=True)
 49 |         self.model_file_path = model_file_path  # remember where the model was written
 50 | 
 51 |     def load(self, model_file_path):  # NOTE(review): joblib.load unpickles the file; only load trusted paths
 52 |         logger.debug("SklearnAlgorithm loading model from {0}".format(model_file_path))
 53 |         self.model = joblib.load(model_file_path)
 54 |         self.model_file_path = model_file_path  # remember where the model was read from
 55 | 
 56 |     def is_fitted(self):  # True once the estimator exposes a positive n_features_in_
 57 |         return (
 58 |             hasattr(self.model, "n_features_in_")
 59 |             and self.model.n_features_in_ is not None
 60 |             and self.model.n_features_in_ > 0
 61 |         )
 62 | 
 63 |     def predict(self, X):  # output shape depends on params["ml_task"]
 64 |         self.reload()  # not defined in this class; presumably inherited from BaseAlgorithm — TODO confirm
 65 |         if self.params["ml_task"] == BINARY_CLASSIFICATION:
 66 |             return self.model.predict_proba(X)[:, 1]  # binary: column 1 of predict_proba
 67 |         elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION:
 68 |             return self.model.predict_proba(X)  # multiclass: full predict_proba output
 69 |         return self.model.predict(X)  # any other task: raw predict output
 70 | 
 71 | 
 72 | from supervised.utils.metric import Metric
 73 | 
 74 | 
 75 | def predict_proba_function_binary(estimator, X):  # per-estimator helper for binary tasks
 76 |     return estimator.predict_proba(X)[:, 1]  # keep only column 1 of predict_proba
 77 | 
 78 | 
 79 | def predict_proba_function_multiclass(estimator, X):  # per-estimator helper for multiclass tasks
 80 |     return estimator.predict_proba(X)  # full predict_proba output, one column per class
 81 | 
 82 | 
 83 | class SklearnTreesEnsembleClassifierAlgorithm(SklearnAlgorithm):  # grows a tree ensemble in steps and early-stops on a validation metric
 84 |     def __init__(self, params):
 85 |         super(SklearnTreesEnsembleClassifierAlgorithm, self).__init__(params)
 86 |         self.log_metric = Metric(  # metric used in fit for the learning curve and early stopping
 87 |             {"name": self.params.get("eval_metric_name", "logloss")}
 88 |         )
 89 |         self.max_iters = (
 90 |             1  # max iters is used by model_framework, max_steps is used internally
 91 |         )
 92 |         if params.get("ml_task") == BINARY_CLASSIFICATION:  # pick the per-tree prediction helper for the task
 93 |             self.predict_function = predict_proba_function_binary
 94 |         else:
 95 |             self.predict_function = predict_proba_function_multiclass
 96 | 
 97 |     def fit(  # step-wise fit; reads self.model, max_steps, trees_in_step, early_stopping_rounds (none set in this class)
 98 |         self,
 99 |         X,
100 |         y,
101 |         sample_weight=None,
102 |         X_validation=None,
103 |         y_validation=None,
104 |         sample_weight_validation=None,
105 |         log_to_file=None,
106 |         max_time=None,
107 |     ):
108 |         max_steps = self.max_steps  # upper bound on the number of fit steps
109 |         n_estimators = 0  # number of trees already scored in earlier steps
110 | 
111 |         min_val = 10e12  # lowest validation score seen so far
112 |         min_e = 0  # index of the tree at which min_val was reached
113 | 
114 |         p_tr, p_vd = None, None  # running sums of per-tree predictions (train, validation)
115 |         result = {"iteration": [], "train": [], "validation": []}  # learning-curve rows
116 | 
117 |         start_time = time.time()  # only referenced by the disabled max_time check below
118 |         with warnings.catch_warnings():
119 |             warnings.simplefilter(action="ignore")  # silence every warning raised while fitting
120 | 
121 |             for i in range(max_steps):
122 |                 self.model.fit(X, np.ravel(y), sample_weight=sample_weight)  # y is flattened to 1-D
123 |                 self.model.n_estimators += self.trees_in_step  # ask for more trees on the next fit; presumably warm_start is on — TODO confirm
124 | 
125 |                 if X_validation is None or y_validation is None:  # no validation data: no scoring, no early stopping
126 |                     continue
127 |                 estimators = self.model.estimators_
128 | 
129 |                 stop = False
130 |                 for e in range(n_estimators, len(estimators)):  # score only the trees not scored before
131 |                     p = self.predict_function(estimators[e], X)
132 |                     if p_tr is None:
133 |                         p_tr = p
134 |                     else:
135 |                         p_tr += p  # in-place accumulation of train predictions
136 | 
137 |                     p = self.predict_function(estimators[e], X_validation)
138 |                     if p_vd is None:
139 |                         p_vd = p
140 |                     else:
141 |                         p_vd += p  # in-place accumulation of validation predictions
142 | 
143 |                     tr = self.log_metric(  # sum / (e + 1) is the mean prediction of trees 0..e
144 |                         y, p_tr / float(e + 1), sample_weight=sample_weight
145 |                     )
146 |                     vd = self.log_metric(
147 |                         y_validation,
148 |                         p_vd / float(e + 1),
149 |                         sample_weight=sample_weight_validation,
150 |                     )
151 | 
152 |                     if vd < min_val:  # optimize direction
153 |                         min_val = vd
154 |                         min_e = e
155 | 
156 |                     if e - min_e >= self.early_stopping_rounds:  # no improvement for early_stopping_rounds trees
157 |                         stop = True
158 |                         break  # the scores of this last tree are not appended to result
159 | 
160 |                     result["iteration"] += [e]
161 |                     result["train"] += [tr]
162 |                     result["validation"] += [vd]
163 | 
164 |                 # disable for now ...
165 |                 # if max_time is not None and time.time()-start_time > max_time:
166 |                 #    stop = True
167 | 
168 |                 if stop:
169 |                     self.model.estimators_ = estimators[: (min_e + 1)]  # keep trees up to and including the best one
170 |                     break
171 |                 n_estimators = len(estimators)  # the next step starts after the trees scored so far
172 | 
173 |         if log_to_file is not None:  # the learning curve is written only when a path is given
174 |             df_result = pd.DataFrame(result)
175 |             if self.log_metric.is_negative():  # flip the sign back; is_negative() is defined in Metric, not shown here
176 |                 df_result["train"] *= -1.0
177 |                 df_result["validation"] *= -1.0
178 |             df_result.to_csv(log_to_file, index=False, header=False)  # columns: iteration, train, validation
179 | 
180 |         self.classes_ = np.unique(y)  # sorted distinct values seen in y
181 | 
182 |     def get_metric_name(self):  # same lookup and default as used for self.log_metric in __init__
183 |         return self.params.get("eval_metric_name", "logloss")
184 | 
185 | 
186 | def predict_function(estimator, X):  # per-estimator helper used by the regressor class below
187 |     return estimator.predict(X)  # raw predict output of a single estimator
188 | 
189 | 
190 | class SklearnTreesEnsembleRegressorAlgorithm(SklearnTreesEnsembleClassifierAlgorithm):  # reuses the parent's step-wise fit
191 |     def __init__(self, params):
192 |         super(SklearnTreesEnsembleRegressorAlgorithm, self).__init__(params)
193 |         self.log_metric = Metric({"name": self.params.get("eval_metric_name", "rmse")})  # replaces the parent's logloss default
194 |         self.predict_function = predict_function  # per-tree predict() instead of predict_proba()
195 | 
196 |     def get_metric_name(self):  # same lookup and default as used for self.log_metric above
197 |         return self.params.get("eval_metric_name", "rmse")
198 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_preprocessing_missing.py:
--------------------------------------------------------------------------------

```python
  1 | import unittest
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | 
  6 | from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues
  7 | 
  8 | 
  9 | class PreprocessingMissingValuesTest(unittest.TestCase):
 10 |     def test_preprocessing_constructor(self):
 11 |         """
 12 |         Check if PreprocessingMissingValues object is properly initialized
 13 |         """
 14 |         preprocess_missing = PreprocessingMissingValues(
 15 |             PreprocessingMissingValues.FILL_NA_MEDIAN
 16 |         )
 17 |         self.assertEqual(
 18 |             preprocess_missing._na_fill_method,
 19 |             PreprocessingMissingValues.FILL_NA_MEDIAN,
 20 |         )
 21 |         self.assertEqual(preprocess_missing._na_fill_params, {})
 22 | 
 23 |     def test_get_fill_value(self):
 24 |         """
 25 |         Check if correct value is returned for filling in case of different
 26 |         column type and fill method
 27 |         """
 28 |         d = {"col1": [1, 2, 3, np.nan, np.nan], "col2": ["a", "a", np.nan, "b", "c"]}
 29 |         df = pd.DataFrame(data=d)
 30 |         # fill with median
 31 |         preprocess_missing = PreprocessingMissingValues(
 32 |             df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
 33 |         )
 34 |         self.assertEqual(preprocess_missing._get_fill_value(df["col1"]), 2)
 35 |         self.assertEqual(preprocess_missing._get_fill_value(df["col2"]), "a")
 36 |         # fill with mean
 37 |         preprocess_missing = PreprocessingMissingValues(
 38 |             df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
 39 |         )
 40 |         self.assertEqual(preprocess_missing._get_fill_value(df["col1"]), 2)
 41 |         self.assertEqual(preprocess_missing._get_fill_value(df["col2"]), "a")
 42 |         # fill with min
 43 |         preprocess_missing = PreprocessingMissingValues(
 44 |             df.columns, PreprocessingMissingValues.FILL_NA_MIN
 45 |         )
 46 |         self.assertEqual(preprocess_missing._get_fill_value(df["col1"]), 0)
 47 |         self.assertEqual(
 48 |             preprocess_missing._get_fill_value(df["col2"]), "_missing_value_"
 49 |         )  # added new value
 50 | 
 51 |     def test_fit_na_fill(self):
 52 |         """
 53 |         Check fit private method
 54 |         """
 55 |         d = {
 56 |             "col1": [1, 2, 3, np.nan, np.nan],
 57 |             "col2": ["a", "a", np.nan, "b", "c"],
 58 |             "col3": ["a", "a", "d", "b", "c"],
 59 |         }
 60 |         df = pd.DataFrame(data=d)
 61 |         # fill with median
 62 |         preprocess_missing = PreprocessingMissingValues(
 63 |             df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
 64 |         )
 65 |         preprocess_missing._fit_na_fill(df)
 66 |         self.assertTrue("col1" in preprocess_missing._na_fill_params)
 67 |         self.assertTrue("col2" in preprocess_missing._na_fill_params)
 68 |         self.assertTrue("col3" not in preprocess_missing._na_fill_params)
 69 |         self.assertEqual(2, preprocess_missing._na_fill_params["col1"])
 70 |         self.assertEqual("a", preprocess_missing._na_fill_params["col2"])
 71 |         # fill with mean
 72 |         preprocess_missing = PreprocessingMissingValues(
 73 |             df.columns, PreprocessingMissingValues.FILL_NA_MEAN
 74 |         )
 75 |         preprocess_missing._fit_na_fill(df)
 76 |         self.assertTrue("col1" in preprocess_missing._na_fill_params)
 77 |         self.assertTrue("col2" in preprocess_missing._na_fill_params)
 78 |         self.assertTrue("col3" not in preprocess_missing._na_fill_params)
 79 |         self.assertEqual(2, preprocess_missing._na_fill_params["col1"])
 80 |         self.assertEqual("a", preprocess_missing._na_fill_params["col2"])
 81 |         # fill with min
 82 |         preprocess_missing = PreprocessingMissingValues(
 83 |             df.columns, PreprocessingMissingValues.FILL_NA_MIN
 84 |         )
 85 |         preprocess_missing._fit_na_fill(df)
 86 |         self.assertTrue("col1" in preprocess_missing._na_fill_params)
 87 |         self.assertTrue("col2" in preprocess_missing._na_fill_params)
 88 |         self.assertTrue("col3" not in preprocess_missing._na_fill_params)
 89 |         self.assertEqual(0, preprocess_missing._na_fill_params["col1"])
 90 |         self.assertEqual("_missing_value_", preprocess_missing._na_fill_params["col2"])
 91 | 
 92 |     def test_transform(self):
 93 |         """
 94 |         Check transform
 95 |         """
 96 |         # training data
 97 |         d = {
 98 |             "col1": [1, 2, 3, np.nan, np.nan],
 99 |             "col2": ["a", "a", np.nan, "a", "c"],
100 |             "col3": [1, 1, 3, 1, 1],
101 |             "col4": ["a", "a", "a", "c", "a"],
102 |         }
103 |         df = pd.DataFrame(data=d)
104 |         # test data
105 |         d_test = {
106 |             "col1": [1, 2, 3, np.nan, np.nan],
107 |             "col2": ["b", "b", np.nan, "b", "c"],
108 |             "col3": [1, 2, 2, np.nan, 2],
109 |             "col4": ["b", "b", np.nan, "b", "c"],
110 |         }
111 |         df_test = pd.DataFrame(data=d_test)
112 |         # fill with median
113 |         preprocess_missing = PreprocessingMissingValues(
114 |             df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
115 |         )
116 |         preprocess_missing.fit(df)
117 |         self.assertEqual(
118 |             2, len(preprocess_missing._na_fill_params)
119 |         )  # there should be only two columns
120 |         df_transformed = preprocess_missing.transform(df_test)
121 |         self.assertTrue(
122 |             np.isnan(df.loc[3, "col1"])
123 |         )  # training data frame is not filled
124 |         self.assertEqual(
125 |             2, df_test.loc[3, "col1"]
126 |         )  # data frame is filled after transform
127 |         self.assertEqual("a", df_test.loc[2, "col2"])
128 | 
129 |         # it is disabled, should be treated separately at the end of preprocessing
130 |         # columns without missing values in training set are also filled
131 |         # but they are filled based on their own values
132 |         # self.assertEqual(2, df_test.loc[3, "col3"])
133 |         # self.assertEqual("b", df_test.loc[3, "col4"])
134 | 
135 |     def test_transform_on_new_data(self):
136 |         # training data
137 |         d = {
138 |             "col1": [1, 1, np.nan, 3],
139 |             "col2": ["a", "a", np.nan, "a"],
140 |             "col3": [1, 1, 1, 3],
141 |             "col4": ["a", "a", "b", "c"],
142 |             "y": [0, 1, 1, 1],
143 |         }
144 |         df = pd.DataFrame(data=d)
145 |         X_train = df.loc[:, ["col1", "col2", "col3", "col4"]]
146 |         y_train = df.loc[:, "y"]
147 | 
148 |         d_test = {
149 |             "col1": [1, 1, np.nan, 3],
150 |             "col2": ["a", "a", np.nan, "a"],
151 |             "col3": [1, 1, 1, 3],
152 |             "col4": ["a", "a", "b", "c"],
153 |             "y": [np.nan, 1, np.nan, 1],
154 |         }
155 |         df_test = pd.DataFrame(data=d_test)
156 |         X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
157 |         y_test = df_test.loc[:, "y"]
158 | 
159 |         pm = PreprocessingMissingValues(
160 |             X_train.columns, PreprocessingMissingValues.FILL_NA_MEDIAN
161 |         )
162 |         pm.fit(X_train)
163 |         X_train = pm.transform(X_train)
164 |         X_test = pm.transform(X_test)
165 | 
166 |         self.assertEqual(1, X_test.loc[2, "col1"])
167 |         self.assertEqual("a", X_test.loc[2, "col2"])
168 | 
169 | 
170 | if __name__ == "__main__":  # allow running this test file directly
171 |     unittest.main()
172 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_categorical_integers.py:
--------------------------------------------------------------------------------

```python
  1 | import unittest
  2 | 
  3 | import pandas as pd
  4 | 
  5 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
  6 | 
  7 | import warnings
  8 | 
  9 | 
 10 | class CategoricalIntegersTest(unittest.TestCase):  # tests for PreprocessingCategorical with CONVERT_INTEGER
 11 |     def test_constructor_preprocessing_categorical(self):
 12 |         """
 13 |         Check if PreprocessingCategorical object is properly initialized
 14 |         """
 15 |         categorical = PreprocessingCategorical(
 16 |             [], PreprocessingCategorical.CONVERT_INTEGER
 17 |         )
 18 |         self.assertEqual(
 19 |             categorical._convert_method, PreprocessingCategorical.CONVERT_INTEGER
 20 |         )
 21 |         self.assertEqual(categorical._convert_params, {})  # nothing is fitted yet
 22 | 
 23 |     def test_fit_integers(self):  # fit must record the values of the string columns only
 24 |         # training data
 25 |         d = {
 26 |             "col1": [1, 2, 3],
 27 |             "col2": ["a", "a", "c"],
 28 |             "col3": [1, 1, 3],
 29 |             "col4": ["a", "b", "c"],
 30 |         }
 31 |         df = pd.DataFrame(data=d)
 32 |         categorical = PreprocessingCategorical(
 33 |             df.columns, PreprocessingCategorical.CONVERT_INTEGER
 34 |         )
 35 |         categorical.fit(df)
 36 | 
 37 |         self.assertTrue("col2" in categorical._convert_params)
 38 |         self.assertTrue("col4" in categorical._convert_params)
 39 |         self.assertTrue("a" in categorical._convert_params["col2"])
 40 |         self.assertTrue("c" in categorical._convert_params["col2"])
 41 |         self.assertTrue("b" not in categorical._convert_params["col2"])  # col2 never contains "b"
 42 |         self.assertTrue("a" in categorical._convert_params["col4"])
 43 |         self.assertTrue("b" in categorical._convert_params["col4"])
 44 |         self.assertTrue("c" in categorical._convert_params["col4"])
 45 | 
 46 |     def test_fit_transform_integers(self):  # transform must replace each category with its integer code
 47 |         # training data
 48 |         d = {
 49 |             "col1": [1, 2, 3],
 50 |             "col2": ["a", "a", "c"],
 51 |             "col3": [1, 1, 3],
 52 |             "col4": ["a", "b", "c"],
 53 |         }
 54 |         df = pd.DataFrame(data=d)
 55 |         categorical = PreprocessingCategorical(
 56 |             df.columns, PreprocessingCategorical.CONVERT_INTEGER
 57 |         )
 58 |         categorical.fit(df)
 59 |         df = categorical.transform(df)
 60 |         for col in ["col1", "col2", "col3", "col4"]:  # no column may be dropped
 61 |             self.assertTrue(col in df.columns)
 62 |         self.assertEqual(df["col2"][0], 0)  # col2: "a" -> 0
 63 |         self.assertEqual(df["col2"][1], 0)
 64 |         self.assertEqual(df["col2"][2], 1)  # col2: "c" -> 1
 65 |         self.assertEqual(df["col4"][0], 0)  # col4: "a" -> 0
 66 |         self.assertEqual(df["col4"][1], 1)  # col4: "b" -> 1
 67 |         self.assertEqual(df["col4"][2], 2)  # col4: "c" -> 2
 68 | 
 69 |     def test_future_warning_pandas_transform(self):  # passes when fit/transform emit no warning at all
 70 |         with warnings.catch_warnings():
 71 |             warnings.simplefilter("error")  # every warning raised inside becomes an exception
 72 | 
 73 |             # training data
 74 |             d = {
 75 |                 "col1": [False, True, True],
 76 |                 "col2": [False, False, True],
 77 |                 "col3": [True, False, True],
 78 |             }
 79 |             df = pd.DataFrame(data=d)
 80 |             categorical = PreprocessingCategorical(
 81 |                 df.columns, PreprocessingCategorical.CONVERT_INTEGER
 82 |             )
 83 |             categorical.fit(df)
 84 | 
 85 |             df = categorical.transform(df).astype(int)
 86 | 
 87 |     def test_future_warning_pandas_inverse_transform(self):  # same check, extended to inverse_transform
 88 |         with warnings.catch_warnings():
 89 |             warnings.simplefilter("error")  # every warning raised inside becomes an exception
 90 | 
 91 |             # training data
 92 |             d = {
 93 |                 "col1": [False, True, True],
 94 |                 "col2": [False, False, True],
 95 |                 "col3": [True, False, True],
 96 |             }
 97 |             df = pd.DataFrame(data=d)
 98 |             categorical = PreprocessingCategorical(
 99 |                 df.columns, PreprocessingCategorical.CONVERT_INTEGER
100 |             )
101 |             categorical.fit(df)
102 | 
103 |             df = categorical.transform(df).astype(int)
104 |             df = categorical.inverse_transform(df)
105 | 
106 |     def test_fit_transform_inverse_transform_integers(self):  # round trip must restore the original categories
107 |         # training data
108 |         d = {
109 |             "col1": [1, 2, 3],
110 |             "col2": ["a", "a", "c"],
111 |             "col3": [1, 1, 3],
112 |             "col4": ["a", "b", "c"],
113 |         }
114 |         df = pd.DataFrame(data=d)
115 |         categorical = PreprocessingCategorical(
116 |             df.columns, PreprocessingCategorical.CONVERT_INTEGER
117 |         )
118 |         categorical.fit(df)
119 |         df_transform = categorical.transform(df).astype(int)
120 |         df_inverse = categorical.inverse_transform(df_transform)
121 |         for col in ["col1", "col2", "col3", "col4"]:  # no column may be dropped
122 |             self.assertTrue(col in df_inverse.columns)
123 |         self.assertEqual(d["col2"][0], df_inverse["col2"][0])  # compare against the raw input dict
124 |         self.assertEqual(d["col2"][1], df_inverse["col2"][1])
125 |         self.assertEqual(d["col2"][2], df_inverse["col2"][2])
126 |         self.assertEqual(d["col4"][0], df_inverse["col4"][0])
127 |         self.assertEqual(d["col4"][1], df_inverse["col4"][1])
128 |         self.assertEqual(d["col4"][2], df_inverse["col4"][2])
129 | 
130 |     def test_fit_transform_integers_with_new_values(self):  # categories unseen during fit must still get a code
131 |         # training data
132 |         d_train = {
133 |             "col1": [1, 2, 3],
134 |             "col2": ["a", "a", "c"],
135 |             "col3": [1, 1, 3],
136 |             "col4": ["a", "b", "c"],
137 |         }
138 |         df_train = pd.DataFrame(data=d_train)
139 |         categorical = PreprocessingCategorical(
140 |             df_train.columns, PreprocessingCategorical.CONVERT_INTEGER
141 |         )
142 |         categorical.fit(df_train)
143 |         # testing data
144 |         d = {
145 |             "col1": [1, 2, 3],
146 |             "col2": ["a", "d", "f"],
147 |             "col3": [1, 1, 3],
148 |             "col4": ["e", "b", "z"],
149 |         }
150 |         df = pd.DataFrame(data=d)
151 |         df = categorical.transform(df)
152 |         for col in ["col1", "col2", "col3", "col4"]:  # no column may be dropped
153 |             self.assertTrue(col in df.columns)
154 |         self.assertEqual(df["col2"][0], 0)  # "a" was seen during fit
155 |         self.assertEqual(df["col2"][1], 2)  # new values get higher indexes
156 |         self.assertEqual(df["col2"][2], 3)  # new values get higher indexes
157 |         self.assertEqual(df["col4"][0], 3)  # new values get higher indexes
158 |         self.assertEqual(df["col4"][1], 1)  # "b" was seen during fit
159 |         self.assertEqual(df["col4"][2], 4)  # new values get higher indexes
160 | 
161 |     def test_to_and_from_json_convert_integers(self):  # a converter restored from JSON must encode like the fitted one
162 |         # training data
163 |         d = {
164 |             "col1": [1, 2, 3],
165 |             "col2": ["a", "a", "c"],
166 |             "col3": [1, 1, 3],
167 |             "col4": ["a", "b", "c"],
168 |         }
169 |         df = pd.DataFrame(data=d)
170 |         cat1 = PreprocessingCategorical(
171 |             df.columns, PreprocessingCategorical.CONVERT_INTEGER
172 |         )
173 |         cat1.fit(df)
174 | 
175 |         cat2 = PreprocessingCategorical(
176 |             df.columns, PreprocessingCategorical.CONVERT_INTEGER
177 |         )
178 |         cat2.from_json(cat1.to_json())  # cat2 is never fitted; its state comes only from cat1's JSON
179 |         df = cat2.transform(df)
180 |         for col in ["col1", "col2", "col3", "col4"]:  # no column may be dropped
181 |             self.assertTrue(col in df.columns)
182 |         self.assertEqual(df["col2"][0], 0)
183 |         self.assertEqual(df["col2"][1], 0)
184 |         self.assertEqual(df["col2"][2], 1)
185 |         self.assertEqual(df["col4"][0], 0)
186 |         self.assertEqual(df["col4"][1], 1)
187 |         self.assertEqual(df["col4"][2], 2)
188 | 
189 | 
190 | if __name__ == "__main__":  # allow running this test file directly
191 |     unittest.main()
192 | 
```

--------------------------------------------------------------------------------
/tests/tests_validation/test_validator_kfold.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | import pytest
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | from supervised.utils.utils import dump_data
 10 | from supervised.validation.validator_kfold import KFoldValidator
 11 | 
 12 | 
 13 | class KFoldValidatorTest(unittest.TestCase):  # tests for KFoldValidator built from data files on disk
 14 |     def test_create(self):  # 4 rows, 2 stratified folds without shuffling
 15 |         with tempfile.TemporaryDirectory() as results_path:  # directory is removed when the block exits
 16 |             data = {
 17 |                 "X": pd.DataFrame(
 18 |                     np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
 19 |                 ),
 20 |                 "y": pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"]),
 21 |             }
 22 | 
 23 |             X_path = os.path.join(results_path, "X.data")
 24 |             y_path = os.path.join(results_path, "y.data")
 25 | 
 26 |             dump_data(X_path, data["X"])  # the validator is given paths, not data frames
 27 |             dump_data(y_path, data["y"])
 28 | 
 29 |             params = {
 30 |                 "shuffle": False,
 31 |                 "stratify": True,
 32 |                 "k_folds": 2,
 33 |                 "results_path": results_path,
 34 |                 "X_path": X_path,
 35 |                 "y_path": y_path,
 36 |             }
 37 |             vl = KFoldValidator(params)
 38 | 
 39 |             self.assertEqual(params["k_folds"], vl.get_n_splits())
 40 |             # for train, validation in vl.split():
 41 |             for k_fold in range(vl.get_n_splits()):
 42 |                 train, validation = vl.get_split(k_fold)
 43 | 
 44 |                 X_train, y_train = train.get("X"), train.get("y")
 45 |                 X_validation, y_validation = validation.get("X"), validation.get("y")
 46 | 
 47 |                 self.assertEqual(X_train.shape[0], 2)  # 4 rows over 2 folds: 2 train + 2 validation
 48 |                 self.assertEqual(y_train.shape[0], 2)
 49 |                 self.assertEqual(X_validation.shape[0], 2)
 50 |                 self.assertEqual(y_validation.shape[0], 2)
 51 | 
 52 |     def test_missing_target_values(self):  # 6 rows, the last two targets are written as np.nan
 53 |         with tempfile.TemporaryDirectory() as results_path:
 54 |             data = {
 55 |                 "X": pd.DataFrame(
 56 |                     np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
 57 |                     columns=["a", "b"],
 58 |                 ),
 59 |                 "y": pd.DataFrame(
 60 |                     np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"]  # NOTE(review): np.array with str items turns np.nan into the string "nan" — confirm this is intended
 61 |                 ),
 62 |             }
 63 | 
 64 |             X_path = os.path.join(results_path, "X.data")
 65 |             y_path = os.path.join(results_path, "y.data")
 66 | 
 67 |             dump_data(X_path, data["X"])
 68 |             dump_data(y_path, data["y"])
 69 | 
 70 |             params = {
 71 |                 "shuffle": False,
 72 |                 "stratify": True,
 73 |                 "k_folds": 2,
 74 |                 "results_path": results_path,
 75 |                 "X_path": X_path,
 76 |                 "y_path": y_path,
 77 |             }
 78 |             vl = KFoldValidator(params)
 79 | 
 80 |             self.assertEqual(params["k_folds"], vl.get_n_splits())
 81 | 
 82 |             for k_fold in range(vl.get_n_splits()):
 83 |                 train, validation = vl.get_split(k_fold)
 84 |                 X_train, y_train = train.get("X"), train.get("y")
 85 |                 X_validation, y_validation = validation.get("X"), validation.get("y")
 86 | 
 87 |                 self.assertEqual(X_train.shape[0], 3)  # all 6 rows are kept: 3 train + 3 validation
 88 |                 self.assertEqual(y_train.shape[0], 3)
 89 |                 self.assertEqual(X_validation.shape[0], 3)
 90 |                 self.assertEqual(y_validation.shape[0], 3)
 91 | 
 92 |     def test_create_with_target_as_labels(self):  # same as test_create but with string labels and shuffling
 93 |         with tempfile.TemporaryDirectory() as results_path:
 94 |             data = {
 95 |                 "X": pd.DataFrame(
 96 |                     np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
 97 |                 ),
 98 |                 "y": pd.DataFrame(np.array(["a", "b", "a", "b"]), columns=["target"]),
 99 |             }
100 | 
101 |             X_path = os.path.join(results_path, "X.data")
102 |             y_path = os.path.join(results_path, "y.data")
103 | 
104 |             dump_data(X_path, data["X"])
105 |             dump_data(y_path, data["y"])
106 | 
107 |             params = {
108 |                 "shuffle": True,
109 |                 "stratify": True,
110 |                 "k_folds": 2,
111 |                 "results_path": results_path,
112 |                 "X_path": X_path,
113 |                 "y_path": y_path,
114 |             }
115 |             vl = KFoldValidator(params)
116 | 
117 |             self.assertEqual(params["k_folds"], vl.get_n_splits())
118 | 
119 |             for k_fold in range(vl.get_n_splits()):
120 |                 train, validation = vl.get_split(k_fold)
121 |                 X_train, y_train = train.get("X"), train.get("y")
122 |                 X_validation, y_validation = validation.get("X"), validation.get("y")
123 | 
124 |                 self.assertEqual(X_train.shape[0], 2)  # 4 rows over 2 folds: 2 train + 2 validation
125 |                 self.assertEqual(y_train.shape[0], 2)
126 |                 self.assertEqual(X_validation.shape[0], 2)
127 |                 self.assertEqual(y_validation.shape[0], 2)
128 | 
129 |     def test_repeats(self):  # 10 repeats of a shuffled 2-fold split with a fixed seed
130 |         with tempfile.TemporaryDirectory() as results_path:
131 |             data = {
132 |                 "X": pd.DataFrame(
133 |                     np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
134 |                 ),
135 |                 "y": pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"]),
136 |             }
137 | 
138 |             X_path = os.path.join(results_path, "X.data")
139 |             y_path = os.path.join(results_path, "y.data")
140 | 
141 |             dump_data(X_path, data["X"])
142 |             dump_data(y_path, data["y"])
143 | 
144 |             params = {
145 |                 "shuffle": True,
146 |                 "stratify": False,
147 |                 "k_folds": 2,
148 |                 "repeats": 10,
149 |                 "results_path": results_path,
150 |                 "X_path": X_path,
151 |                 "y_path": y_path,
152 |                 "random_seed": 1,
153 |             }
154 |             vl = KFoldValidator(params)
155 | 
156 |             self.assertEqual(params["k_folds"], vl.get_n_splits())
157 |             self.assertEqual(params["repeats"], vl.get_repeats())
158 | 
159 |             for repeat in range(vl.get_repeats()):
160 |                 for k_fold in range(vl.get_n_splits()):
161 |                     train, validation = vl.get_split(k_fold, repeat)  # the repeat index is the second argument
162 | 
163 |                     X_train, y_train = train.get("X"), train.get("y")
164 |                     X_validation, y_validation = validation.get("X"), validation.get(
165 |                         "y"
166 |                     )
167 | 
168 |                     self.assertEqual(X_train.shape[0], 2)  # sizes must hold for every (repeat, fold) pair
169 |                     self.assertEqual(y_train.shape[0], 2)
170 |                     self.assertEqual(X_validation.shape[0], 2)
171 |                     self.assertEqual(y_validation.shape[0], 2)
172 | 
173 |     def test_disable_repeats_when_disabled_shuffle(self):  # repeats requested without shuffle must be reduced to 1 with a warning
174 |         with tempfile.TemporaryDirectory() as results_path:
175 |             data = {
176 |                 "X": pd.DataFrame(
177 |                     np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
178 |                 ),
179 |                 "y": pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"]),
180 |             }
181 | 
182 |             X_path = os.path.join(results_path, "X.data")
183 |             y_path = os.path.join(results_path, "y.data")
184 | 
185 |             dump_data(X_path, data["X"])
186 |             dump_data(y_path, data["y"])
187 | 
188 |             params = {
189 |                 "shuffle": False,
190 |                 "stratify": False,
191 |                 "k_folds": 2,
192 |                 "repeats": 10,
193 |                 "results_path": results_path,
194 |                 "X_path": X_path,
195 |                 "y_path": y_path,
196 |                 "random_seed": 1,
197 |             }
198 | 
199 |             with pytest.warns(  # the constructor itself must emit the UserWarning
200 |                 expected_warning=UserWarning,
201 |                 match="Disable repeats in validation because shuffle is disabled",
202 |             ) as record:
203 |                 vl = KFoldValidator(params)
204 | 
205 |             # check that only one warning was raised
206 |             self.assertEqual(len(record), 1)
207 | 
208 |             self.assertEqual(params["k_folds"], vl.get_n_splits())
209 |             self.assertEqual(1, vl.get_repeats())  # the requested 10 repeats are ignored
210 | 
```

--------------------------------------------------------------------------------
/tests/tests_validation/test_validator_split.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import tempfile
  3 | import unittest
  4 | import pytest
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | 
  9 | from supervised.utils.utils import dump_data
 10 | from supervised.validation.validator_split import SplitValidator
 11 | 
 12 | 
 13 | class SplitValidatorTest(unittest.TestCase):
 14 |     def test_create(self):
 15 |         with tempfile.TemporaryDirectory() as results_path:
 16 |             data = {
 17 |                 "X": pd.DataFrame(
 18 |                     np.array(
 19 |                         [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
 20 |                     ),
 21 |                     columns=["a", "b"],
 22 |                 ),
 23 |                 "y": pd.DataFrame(
 24 |                     np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
 25 |                 ),
 26 |             }
 27 | 
 28 |             X_path = os.path.join(results_path, "X.data")
 29 |             y_path = os.path.join(results_path, "y.data")
 30 | 
 31 |             dump_data(X_path, data["X"])
 32 |             dump_data(y_path, data["y"])
 33 | 
 34 |             params = {
 35 |                 "shuffle": False,
 36 |                 "stratify": False,
 37 |                 "train_ratio": 0.5,
 38 |                 "results_path": results_path,
 39 |                 "X_path": X_path,
 40 |                 "y_path": y_path,
 41 |             }
 42 |             vl = SplitValidator(params)
 43 | 
 44 |             self.assertEqual(1, vl.get_n_splits())
 45 |             # for train, validation in vl.split():
 46 |             for k_fold in range(vl.get_n_splits()):
 47 |                 train, validation = vl.get_split(k_fold)
 48 | 
 49 |                 X_train, y_train = train.get("X"), train.get("y")
 50 |                 X_validation, y_validation = validation.get("X"), validation.get("y")
 51 | 
 52 |                 self.assertEqual(X_train.shape[0], 4)
 53 |                 self.assertEqual(y_train.shape[0], 4)
 54 |                 self.assertEqual(X_validation.shape[0], 4)
 55 |                 self.assertEqual(y_validation.shape[0], 4)
 56 | 
 57 |     def test_missing_target_values(self):
 58 |         with tempfile.TemporaryDirectory() as results_path:
 59 |             data = {
 60 |                 "X": pd.DataFrame(
 61 |                     np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
 62 |                     columns=["a", "b"],
 63 |                 ),
 64 |                 "y": pd.DataFrame(
 65 |                     np.array(["a", "b", np.nan, "a", "b", np.nan]), columns=["target"]
 66 |                 ),
 67 |             }
 68 | 
 69 |             X_path = os.path.join(results_path, "X.data")
 70 |             y_path = os.path.join(results_path, "y.data")
 71 | 
 72 |             dump_data(X_path, data["X"])
 73 |             dump_data(y_path, data["y"])
 74 | 
 75 |             params = {
 76 |                 "shuffle": False,
 77 |                 "stratify": False,
 78 |                 "train_ratio": 0.5,
 79 |                 "results_path": results_path,
 80 |                 "X_path": X_path,
 81 |                 "y_path": y_path,
 82 |             }
 83 |             vl = SplitValidator(params)
 84 | 
 85 |             self.assertEqual(1, vl.get_n_splits())
 86 | 
 87 |             for k_fold in range(vl.get_n_splits()):
 88 |                 train, validation = vl.get_split(k_fold)
 89 |                 X_train, y_train = train.get("X"), train.get("y")
 90 |                 X_validation, y_validation = validation.get("X"), validation.get("y")
 91 | 
 92 |                 self.assertEqual(X_train.shape[0], 3)
 93 |                 self.assertEqual(y_train.shape[0], 3)
 94 |                 self.assertEqual(X_validation.shape[0], 3)
 95 |                 self.assertEqual(y_validation.shape[0], 3)
 96 | 
 97 |     def test_create_with_target_as_labels(self):
 98 |         with tempfile.TemporaryDirectory() as results_path:
 99 |             data = {
100 |                 "X": pd.DataFrame(
101 |                     np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
102 |                 ),
103 |                 "y": pd.DataFrame(np.array(["a", "b", "a", "b"]), columns=["target"]),
104 |             }
105 | 
106 |             X_path = os.path.join(results_path, "X.data")
107 |             y_path = os.path.join(results_path, "y.data")
108 | 
109 |             dump_data(X_path, data["X"])
110 |             dump_data(y_path, data["y"])
111 | 
112 |             params = {
113 |                 "shuffle": True,
114 |                 "stratify": True,
115 |                 "train_ratio": 0.5,
116 |                 "results_path": results_path,
117 |                 "X_path": X_path,
118 |                 "y_path": y_path,
119 |             }
120 |             vl = SplitValidator(params)
121 | 
122 |             self.assertEqual(1, vl.get_n_splits())
123 | 
124 |             for k_fold in range(vl.get_n_splits()):
125 |                 train, validation = vl.get_split(k_fold)
126 |                 X_train, y_train = train.get("X"), train.get("y")
127 |                 X_validation, y_validation = validation.get("X"), validation.get("y")
128 | 
129 |                 self.assertEqual(X_train.shape[0], 2)
130 |                 self.assertEqual(y_train.shape[0], 2)
131 |                 self.assertEqual(X_validation.shape[0], 2)
132 |                 self.assertEqual(y_validation.shape[0], 2)
133 | 
134 |     def test_repeats(self):
135 |         with tempfile.TemporaryDirectory() as results_path:
136 |             data = {
137 |                 "X": pd.DataFrame(
138 |                     np.array(
139 |                         [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
140 |                     ),
141 |                     columns=["a", "b"],
142 |                 ),
143 |                 "y": pd.DataFrame(
144 |                     np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
145 |                 ),
146 |             }
147 | 
148 |             X_path = os.path.join(results_path, "X.data")
149 |             y_path = os.path.join(results_path, "y.data")
150 | 
151 |             dump_data(X_path, data["X"])
152 |             dump_data(y_path, data["y"])
153 | 
154 |             params = {
155 |                 "shuffle": True,
156 |                 "stratify": False,
157 |                 "train_ratio": 0.5,
158 |                 "results_path": results_path,
159 |                 "X_path": X_path,
160 |                 "y_path": y_path,
161 |                 "repeats": 3,
162 |             }
163 |             vl = SplitValidator(params)
164 | 
165 |             self.assertEqual(1, vl.get_n_splits())
166 |             self.assertEqual(3, vl.get_repeats())
167 | 
168 |             cnt = 0
169 |             for repeat in range(vl.get_repeats()):
170 |                 for k_fold in range(vl.get_n_splits()):
171 |                     train, validation = vl.get_split(k_fold, repeat)
172 | 
173 |                     X_train, y_train = train.get("X"), train.get("y")
174 |                     X_validation, y_validation = validation.get("X"), validation.get(
175 |                         "y"
176 |                     )
177 | 
178 |                     self.assertEqual(X_train.shape[0], 4)
179 |                     self.assertEqual(y_train.shape[0], 4)
180 |                     self.assertEqual(X_validation.shape[0], 4)
181 |                     self.assertEqual(y_validation.shape[0], 4)
182 |                     cnt += 1
183 | 
184 |             self.assertEqual(cnt, 3)
185 | 
186 |     def test_disable_repeats_when_disabled_shuffle(self):
187 |         with tempfile.TemporaryDirectory() as results_path:
188 |             data = {
189 |                 "X": pd.DataFrame(
190 |                     np.array(
191 |                         [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
192 |                     ),
193 |                     columns=["a", "b"],
194 |                 ),
195 |                 "y": pd.DataFrame(
196 |                     np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
197 |                 ),
198 |             }
199 | 
200 |             X_path = os.path.join(results_path, "X.data")
201 |             y_path = os.path.join(results_path, "y.data")
202 | 
203 |             dump_data(X_path, data["X"])
204 |             dump_data(y_path, data["y"])
205 | 
206 |             params = {
207 |                 "shuffle": False,
208 |                 "stratify": False,
209 |                 "train_ratio": 0.5,
210 |                 "results_path": results_path,
211 |                 "X_path": X_path,
212 |                 "y_path": y_path,
213 |                 "repeats": 3,
214 |             }
215 | 
216 |             with pytest.warns(
217 |                 expected_warning=UserWarning,
218 |                 match="Disable repeats in validation because shuffle is disabled",
219 |             ) as record:
220 |                 vl = SplitValidator(params)
221 | 
222 |             # check that only one warning was raised
223 |             self.assertEqual(len(record), 1)
224 | 
225 |             self.assertEqual(1, vl.get_n_splits())
226 |             self.assertEqual(1, vl.get_repeats())
227 | 
```

--------------------------------------------------------------------------------
/supervised/utils/additional_plots.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | 
  3 | import numpy as np
  4 | import scikitplot as skplt
  5 | from matplotlib import pyplot as plt
  6 | 
  7 | 
  8 | class AdditionalPlots:
  9 |     @staticmethod
 10 |     def plots_binary(target, predicted_labels, predicted_probas):
 11 |         figures = []
 12 |         try:
 13 |             #
 14 |             fig = plt.figure(figsize=(10, 7))
 15 |             ax1 = fig.add_subplot(1, 1, 1)
 16 |             _ = skplt.metrics.plot_confusion_matrix(
 17 |                 target, predicted_labels, normalize=False, ax=ax1
 18 |             )
 19 |             figures += [
 20 |                 {
 21 |                     "title": "Confusion Matrix",
 22 |                     "fname": "confusion_matrix.png",
 23 |                     "figure": fig,
 24 |                 }
 25 |             ]
 26 |             #
 27 |             fig = plt.figure(figsize=(10, 7))
 28 |             ax1 = fig.add_subplot(1, 1, 1)
 29 |             _ = skplt.metrics.plot_confusion_matrix(
 30 |                 target, predicted_labels, normalize=True, ax=ax1
 31 |             )
 32 |             figures += [
 33 |                 {
 34 |                     "title": "Normalized Confusion Matrix",
 35 |                     "fname": "confusion_matrix_normalized.png",
 36 |                     "figure": fig,
 37 |                 }
 38 |             ]
 39 |             #
 40 |             fig = plt.figure(figsize=(10, 7))
 41 |             ax1 = fig.add_subplot(1, 1, 1)
 42 |             _ = skplt.metrics.plot_roc(target, predicted_probas, ax=ax1)
 43 |             figures += [{"title": "ROC Curve", "fname": "roc_curve.png", "figure": fig}]
 44 |             #
 45 |             fig = plt.figure(figsize=(10, 7))
 46 |             ax1 = fig.add_subplot(1, 1, 1)
 47 |             _ = skplt.metrics.plot_ks_statistic(target, predicted_probas, ax=ax1)
 48 |             figures += [
 49 |                 {
 50 |                     "title": "Kolmogorov-Smirnov Statistic",
 51 |                     "fname": "ks_statistic.png",
 52 |                     "figure": fig,
 53 |                 }
 54 |             ]
 55 |             #
 56 |             fig = plt.figure(figsize=(10, 7))
 57 |             ax1 = fig.add_subplot(1, 1, 1)
 58 |             _ = skplt.metrics.plot_precision_recall(target, predicted_probas, ax=ax1)
 59 |             figures += [
 60 |                 {
 61 |                     "title": "Precision-Recall Curve",
 62 |                     "fname": "precision_recall_curve.png",
 63 |                     "figure": fig,
 64 |                 }
 65 |             ]
 66 |             #
 67 |             fig = plt.figure(figsize=(10, 7))
 68 |             ax1 = fig.add_subplot(1, 1, 1)
 69 |             # transform target if needed to be {0, 1}
 70 |             target_uniq_values = np.unique(target)
 71 |             target_transformed = target.values.ravel()
 72 |             if not (0 in target_uniq_values and 1 in target_uniq_values):
 73 |                 mapping = {target_uniq_values[0]: 0, target_uniq_values[1]: 1}
 74 |                 target_transformed = target.map(mapping)
 75 |             # create a plot
 76 |             _ = skplt.metrics.plot_calibration_curve(
 77 |                 target_transformed, [predicted_probas], ["Classifier"], ax=ax1
 78 |             )
 79 |             figures += [
 80 |                 {
 81 |                     "title": "Calibration Curve",
 82 |                     "fname": "calibration_curve_curve.png",
 83 |                     "figure": fig,
 84 |                 }
 85 |             ]
 86 |             #
 87 |             fig = plt.figure(figsize=(10, 7))
 88 |             ax1 = fig.add_subplot(1, 1, 1)
 89 |             _ = skplt.metrics.plot_cumulative_gain(target, predicted_probas, ax=ax1)
 90 |             figures += [
 91 |                 {
 92 |                     "title": "Cumulative Gains Curve",
 93 |                     "fname": "cumulative_gains_curve.png",
 94 |                     "figure": fig,
 95 |                 }
 96 |             ]
 97 |             #
 98 |             fig = plt.figure(figsize=(10, 7))
 99 |             ax1 = fig.add_subplot(1, 1, 1)
100 |             _ = skplt.metrics.plot_lift_curve(target, predicted_probas, ax=ax1)
101 |             figures += [
102 |                 {"title": "Lift Curve", "fname": "lift_curve.png", "figure": fig}
103 |             ]
104 | 
105 |         except Exception as e:
106 |             print(str(e))
107 | 
108 |         return figures
109 | 
110 |     @staticmethod
111 |     def plots_multiclass(target, predicted_labels, predicted_probas):
112 |         figures = []
113 |         try:
114 |             #
115 |             fig = plt.figure(figsize=(10, 7))
116 |             ax1 = fig.add_subplot(1, 1, 1)
117 |             _ = skplt.metrics.plot_confusion_matrix(
118 |                 target, predicted_labels, normalize=False, ax=ax1
119 |             )
120 |             figures += [
121 |                 {
122 |                     "title": "Confusion Matrix",
123 |                     "fname": "confusion_matrix.png",
124 |                     "figure": fig,
125 |                 }
126 |             ]
127 |             #
128 |             fig = plt.figure(figsize=(10, 7))
129 |             ax1 = fig.add_subplot(1, 1, 1)
130 |             _ = skplt.metrics.plot_confusion_matrix(
131 |                 target, predicted_labels, normalize=True, ax=ax1
132 |             )
133 |             figures += [
134 |                 {
135 |                     "title": "Normalized Confusion Matrix",
136 |                     "fname": "confusion_matrix_normalized.png",
137 |                     "figure": fig,
138 |                 }
139 |             ]
140 |             #
141 |             fig = plt.figure(figsize=(10, 7))
142 |             ax1 = fig.add_subplot(1, 1, 1)
143 |             _ = skplt.metrics.plot_roc(target, predicted_probas, ax=ax1)
144 |             figures += [{"title": "ROC Curve", "fname": "roc_curve.png", "figure": fig}]
145 |             #
146 |             fig = plt.figure(figsize=(10, 7))
147 |             ax1 = fig.add_subplot(1, 1, 1)
148 |             _ = skplt.metrics.plot_precision_recall(target, predicted_probas, ax=ax1)
149 |             figures += [
150 |                 {
151 |                     "title": "Precision Recall Curve",
152 |                     "fname": "precision_recall_curve.png",
153 |                     "figure": fig,
154 |                 }
155 |             ]
156 |             plt.close("all")
157 |         except Exception as e:
158 |             print(str(e))
159 | 
160 |         return figures
161 | 
162 |     @staticmethod
163 |     def plots_regression(target, predictions):
164 |         figures = []
165 |         try:
166 |             MAX_SAMPLES = 5000
167 |             fig = plt.figure(figsize=(10, 7))
168 |             ax1 = fig.add_subplot(1, 1, 1)
169 |             samples = target.shape[0]
170 |             if samples > MAX_SAMPLES:
171 |                 samples = MAX_SAMPLES
172 |             ax1.scatter(
173 |                 target[:samples], predictions[:samples], c="tab:blue", alpha=0.2
174 |             )
175 |             plt.xlabel("True values")
176 |             plt.ylabel("Predicted values")
177 |             plt.title(f"Target values vs Predicted values (samples={samples})")
178 |             plt.tight_layout(pad=5.0)
179 |             figures += [
180 |                 {
181 |                     "title": "True vs Predicted",
182 |                     "fname": "true_vs_predicted.png",
183 |                     "figure": fig,
184 |                 }
185 |             ]
186 | 
187 |             # residual plot
188 |             fig = plt.figure(figsize=(10, 7))
189 |             ax1 = fig.add_subplot(1, 1, 1)
190 |             residuals = target[:samples].values - predictions[:samples].values
191 |             ax1.scatter(predictions[:samples], residuals, c="tab:blue", alpha=0.2)
192 |             plt.xlabel("Predicted values")
193 |             plt.ylabel("Residuals")
194 |             plt.title(f"Predicted values vs Residuals (samples={samples})")
195 |             plt.tight_layout(pad=5.0)
196 |             bb = ax1.get_position()
197 | 
198 |             ax2 = fig.add_axes((bb.x0 + bb.size[0], bb.y0, 0.05, bb.size[1]))
199 |             ax2.set_xticklabels([])
200 |             ax2.set_yticklabels([])
201 |             ax2.hist(residuals, 50, orientation="horizontal", alpha=0.5)
202 |             ax2.axis("off")
203 | 
204 |             figures += [
205 |                 {
206 |                     "title": "Predicted vs Residuals",
207 |                     "fname": "predicted_vs_residuals.png",
208 |                     "figure": fig,
209 |                 }
210 |             ]
211 |             plt.close("all")
212 | 
213 |         except Exception as e:
214 |             print(str(e))
215 |         return figures
216 | 
217 |     @staticmethod
218 |     def append(fout, model_path, plots):
219 |         try:
220 |             for plot in plots:
221 |                 fname = plot.get("fname")
222 |                 fig = plot.get("figure")
223 |                 title = plot.get("title", "")
224 |                 fig.savefig(os.path.join(model_path, fname))
225 |                 fout.write(f"\n## {title}\n\n")
226 |                 fout.write(f"![{title}]({fname})\n\n")
227 |         except Exception as e:
228 |             print(str(e))
229 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_label_binarizer.py:
--------------------------------------------------------------------------------

```python
  1 | import json
  2 | import unittest
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | from supervised.preprocessing.label_binarizer import LabelBinarizer
  8 | 
  9 | 
 10 | class LabelBinarizerTest(unittest.TestCase):
 11 |     def test_fit(self):
 12 |         # training data
 13 |         d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
 14 |         df = pd.DataFrame(data=d)
 15 |         lb = LabelBinarizer()
 16 |         # check first column
 17 |         lb.fit(df, "col1")
 18 |         data_json = lb.to_json()
 19 |         self.assertTrue("new_columns" in data_json)
 20 |         # we take alphabetical order
 21 |         self.assertTrue("col1_c" in data_json["new_columns"])
 22 |         self.assertTrue("col1_a" not in data_json["new_columns"])
 23 |         self.assertTrue("unique_values" in data_json)
 24 |         self.assertTrue("a" in data_json["unique_values"])
 25 |         self.assertTrue("c" in data_json["unique_values"])
 26 | 
 27 |         lb = LabelBinarizer()
 28 |         # check second column
 29 |         lb.fit(df, "col2")
 30 |         data_json = lb.to_json()
 31 |         self.assertTrue("new_columns" in data_json)
 32 |         self.assertTrue("col2_w" in data_json["new_columns"])
 33 |         self.assertTrue("col2_e" in data_json["new_columns"])
 34 |         self.assertTrue("col2_d" in data_json["new_columns"])
 35 |         self.assertTrue("unique_values" in data_json)
 36 |         self.assertTrue("w" in data_json["unique_values"])
 37 |         self.assertTrue("e" in data_json["unique_values"])
 38 |         self.assertTrue("d" in data_json["unique_values"])
 39 | 
 40 |     def test_transform(self):
 41 |         # training data
 42 |         d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
 43 |         df = pd.DataFrame(data=d)
 44 |         # fit binarizer
 45 |         lb1 = LabelBinarizer()
 46 |         lb1.fit(df, "col1")
 47 |         lb2 = LabelBinarizer()
 48 |         lb2.fit(df, "col2")
 49 |         # test data
 50 |         d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]}
 51 |         df_test = pd.DataFrame(data=d_test)
 52 |         # transform
 53 |         df_test = lb1.transform(df_test, "col1")
 54 |         df_test = lb2.transform(df_test, "col2")
 55 |         # for binary column, only one value is left, old column should be deleted
 56 |         self.assertTrue("col1_c" in df_test.columns)
 57 |         self.assertTrue("col1" not in df_test.columns)
 58 |         self.assertEqual(2, np.sum(df_test["col1_c"]))
 59 |         # for multiple value colum, all columns should be added
 60 |         self.assertTrue("col2_w" in df_test.columns)
 61 |         self.assertTrue("col2_e" in df_test.columns)
 62 |         self.assertTrue("col2_d" in df_test.columns)
 63 |         self.assertTrue("col2" not in df_test.columns)
 64 |         self.assertEqual(1, np.sum(df_test["col2_w"]))
 65 |         self.assertEqual(1, np.sum(df_test["col2_e"]))
 66 |         self.assertEqual(1, np.sum(df_test["col2_d"]))
 67 |         # do not touch continuous attribute
 68 |         self.assertTrue("col3" in df_test.columns)
 69 | 
 70 |     def test_transform_with_new_values(self):
 71 |         # training data
 72 |         d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
 73 |         df = pd.DataFrame(data=d)
 74 |         # fit binarizer
 75 |         lb1 = LabelBinarizer()
 76 |         lb1.fit(df, "col1")
 77 |         lb2 = LabelBinarizer()
 78 |         lb2.fit(df, "col2")
 79 |         # test data
 80 |         d_test = {"col1": ["c", "d", "d"], "col2": ["g", "e", "f"], "col3": [2, 3, 4]}
 81 |         df_test = pd.DataFrame(data=d_test)
 82 |         # transform
 83 |         df_test = lb1.transform(df_test, "col1")
 84 |         df_test = lb2.transform(df_test, "col2")
 85 |         self.assertTrue("col1_c" in df_test.columns)
 86 |         self.assertTrue("col1_d" not in df_test.columns)
 87 |         self.assertTrue("col2_w" in df_test.columns)
 88 |         self.assertTrue("col2_e" in df_test.columns)
 89 |         self.assertTrue("col2_d" in df_test.columns)
 90 |         self.assertTrue("col2_g" not in df_test.columns)
 91 |         self.assertTrue("col2_f" not in df_test.columns)
 92 |         self.assertEqual(df_test["col1_c"][0], 1)
 93 |         self.assertEqual(df_test["col1_c"][1], 0)
 94 |         self.assertEqual(df_test["col1_c"][2], 0)
 95 |         self.assertEqual(np.sum(df_test["col2_w"]), 0)
 96 |         self.assertEqual(np.sum(df_test["col2_d"]), 0)
 97 |         self.assertEqual(df_test["col2_e"][0], 0)
 98 |         self.assertEqual(df_test["col2_e"][1], 1)
 99 |         self.assertEqual(df_test["col2_e"][2], 0)
100 | 
101 |     def test_to_and_from_json(self):
102 |         # training data
103 |         d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
104 |         df = pd.DataFrame(data=d)
105 |         # fit binarizer
106 |         lb1 = LabelBinarizer()
107 |         lb1.fit(df, "col1")
108 |         lb2 = LabelBinarizer()
109 |         lb2.fit(df, "col2")
110 |         # test data
111 |         d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]}
112 |         df_test = pd.DataFrame(data=d_test)
113 |         # to json and from json
114 |         new_lb1 = LabelBinarizer()
115 |         new_lb2 = LabelBinarizer()
116 |         new_lb1.from_json(lb1.to_json())
117 |         new_lb2.from_json(lb2.to_json())
118 |         # transform
119 |         df_test = new_lb1.transform(df_test, "col1")
120 |         df_test = new_lb2.transform(df_test, "col2")
121 |         # for binary column, only one value is left, old column should be deleted
122 |         self.assertTrue("col1_c" in df_test.columns)
123 |         self.assertTrue("col1" not in df_test.columns)
124 |         self.assertEqual(2, np.sum(df_test["col1_c"]))
125 |         # for multiple value colum, all columns should be added
126 |         self.assertTrue("col2_w" in df_test.columns)
127 |         self.assertTrue("col2_e" in df_test.columns)
128 |         self.assertTrue("col2_d" in df_test.columns)
129 |         self.assertTrue("col2" not in df_test.columns)
130 |         self.assertEqual(1, np.sum(df_test["col2_w"]))
131 |         self.assertEqual(1, np.sum(df_test["col2_e"]))
132 |         self.assertEqual(1, np.sum(df_test["col2_d"]))
133 |         # do not touch continuous attribute
134 |         self.assertTrue("col3" in df_test.columns)
135 | 
136 |     def test_to_and_from_json_booleans(self):
137 |         # training data
138 |         d = {"col1": ["a", "a", "c"], "col2": [True, True, False]}
139 |         df = pd.DataFrame(data=d)
140 |         # fit binarizer
141 |         lb1 = LabelBinarizer()
142 |         lb1.fit(df, "col1")
143 |         lb2 = LabelBinarizer()
144 |         lb2.fit(df, "col2")
145 |         # test data
146 |         d_test = {
147 |             "col1": ["c", "c", "a"],
148 |             "col2": [False, False, True],
149 |             "col3": [2, 3, 4],
150 |         }
151 |         df_test = pd.DataFrame(data=d_test)
152 |         # to json and from json
153 |         new_lb1 = LabelBinarizer()
154 |         new_lb2 = LabelBinarizer()
155 |         new_lb1.from_json(lb1.to_json())
156 |         new_lb2.from_json(json.loads(json.dumps(lb2.to_json(), indent=4)))
157 | 
158 |         # transform
159 |         df_test = new_lb1.transform(df_test, "col1")
160 |         df_test = new_lb2.transform(df_test, "col2")
161 |         # for binary column, only one value is left, old column should be deleted
162 |         self.assertTrue("col1_c" in df_test.columns)
163 |         self.assertTrue("col1" not in df_test.columns)
164 |         self.assertEqual(2, np.sum(df_test["col1_c"]))
165 |         # for multiple value colum, all columns should be added
166 |         self.assertTrue("col2_True" in df_test.columns)
167 |         self.assertTrue("col2" not in df_test.columns)
168 |         self.assertEqual(1, np.sum(df_test["col2_True"]))
169 |         # do not touch continuous attribute
170 |         self.assertTrue("col3" in df_test.columns)
171 | 
172 |     def test_inverse_transform_2_unique_strings(self):
173 |         d = {"col1": ["a", "a", "c"]}
174 |         df = pd.DataFrame(data=d)
175 |         lb = LabelBinarizer()
176 |         lb.fit(df, "col1")
177 |         bb = lb.transform(df, "col1")
178 |         self.assertTrue("col1_c" in bb.columns)
179 |         self.assertTrue(np.sum(bb["col1_c"]) == 1)
180 |         bb = lb.inverse_transform(bb)
181 |         self.assertTrue("col1_c" not in bb.columns)
182 | 
183 |     def test_inverse_transform_strings(self):
184 |         d = {"col2": ["w", "e", "d"]}
185 |         df = pd.DataFrame(data=d)
186 |         lb = LabelBinarizer()
187 |         lb.fit(df, "col2")
188 |         bb = lb.transform(df, "col2")
189 |         self.assertTrue("col2_w" in bb.columns)
190 |         self.assertTrue("col2_e" in bb.columns)
191 |         self.assertTrue("col2_d" in bb.columns)
192 |         self.assertTrue(np.sum(bb["col2_w"]) == 1)
193 |         bb = lb.inverse_transform(bb)
194 |         self.assertTrue("col2_w" not in bb.columns)
195 | 
196 |     def test_inverse_transform_booleans(self):
197 |         d = {"col1": [True, False, True, True]}
198 |         df = pd.DataFrame(data=d)
199 |         lb = LabelBinarizer()
200 |         lb.fit(df, "col1")
201 | 
202 |         bb = lb.transform(df, "col1")
203 |         self.assertTrue("col1_True" in bb.columns)
204 |         self.assertEqual(bb["col1_True"].dtype, "int64")
205 |         self.assertEqual(bb["col1_True"][0], 1)
206 |         self.assertEqual(bb["col1_True"][1], 0)
207 |         self.assertEqual(bb["col1_True"][2], 1)
208 |         self.assertEqual(bb["col1_True"][3], 1)
209 | 
210 |         bb = lb.inverse_transform(bb)
211 |         self.assertTrue("col1_True" not in bb.columns)
212 |         self.assertEqual(bb["col1"].dtype, "bool")
213 |         self.assertEqual(bb["col1"][0], True)
214 |         self.assertEqual(bb["col1"][1], False)
215 |         self.assertEqual(bb["col1"][2], True)
216 |         self.assertEqual(bb["col1"][3], True)
217 | 
218 | 
219 | if __name__ == "__main__":
220 |     unittest.main()
221 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/time_controller.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | import time
  3 | 
  4 | import numpy as np
  5 | 
  6 | from supervised.utils.config import LOG_LEVEL
  7 | 
  8 | logger = logging.getLogger(__name__)
  9 | logger.setLevel(LOG_LEVEL)
 10 | 
 11 | 
 12 | class TimeController:
 13 |     def __init__(
 14 |         self, start_time, total_time_limit, model_time_limit, steps, algorithms
 15 |     ):
 16 |         self._start_time = start_time
 17 |         self._total_time_limit = total_time_limit
 18 |         self._model_time_limit = model_time_limit
 19 |         self._steps = steps
 20 |         self._algorithms = algorithms
 21 |         self._spend = []
 22 |         self._is_hill_climbing = "hill_climbing_1" in steps
 23 |         self._is_stacking = "stack" in steps
 24 | 
 25 |     def to_json(self):
 26 |         return {
 27 |             "total_time_limit": self._total_time_limit,
 28 |             "model_time_limit": self._model_time_limit,
 29 |             "steps": self._steps,
 30 |             "algorithms": self._algorithms,
 31 |             "spend": self._spend,
 32 |             "is_hill_climbing": self._is_hill_climbing,
 33 |             "is_stacking": self._is_stacking,
 34 |         }
 35 | 
 36 |     @staticmethod
 37 |     def from_json(data):
 38 |         if data is None:
 39 |             return None
 40 |         try:
 41 |             total_time_limit = data.get("total_time_limit")
 42 |             model_time_limit = data.get("model_time_limit")
 43 |             steps = data.get("steps")
 44 |             algorithms = data.get("algorithms")
 45 | 
 46 |             tc = TimeController(
 47 |                 time.time(), total_time_limit, model_time_limit, steps, algorithms
 48 |             )
 49 |             tc._spend = data.get("spend")
 50 |             tc._start_time -= tc.already_spend()  # update time with already spend
 51 |             return tc
 52 |         except Exception as e:
 53 |             logger.error(f"Cant load TimeController from json, {str(e)}")
 54 |             pass
 55 |         return None
 56 | 
 57 |     def already_spend(self):
 58 |         return np.sum([s["train_time"] for s in self._spend])
 59 | 
 60 |     def time_should_use(self, fit_level):
 61 |         if self._total_time_limit is None:
 62 |             return 7 * 24 * 3600  # 7 days
 63 | 
 64 |         ratios = {
 65 |             "default_algorithms": 0.3,
 66 |             "not_so_random": 0.35,
 67 |             "mix_encoding": 0.05,
 68 |             "golden_features": 0.05,
 69 |             "kmeans_features": 0.05,
 70 |             "insert_random_feature": 0.05,
 71 |             "features_selection": 0.05,
 72 |             "hill_climbing_1": 0.2,  # enough to have only first step from hill climbing
 73 |             "boost_on_errors": 0.05,
 74 |             "stack": 0.2,
 75 |         }
 76 | 
 77 |         if (
 78 |             fit_level
 79 |             in [
 80 |                 "default_algorithms",
 81 |                 "not_so_random",
 82 |                 "boost_on_errors",
 83 |                 "mix_encoding",
 84 |                 "golden_features",
 85 |                 "kmeans_features",
 86 |                 "insert_random_feature",
 87 |                 "features_selection",
 88 |                 "stack",
 89 |             ]
 90 |             or "hill_climbing" in fit_level
 91 |         ):
 92 |             ratio = 0
 93 |             for k, v in ratios.items():
 94 |                 if k in self._steps:
 95 |                     ratio += v
 96 | 
 97 |             fl = fit_level
 98 |             if "hill_climbing" in fit_level:
 99 |                 fl = "hill_climbing_1"
100 | 
101 |             ratio = ratios[fl] / ratio
102 | 
103 |             if "hill_climbing" in fit_level:
104 |                 # print("before hill climbing scale", ratio)
105 |                 hill_climbing_cnt = len(
106 |                     [i for i in self._steps if "hill_climbing" in i]
107 |                 )
108 |                 ratio /= float(hill_climbing_cnt)
109 | 
110 |             should_use = self._total_time_limit * ratio
111 | 
112 |             return should_use
113 | 
114 |         return 0
115 | 
116 |     def compound_time_should_use(self, fit_level):
117 |         compound = 0
118 |         for step in self._steps:
119 |             if step in [
120 |                 "adjust_validation",
121 |                 "simple_algorithms",
122 |                 # "default_algorithms",
123 |                 "ensemble",
124 |                 "ensemble_stacked",
125 |             ]:
126 |                 continue
127 |             time_should_use = self.time_should_use(step)
128 |             compound += time_should_use
129 | 
130 |             if fit_level == step:
131 |                 break
132 |         # if fit_level == "stack":
133 |         #    compound -= 120 # leave time for ensemble
134 |         # maybe not needed
135 |         return compound
136 | 
137 |     def enough_time_for_step(self, fit_level):
138 |         if fit_level in ["ensemble", "ensemble_stacked", "fairness"]:
139 |             return True
140 |         total_time_spend = time.time() - self._start_time
141 |         compound = self.compound_time_should_use(fit_level)
142 |         # print("Enough time for step", fit_level, np.round(total_time_spend,2), np.round(compound,2))
143 |         if total_time_spend > compound:
144 |             # dont train more
145 |             return False
146 | 
147 |         return True
148 | 
149 |     def enough_time_for_model(self, model_type):
150 |         if self._total_time_limit is None:
151 |             return True
152 | 
153 |         time_left = self._total_time_limit - self.already_spend()
154 |         spend = [s["train_time"] for s in self._spend if s["model_type"] == model_type]
155 |         model_mean_spend = np.mean(spend)
156 |         return model_mean_spend <= time_left
157 | 
158 |     def enough_time(self, model_type, step):
159 |         """
160 |         Check if there is enough time to train the next model.
161 | 
162 |         Parameters
163 |         ----------
164 |         model_type : str
165 |             String with type of the model.
166 | 
167 |         step: str
168 |             String with name of the step in the process of AutoML training.
169 | 
170 | 
171 |         Returns
172 |         -------
173 |         bool
174 |             `True` if there is time for training next model, `False` otherwise.
175 |         """
176 |         if step in ["ensemble", "ensemble_stacked"]:
177 |             return True
178 |         # if model_time_limit is set, train every model
179 |         # do not apply total_time_limit
180 |         if self._model_time_limit is not None:
181 |             return True
182 |         # no total time limit, just train, dont ask
183 |         if self._total_time_limit is None:
184 |             return True
185 | 
186 |         total_time_spend = time.time() - self._start_time
187 |         time_left = self._total_time_limit - total_time_spend
188 |         # no time left, do not train any more models, sorry ...
189 |         if time_left < 0:
190 |             # print("No time left", time_left)
191 |             return False
192 | 
193 |         # check the fit level type
194 |         # we dont want to spend too much time on one step
195 |         if not self.enough_time_for_step(step):
196 |             # print("Not enough time for step", step)
197 |             return False
198 | 
199 |         # there is still time and model_type was not tested yet
200 |         # we should try it
201 |         if time_left > 0 and self.model_spend(model_type) == 0:
202 |             return True
203 | 
204 |         # stacked models converge faster
205 |         # dont need to check ...
206 |         if step == "stack":
207 |             return True
208 |         # check if there is enough time for model to train
209 |         return self.enough_time_for_model(model_type)
210 | 
211 |     def learner_time_limit(self, model_type, fit_level, k_folds):
212 |         if self._total_time_limit is None:
213 |             return 7 * 24 * 3600
214 | 
215 |         if self._model_time_limit is not None:
216 |             return self._model_time_limit / k_folds
217 | 
218 |         # just train them ...
219 |         if fit_level == "simple_algorithms":
220 |             return None
221 |         if fit_level == "default_algorithms":
222 |             return None
223 | 
224 |         tune_algorithms = [
225 |             a
226 |             for a in self._algorithms
227 |             if a not in ["Baseline", "Linear", "Decision Tree", "Nearest Neighbors"]
228 |         ]
229 |         tune_algs_cnt = len(tune_algorithms)
230 |         if tune_algs_cnt == 0:
231 |             return None
232 | 
233 |         time_elapsed = time.time() - self._start_time
234 |         time_left = self._total_time_limit - time_elapsed
235 | 
236 |         if fit_level == "not_so_random":
237 |             tt = self.time_should_use(fit_level)
238 | 
239 |             tt /= tune_algs_cnt  # give time equally for each algorithm
240 |             tt /= k_folds  # time is per learner (per fold)
241 |             return tt
242 | 
243 |         if "hill_climbing" in fit_level:
244 |             tt = self.time_should_use(fit_level)
245 |             tt /= tune_algs_cnt  # give time equally for each algorithm
246 |             tt /= k_folds  # time is per learner (per fold)
247 |             return tt
248 | 
249 |         if self._is_stacking and fit_level == "stack":
250 |             tt = time_left
251 |             tt /= tune_algs_cnt  # give time equally for each algorithm
252 |             tt /= k_folds  # time is per learner (per fold)
253 |             return tt
254 | 
255 |     def log_time(self, model_name, model_type, fit_level, train_time):
256 |         self._spend += [
257 |             {
258 |                 "model_name": model_name,
259 |                 "model_type": model_type,
260 |                 "fit_level": fit_level,
261 |                 "train_time": train_time,
262 |             }
263 |         ]
264 |         # print(pd.DataFrame(self._spend))
265 |         # print("Already spend", self.already_spend())
266 | 
267 |     def step_spend(self, step):
268 |         return np.sum([s["train_time"] for s in self._spend if s["fit_level"] == step])
269 | 
270 |     def model_spend(self, model_type):
271 |         return np.sum(
272 |             [s["train_time"] for s in self._spend if s["model_type"] == model_type]
273 |         )
274 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/early_stopping.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | import os
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | from supervised.callbacks.callback import Callback
  8 | from supervised.utils.config import LOG_LEVEL
  9 | from supervised.utils.metric import Metric
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | logger.setLevel(LOG_LEVEL)
 13 | 
 14 | 
 15 | class EarlyStopping(Callback):
 16 |     """
 17 |     Callback that follows the validation loss of every learner.
 18 | 
 19 |     For each learner (keyed by `learner.uid`) it keeps the best loss, the
 20 |     iteration at which it was reached, the validation predictions from
 21 |     that iteration and a copy of the model. The current learner gets
 22 |     `stop_training = True` when the count of iterations without
 23 |     improvement exceeds `max_no_improvement_cnt`.
 24 |     """
 25 | 
 26 |     def __init__(self, params):
 27 |         """
 28 |         Parameters
 29 |         ----------
 30 |         params : dict
 31 |             Read keys: `name` (default "early_stopping"), `metric`,
 32 |             `max_no_improvement_cnt` (default 5), `log_to_dir` and
 33 |             `keep_best_model` (default True).
 34 |         """
 35 |         super(EarlyStopping, self).__init__(params)
 36 |         self.name = params.get("name", "early_stopping")
 37 |         self.metric = Metric(params.get("metric"))
 38 |         self.max_no_improvement_cnt = params.get("max_no_improvement_cnt", 5)
 39 |         self.log_to_dir = params.get("log_to_dir")
 40 | 
 41 |         # NOTE(review): stored, but not read anywhere else in this class
 42 |         self.keep_best_model = params.get("keep_best_model", True)
 43 |         # the dictionaries below are keyed by learner.uid
 44 |         self.best_iter = {}
 45 |         self.best_loss = {}
 46 |         self.loss_values = {}
 47 |         self.best_models = {}
 48 |         self.best_y_predicted = {}
 49 |         self.best_y_oof = (
 50 |             None  # predictions computed on out of folds or on validation set
 51 |         )
 52 |         self.final_loss = (
 53 |             None  # final score computed on combined predictions from all learners
 54 |         )
 55 |         # path to best model local copy, only used if cannot deep copy
 56 |         self.best_model_paths = {}
 57 |         self.multiple_target = False
 58 |         self.target_columns = None
 59 | 
 60 |     def add_and_set_learner(self, learner):
 61 |         """Make `learner` the current learner and reset its tracked state."""
 62 |         # NOTE(review): self.learners is not created in this class,
 63 |         # presumably it comes from the Callback base class - confirm
 64 |         self.learners += [learner]
 65 |         self.learner = learner
 66 |         self.best_iter[learner.uid] = None
 67 |         self.best_loss[learner.uid] = self.metric.worst_value()
 68 |         self.loss_values[learner.uid] = {"train": [], "validation": [], "iters": []}
 69 |         self.best_models[learner.uid] = None
 70 |         self.best_model_paths[learner.uid] = None
 71 |         self.best_y_predicted[learner.uid] = None
 72 | 
 73 |     def on_learner_train_start(self, logs):
 74 |         """Reset the no-improvement counter; `logs` is not used."""
 75 |         self.no_improvement_cnt = 0
 76 | 
 77 |     def on_framework_train_end(self, logs):
 78 |         """
 79 |         Combine the best predictions of all learners and score them.
 80 | 
 81 |         The combined frame is stored in `self.best_y_oof` and the metric
 82 |         value computed on it in `self.final_loss`; `logs` is not used.
 83 |         """
 84 |         # aggregate predictions from all learners
 85 |         # it has two columns: 'prediction', 'target'
 86 |         logger.debug("early stopping on framework train end")
 87 |         self.best_y_oof = pd.concat(list(self.best_y_predicted.values()))
 88 |         self.best_y_oof.sort_index(inplace=True)
 89 |         # check for duplicates in index -> repeats of validation
 90 |         if np.sum(self.best_y_oof.index.duplicated()):
 91 |             # we need to aggregate predictions from multiple repeats
 92 |             # every column without "prediction" in its name is handled as a
 93 |             # target column (this includes sample_weight and sensitive_*)
 94 |             target_cols = [c for c in self.best_y_oof.columns if "prediction" not in c]
 95 |             prediction_cols = [c for c in self.best_y_oof.columns if "prediction" in c]
 96 | 
 97 |             aggs = {}
 98 |             for t in target_cols:
 99 |                 aggs[t] = "first"
100 |             for p in prediction_cols:
101 |                 aggs[p] = "mean"
102 |             # aggregate predictions from repeats
103 |             # NOTE(review): both column names and level=0 are passed to
104 |             # groupby - confirm how pandas treats this combination
105 |             self.best_y_oof = self.best_y_oof.groupby(
106 |                 target_cols + prediction_cols, level=0
107 |             ).agg(aggs)
108 | 
109 |         sample_weight = None
110 |         if "sample_weight" in self.best_y_oof.columns:
111 |             sample_weight = self.best_y_oof["sample_weight"]
112 | 
113 |         # "prediction" is the default name of a single prediction column
114 |         if "prediction" in self.best_y_oof:
115 |             self.final_loss = self.metric(
116 |                 self.best_y_oof[self.target_columns],
117 |                 self.best_y_oof["prediction"],
118 |                 sample_weight=sample_weight,
119 |             )
120 |         else:
121 |             # otherwise score all columns with "prediction" in the name
122 |             prediction_cols = [c for c in self.best_y_oof.columns if "prediction" in c]
123 |             self.final_loss = self.metric(
124 |                 self.best_y_oof[self.target_columns],
125 |                 self.best_y_oof[prediction_cols],
126 |                 sample_weight=sample_weight,
127 |             )
128 | 
129 |     def on_iteration_end(self, logs, predictions):
130 |         """
131 |         Score the finished iteration and update the best state of the learner.
132 | 
133 |         Parameters
134 |         ----------
135 |         logs : dict
136 |             `iter_cnt` is read from it.
137 |         predictions : dict
138 |             Read keys: `y_train_true`, `y_train_predicted`, `sample_weight`,
139 |             `y_validation_true`, `y_validation_predicted`,
140 |             `sample_weight_validation`, `validation_index`,
141 |             `validation_columns` and `sensitive_features_validation`.
142 |         """
143 |         # train loss stays 0 when there are no train predictions
144 |         train_loss = 0
145 |         if predictions.get("y_train_predicted") is not None:
146 |             train_loss = self.metric(
147 |                 predictions.get("y_train_true"),
148 |                 predictions.get("y_train_predicted"),
149 |                 predictions.get("sample_weight"),
150 |             )
151 | 
152 |         validation_loss = self.metric(
153 |             predictions.get("y_validation_true"),
154 |             predictions.get("y_validation_predicted"),
155 |             predictions.get("sample_weight_validation"),
156 |         )
157 |         self.loss_values[self.learner.uid]["train"] += [train_loss]
158 |         self.loss_values[self.learner.uid]["validation"] += [validation_loss]
159 |         self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")]
160 | 
161 |         # on improvement: reset the counter and remember the iteration,
162 |         # the loss, the validation predictions and the model
163 |         if self.metric.improvement(
164 |             previous=self.best_loss[self.learner.uid], current=validation_loss
165 |         ):
166 |             y_validation_true = predictions.get("y_validation_true")
167 |             self.no_improvement_cnt = 0
168 |             self.best_iter[self.learner.uid] = logs.get("iter_cnt")
169 |             self.best_loss[self.learner.uid] = validation_loss
170 | 
171 |             # one target column is stored under the name "target"
172 |             if len(y_validation_true.shape) == 1 or y_validation_true.shape[1] == 1:
173 |                 self.best_y_predicted[self.learner.uid] = pd.DataFrame(
174 |                     {
175 |                         "target": np.array(y_validation_true)
176 |                         # y_validation_true.values.reshape(
177 |                         #    y_validation_true.shape[0]
178 |                         # )
179 |                     },
180 |                     index=predictions.get("validation_index"),
181 |                 )
182 |                 self.multiple_target = False
183 |                 self.target_columns = "target"
184 |             else:
185 |                 # in case of Neural Networks and multi-class classification with one-hot encoding
186 |                 self.best_y_predicted[self.learner.uid] = pd.DataFrame(
187 |                     y_validation_true, index=predictions.get("validation_index")
188 |                 )
189 |                 self.multiple_target = True
190 |                 self.target_columns = y_validation_true.columns
191 | 
192 |             y_validation_predicted = predictions.get("y_validation_predicted")
193 | 
194 |             if len(y_validation_predicted.shape) == 1:
195 |                 # only one prediction column (binary classification or regression)
196 |                 col = predictions.get("validation_columns", "prediction")
197 |                 self.best_y_predicted[self.learner.uid][col] = np.array(
198 |                     y_validation_predicted
199 |                 )
200 |             else:
201 |                 # several columns in multiclass classification
202 |                 cols = predictions.get("validation_columns")
203 |                 for i_col in range(y_validation_predicted.shape[1]):
204 |                     self.best_y_predicted[self.learner.uid][
205 |                         # "prediction_{}".format(i_col)
206 |                         cols[i_col]
207 |                     ] = y_validation_predicted[:, i_col]
208 | 
209 |             # store sample_weight
210 |             sample_weight_validation = predictions.get("sample_weight_validation")
211 |             if sample_weight_validation is not None:
212 |                 self.best_y_predicted[self.learner.uid]["sample_weight"] = np.array(
213 |                     sample_weight_validation
214 |                 )
215 |             # store sensitive features
216 |             sensitive_features_validation = predictions.get(
217 |                 "sensitive_features_validation"
218 |             )
219 | 
220 |             if sensitive_features_validation is not None:
221 |                 for col in list(sensitive_features_validation.columns):
222 |                     self.best_y_predicted[self.learner.uid][
223 |                         f"sensitive_{col}"
224 |                     ] = np.array(sensitive_features_validation[col])
225 | 
226 |             self.best_models[self.learner.uid] = self.learner.copy()
227 |             # if local copy is not available, save model and keep path
228 |             if self.best_models[self.learner.uid] is None:
229 |                 self.best_model_paths[self.learner.uid] = self.learner.save()
230 |         else:
231 |             self.no_improvement_cnt += 1
232 | 
233 |         # ask the learner to stop when the limit is exceeded
234 |         if self.no_improvement_cnt > self.max_no_improvement_cnt:
235 |             self.learner.stop_training = True
236 | 
237 |         logger.info(
238 |             "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, "
239 |             "no improvement cnt {}, iters {}".format(
240 |                 train_loss,
241 |                 validation_loss,
242 |                 self.no_improvement_cnt,
243 |                 len(self.loss_values[self.learner.uid]["iters"]),
244 |             )
245 |         )
246 | 
247 |         # the training log file is written here only when log_to_dir is
248 |         # set and the algorithm is not one of the listed ones
249 |         if self.log_to_dir is not None and self.learner.algorithm_short_name not in [
250 |             "Xgboost",
251 |             "Random Forest",
252 |             "Extra Trees",
253 |             "LightGBM",
254 |             "CatBoost",
255 |             "Neural Network",
256 |         ]:
257 |             # losses are multiplied by -1 when Metric.optimize_negative is true
258 |             sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0
259 |             with open(
260 |                 os.path.join(self.log_to_dir, f"{self.learner.name}_training.log"), "a"
261 |             ) as fout:
262 |                 iteration = len(self.loss_values[self.learner.uid]["iters"])
263 |                 fout.write(f"{iteration},{sign*train_loss},{sign*validation_loss}\n")
264 | 
265 |     def get_status(self):
266 |         """Return a text with the last train and validation loss of the current learner."""
267 |         return "Train loss: {}, Validation loss: {} @ iteration {}".format(
268 |             self.loss_values[self.learner.uid]["train"][-1],
269 |             self.loss_values[self.learner.uid]["validation"][-1],
270 |             len(self.loss_values[self.learner.uid]["iters"]),
271 |         )
214 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/decision_tree.py:
--------------------------------------------------------------------------------

```python
  1 | import logging
  2 | import os
  3 | import warnings
  4 | 
  5 | import numpy as np
  6 | import sklearn
  7 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
  8 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
  9 | 
 10 | from supervised.algorithms.registry import (
 11 |     BINARY_CLASSIFICATION,
 12 |     MULTICLASS_CLASSIFICATION,
 13 |     REGRESSION,
 14 |     AlgorithmsRegistry,
 15 | )
 16 | from supervised.algorithms.sklearn import SklearnAlgorithm
 17 | from supervised.utils.config import LOG_LEVEL
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | logger.setLevel(LOG_LEVEL)
 21 | 
 22 | import dtreeviz
 23 | from sklearn.tree import _tree
 24 | 
 25 | from supervised.utils.subsample import subsample
 26 | 
 27 | 
 28 | def get_rules(tree, feature_names, class_names):
 29 |     tree_ = tree.tree_
 30 |     feature_name = [
 31 |         feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
 32 |         for i in tree_.feature
 33 |     ]
 34 | 
 35 |     paths = []
 36 |     path = []
 37 | 
 38 |     def recurse(node, path, paths):
 39 |         if tree_.feature[node] != _tree.TREE_UNDEFINED:
 40 |             name = feature_name[node]
 41 |             threshold = tree_.threshold[node]
 42 |             p1, p2 = list(path), list(path)
 43 |             p1 += [f"({name} <= {np.round(threshold, 3)})"]
 44 |             recurse(tree_.children_left[node], p1, paths)
 45 |             p2 += [f"({name} > {np.round(threshold, 3)})"]
 46 |             recurse(tree_.children_right[node], p2, paths)
 47 |         else:
 48 |             path += [(tree_.value[node], tree_.n_node_samples[node])]
 49 |             paths += [path]
 50 | 
 51 |     recurse(0, path, paths)
 52 | 
 53 |     # sort by samples count
 54 |     samples_count = [p[-1][1] for p in paths]
 55 |     ii = list(np.argsort(samples_count))
 56 |     paths = [paths[i] for i in reversed(ii)]
 57 | 
 58 |     rules = []
 59 |     for path in paths:
 60 |         rule = "if "
 61 | 
 62 |         for p in path[:-1]:
 63 |             if rule != "if ":
 64 |                 rule += " and "
 65 |             rule += str(p)
 66 |         rule += " then "
 67 |         if class_names is None:
 68 |             rule += "response: " + str(np.round(path[-1][0][0][0], 3))
 69 |         else:
 70 |             classes = path[-1][0][0]
 71 |             l = np.argmax(classes)
 72 |             rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)"
 73 |         rule += f" | based on {path[-1][1]:,} samples"
 74 |         rules += [rule]
 75 | 
 76 |     return rules
 77 | 
 78 | 
 79 | def save_rules(tree, feature_names, class_names, model_file_path, learner_name):
 80 |     try:
 81 |         rules = get_rules(tree, feature_names, class_names)
 82 |         fname = os.path.join(model_file_path, f"{learner_name}_rules.txt")
 83 |         with open(fname, "w") as fout:
 84 |             for r in rules:
 85 |                 fout.write(r + "\n\n")
 86 |     except Exception as e:
 87 |         logger.info(f"Problem with extracting decision tree rules. {str(e)}")
 88 | 
 89 | 
 90 | class DecisionTreeAlgorithm(ClassifierMixin, SklearnAlgorithm):
 91 |     algorithm_name = "Decision Tree"
 92 |     algorithm_short_name = "Decision Tree"
 93 | 
 94 |     def __init__(self, params):
 95 |         super(DecisionTreeAlgorithm, self).__init__(params)
 96 |         logger.debug("DecisionTreeAlgorithm.__init__")
 97 |         self.library_version = sklearn.__version__
 98 |         self.max_iters = additional.get("max_steps", 1)
 99 |         self.model = DecisionTreeClassifier(
100 |             criterion=params.get("criterion", "gini"),
101 |             max_depth=params.get("max_depth", 3),
102 |             random_state=params.get("seed", 1),
103 |         )
104 | 
105 |     def file_extension(self):
106 |         return "decision_tree"
107 | 
108 |     def interpret(
109 |         self,
110 |         X_train,
111 |         y_train,
112 |         X_validation,
113 |         y_validation,
114 |         model_file_path,
115 |         learner_name,
116 |         target_name=None,
117 |         class_names=None,
118 |         metric_name=None,
119 |         ml_task=None,
120 |         explain_level=2,
121 |     ):
122 |         super(DecisionTreeAlgorithm, self).interpret(
123 |             X_train,
124 |             y_train,
125 |             X_validation,
126 |             y_validation,
127 |             model_file_path,
128 |             learner_name,
129 |             target_name,
130 |             class_names,
131 |             metric_name,
132 |             ml_task,
133 |             explain_level,
134 |         )
135 |         if explain_level == 0:
136 |             return
137 |         with warnings.catch_warnings():
138 |             warnings.simplefilter(action="ignore")
139 |             try:
140 |                 if len(class_names) > 10:
141 |                     # dtreeviz does not support more than 10 classes
142 |                     return
143 | 
144 |                 viz = dtreeviz.model(
145 |                     self.model,
146 |                     X_train,
147 |                     y_train,
148 |                     target_name="target",
149 |                     feature_names=X_train.columns,
150 |                     class_names=class_names,
151 |                 )
152 |                 tree_file_plot = os.path.join(
153 |                     model_file_path, learner_name + "_tree.svg"
154 |                 )
155 |                 viz.view().save(tree_file_plot)
156 |             except Exception as e:
157 |                 logger.info(f"Problem when visualizing decision tree. {str(e)}")
158 | 
159 |             save_rules(
160 |                 self.model, X_train.columns, class_names, model_file_path, learner_name
161 |             )
162 | 
163 | 
164 | class DecisionTreeRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
165 |     algorithm_name = "Decision Tree"
166 |     algorithm_short_name = "Decision Tree"
167 | 
168 |     def __init__(self, params):
169 |         super(DecisionTreeRegressorAlgorithm, self).__init__(params)
170 |         logger.debug("DecisionTreeRegressorAlgorithm.__init__")
171 |         self.library_version = sklearn.__version__
172 |         self.max_iters = additional.get("max_steps", 1)
173 |         self.model = DecisionTreeRegressor(
174 |             criterion=params.get("criterion", "squared_error"),
175 |             max_depth=params.get("max_depth", 3),
176 |             random_state=params.get("seed", 1),
177 |         )
178 | 
179 |     def file_extension(self):
180 |         return "decision_tree"
181 | 
182 |     def interpret(
183 |         self,
184 |         X_train,
185 |         y_train,
186 |         X_validation,
187 |         y_validation,
188 |         model_file_path,
189 |         learner_name,
190 |         target_name=None,
191 |         class_names=None,
192 |         metric_name=None,
193 |         ml_task=None,
194 |         explain_level=2,
195 |     ):
196 |         super(DecisionTreeRegressorAlgorithm, self).interpret(
197 |             X_train,
198 |             y_train,
199 |             X_validation,
200 |             y_validation,
201 |             model_file_path,
202 |             learner_name,
203 |             target_name,
204 |             class_names,
205 |             metric_name,
206 |             ml_task,
207 |             explain_level,
208 |         )
209 |         if explain_level == 0:
210 |             return
211 |         with warnings.catch_warnings():
212 |             warnings.simplefilter(action="ignore")
213 |             try:
214 |                 # 250 is hard limit for number of points used in visualization
215 |                 # if too many points are used then final SVG plot is very large (can be > 100MB)
216 |                 if X_train.shape[0] > 250:
217 |                     x, _, y, _ = subsample(X_train, y_train, REGRESSION, 250)
218 |                     viz = dtreeviz(
219 |                         self.model,
220 |                         x,
221 |                         y,
222 |                         target_name="target",
223 |                         feature_names=x.columns,
224 |                     )
225 |                 else:
226 |                     viz = dtreeviz.model(
227 |                         self.model,
228 |                         X_train,
229 |                         y_train,
230 |                         target_name="target",
231 |                         feature_names=X_train.columns,
232 |                     )
233 |                 tree_file_plot = os.path.join(
234 |                     model_file_path, learner_name + "_tree.svg"
235 |                 )
236 |                 viz.view().save(tree_file_plot)
237 |             except Exception as e:
238 |                 logger.info(
239 |                     f"Problem when visuzalizin decision tree regressor. {str(e)}"
240 |                 )
241 | 
242 |             save_rules(self.model, X_train.columns, None, model_file_path, learner_name)
243 | 
244 | 
245 | dt_params = {"criterion": ["gini", "entropy"], "max_depth": [2, 3, 4]}
246 | 
247 | classification_default_params = {"criterion": "gini", "max_depth": 3}
248 | 
249 | additional = {
250 |     "trees_in_step": 1,
251 |     "train_cant_improve_limit": 0,
252 |     "max_steps": 1,
253 |     "max_rows_limit": None,
254 |     "max_cols_limit": None,
255 | }
256 | required_preprocessing = [
257 |     "missing_values_inputation",
258 |     "convert_categorical",
259 |     "datetime_transform",
260 |     "text_transform",
261 |     "target_as_integer",
262 | ]
263 | 
264 | AlgorithmsRegistry.add(
265 |     BINARY_CLASSIFICATION,
266 |     DecisionTreeAlgorithm,
267 |     dt_params,
268 |     required_preprocessing,
269 |     additional,
270 |     classification_default_params,
271 | )
272 | 
273 | AlgorithmsRegistry.add(
274 |     MULTICLASS_CLASSIFICATION,
275 |     DecisionTreeAlgorithm,
276 |     dt_params,
277 |     required_preprocessing,
278 |     additional,
279 |     classification_default_params,
280 | )
281 | 
282 | dt_regression_params = {
283 |     "criterion": [
284 |         "squared_error",
285 |         "friedman_mse",
286 |     ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
287 |     "max_depth": [2, 3, 4],
288 | }
289 | regression_required_preprocessing = [
290 |     "missing_values_inputation",
291 |     "convert_categorical",
292 |     "datetime_transform",
293 |     "text_transform",
294 | ]
295 | 
296 | regression_default_params = {"criterion": "squared_error", "max_depth": 3}
297 | 
298 | AlgorithmsRegistry.add(
299 |     REGRESSION,
300 |     DecisionTreeRegressorAlgorithm,
301 |     dt_regression_params,
302 |     regression_required_preprocessing,
303 |     additional,
304 |     regression_default_params,
305 | )
306 | 
```