mljar/mljar-supervised # codebase.md

This is page 1 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .github
│   └── workflows
│       ├── run-tests.yml
│       ├── test-installation-with-conda.yml
│       └── test-installation-with-pip-on-windows.yml
├── .gitignore
├── CITATION
├── examples
│   ├── notebooks
│   │   ├── basic_run.ipynb
│   │   └── Titanic.ipynb
│   └── scripts
│       ├── binary_classifier_adult_fairness.py
│       ├── binary_classifier_ensemble.py
│       ├── binary_classifier_marketing.py
│       ├── binary_classifier_random.py
│       ├── binary_classifier_Titanic.py
│       ├── binary_classifier.py
│       ├── multi_class_classifier_digits.py
│       ├── multi_class_classifier_MNIST.py
│       ├── multi_class_classifier.py
│       ├── multi_class_drug_fairness.py
│       ├── regression_acs_fairness.py
│       ├── regression_crime_fairness.py
│       ├── regression_housing_fairness.py
│       ├── regression_law_school_fairness.py
│       ├── regression.py
│       └── tabular_mar_2021.py
├── LICENSE
├── MANIFEST.in
├── pytest.ini
├── README.md
├── requirements_dev.txt
├── requirements.txt
├── setup.py
├── supervised
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── algorithm.py
│   │   ├── baseline.py
│   │   ├── catboost.py
│   │   ├── decision_tree.py
│   │   ├── extra_trees.py
│   │   ├── factory.py
│   │   ├── knn.py
│   │   ├── lightgbm.py
│   │   ├── linear.py
│   │   ├── nn.py
│   │   ├── random_forest.py
│   │   ├── registry.py
│   │   ├── sklearn.py
│   │   └── xgboost.py
│   ├── automl.py
│   ├── base_automl.py
│   ├── callbacks
│   │   ├── __init__.py
│   │   ├── callback_list.py
│   │   ├── callback.py
│   │   ├── early_stopping.py
│   │   ├── learner_time_constraint.py
│   │   ├── max_iters_constraint.py
│   │   ├── metric_logger.py
│   │   ├── terminate_on_nan.py
│   │   └── total_time_constraint.py
│   ├── ensemble.py
│   ├── exceptions.py
│   ├── fairness
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── plots.py
│   │   ├── report.py
│   │   └── utils.py
│   ├── model_framework.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── datetime_transformer.py
│   │   ├── encoding_selector.py
│   │   ├── exclude_missing_target.py
│   │   ├── goldenfeatures_transformer.py
│   │   ├── kmeans_transformer.py
│   │   ├── label_binarizer.py
│   │   ├── label_encoder.py
│   │   ├── preprocessing_categorical.py
│   │   ├── preprocessing_missing.py
│   │   ├── preprocessing_utils.py
│   │   ├── preprocessing.py
│   │   ├── scale.py
│   │   └── text_transformer.py
│   ├── tuner
│   │   ├── __init__.py
│   │   ├── data_info.py
│   │   ├── hill_climbing.py
│   │   ├── mljar_tuner.py
│   │   ├── optuna
│   │   │   ├── __init__.py
│   │   │   ├── catboost.py
│   │   │   ├── extra_trees.py
│   │   │   ├── knn.py
│   │   │   ├── lightgbm.py
│   │   │   ├── nn.py
│   │   │   ├── random_forest.py
│   │   │   ├── tuner.py
│   │   │   └── xgboost.py
│   │   ├── preprocessing_tuner.py
│   │   ├── random_parameters.py
│   │   └── time_controller.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── additional_metrics.py
│   │   ├── additional_plots.py
│   │   ├── automl_plots.py
│   │   ├── common.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── data_validation.py
│   │   ├── importance.py
│   │   ├── jsonencoder.py
│   │   ├── leaderboard_plots.py
│   │   ├── learning_curves.py
│   │   ├── metric.py
│   │   ├── shap.py
│   │   ├── subsample.py
│   │   └── utils.py
│   └── validation
│       ├── __init__.py
│       ├── validation_step.py
│       ├── validator_base.py
│       ├── validator_custom.py
│       ├── validator_kfold.py
│       └── validator_split.py
└── tests
    ├── __init__.py
    ├── checks
    │   ├── __init__.py
    │   ├── check_automl_with_regression.py
    │   ├── run_ml_tests.py
    │   └── run_performance_tests.py
    ├── conftest.py
    ├── data
    │   ├── 179.csv
    │   ├── 24.csv
    │   ├── 3.csv
    │   ├── 31.csv
    │   ├── 38.csv
    │   ├── 44.csv
    │   ├── 720.csv
    │   ├── 737.csv
    │   ├── acs_income_1k.csv
    │   ├── adult_missing_values_missing_target_500rows.csv
    │   ├── boston_housing.csv
    │   ├── CrimeData
    │   │   ├── cities.json
    │   │   ├── crimedata.csv
    │   │   └── README.md
    │   ├── Drug
    │   │   ├── Drug_Consumption.csv
    │   │   └── README.md
    │   ├── housing_regression_missing_values_missing_target.csv
    │   ├── iris_classes_missing_values_missing_target.csv
    │   ├── iris_missing_values_missing_target.csv
    │   ├── LawSchool
    │   │   ├── bar_pass_prediction.csv
    │   │   └── README.md
    │   ├── PortugeseBankMarketing
    │   │   └── Data_FinalProject.csv
    │   └── Titanic
    │       ├── test_with_Survived.csv
    │       └── train.csv
    ├── README.md
    ├── tests_algorithms
    │   ├── __init__.py
    │   ├── test_baseline.py
    │   ├── test_catboost.py
    │   ├── test_decision_tree.py
    │   ├── test_extra_trees.py
    │   ├── test_factory.py
    │   ├── test_knn.py
    │   ├── test_lightgbm.py
    │   ├── test_linear.py
    │   ├── test_nn.py
    │   ├── test_random_forest.py
    │   ├── test_registry.py
    │   └── test_xgboost.py
    ├── tests_automl
    │   ├── __init__.py
    │   ├── test_adjust_validation.py
    │   ├── test_automl_init.py
    │   ├── test_automl_report.py
    │   ├── test_automl_sample_weight.py
    │   ├── test_automl_time_constraints.py
    │   ├── test_automl.py
    │   ├── test_data_types.py
    │   ├── test_dir_change.py
    │   ├── test_explain_levels.py
    │   ├── test_golden_features.py
    │   ├── test_handle_imbalance.py
    │   ├── test_integration.py
    │   ├── test_joblib_version.py
    │   ├── test_models_needed_for_predict.py
    │   ├── test_prediction_after_load.py
    │   ├── test_repeated_validation.py
    │   ├── test_restore.py
    │   ├── test_stack_models_constraints.py
    │   ├── test_targets.py
    │   └── test_update_errors_report.py
    ├── tests_callbacks
    │   ├── __init__.py
    │   └── test_total_time_constraint.py
    ├── tests_ensemble
    │   ├── __init__.py
    │   └── test_save_load.py
    ├── tests_fairness
    │   ├── __init__.py
    │   ├── test_binary_classification.py
    │   ├── test_multi_class_classification.py
    │   └── test_regression.py
    ├── tests_preprocessing
    │   ├── __init__.py
    │   ├── disable_eda.py
    │   ├── test_categorical_integers.py
    │   ├── test_datetime_transformer.py
    │   ├── test_encoding_selector.py
    │   ├── test_exclude_missing.py
    │   ├── test_goldenfeatures_transformer.py
    │   ├── test_label_binarizer.py
    │   ├── test_label_encoder.py
    │   ├── test_preprocessing_missing.py
    │   ├── test_preprocessing_utils.py
    │   ├── test_preprocessing.py
    │   ├── test_scale.py
    │   └── test_text_transformer.py
    ├── tests_tuner
    │   ├── __init__.py
    │   ├── test_hill_climbing.py
    │   ├── test_time_controller.py
    │   └── test_tuner.py
    ├── tests_utils
    │   ├── __init__.py
    │   ├── test_compute_additional_metrics.py
    │   ├── test_importance.py
    │   ├── test_learning_curves.py
    │   ├── test_metric.py
    │   ├── test_shap.py
    │   └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py
```

# Files

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
  1 | AutoML_*
  2 | .vscode
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | .hypothesis/
 50 | .pytest_cache/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | db.sqlite3
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # Environments
 87 | .env
 88 | .venv
 89 | env/
 90 | venv/
 91 | ENV/
 92 | env.bak/
 93 | venv.bak/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
```

--------------------------------------------------------------------------------
/tests/data/LawSchool/README.md:
--------------------------------------------------------------------------------

```markdown
1 | Source: https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage
```

--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------

```markdown
 1 | # Running tests
 2 | 
 3 | 
 4 | To run all tests:
 5 | 
 6 | ```
 7 | pytest tests -v -x
 8 | ```
 9 | 
10 | To run tests for `algorithms`:
11 | 
12 | ```
13 | pytest tests/tests_algorithms -v -x -s
14 | ```
```

--------------------------------------------------------------------------------
/tests/data/CrimeData/README.md:
--------------------------------------------------------------------------------

```markdown
1 | Source: https://www.kaggle.com/datasets/kkanda/communities%20and%20crime%20unnormalized%20data%20set?select=crimedata.csv
2 | Description: http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized
```

--------------------------------------------------------------------------------
/tests/data/Drug/README.md:
--------------------------------------------------------------------------------

```markdown
 1 | Source: https://www.kaggle.com/datasets/obeykhadija/drug-consumptions-uci
 2 | 
 3 | 
 4 | Ratings for Drug Use:
 5 | 
 6 | CL0 Never Used
 7 | 
 8 | CL1 Used over a Decade Ago
 9 | 
10 | CL2 Used in Last Decade
11 | 
12 | CL3 Used in Last Year
13 | 
14 | CL4 Used in Last Month
15 | 
16 | CL5 Used in Last Week
17 | 
18 | CL6 Used in Last Day
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
  1 | 
  2 | 
  3 | # MLJAR Automated Machine Learning for Humans
  4 | 
  5 | [![Tests status](https://github.com/mljar/mljar-supervised/actions/workflows/run-tests.yml/badge.svg)](https://github.com/mljar/mljar-supervised/actions/workflows/run-tests.yml)
  6 | [![PyPI version](https://badge.fury.io/py/mljar-supervised.svg)](https://badge.fury.io/py/mljar-supervised)
  7 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/mljar-supervised/badges/version.svg)](https://anaconda.org/conda-forge/mljar-supervised)
  8 | [![PyPI pyversions](https://img.shields.io/pypi/pyversions/mljar-supervised.svg)](https://pypi.python.org/pypi/mljar-supervised/)
  9 | 
 10 | 
 11 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/mljar-supervised/badges/platforms.svg)](https://anaconda.org/conda-forge/mljar-supervised)
 12 | [![Anaconda-Server Badge](https://anaconda.org/conda-forge/mljar-supervised/badges/license.svg)](https://anaconda.org/conda-forge/mljar-supervised)
 13 | [![Downloads](https://pepy.tech/badge/mljar-supervised)](https://pepy.tech/project/mljar-supervised)
 14 | 
 15 | <p align="center">
 16 |   <img 
 17 |     alt="mljar AutoML"
 18 |     src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/AutoML_white.png#gh-light-mode-only" width="50%" />  
 19 | </p>
 20 | <p align="center">
 21 |   <img 
 22 |     alt="mljar AutoML"
 23 |     src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/AutoML_black.png#gh-dark-mode-only" width="50%" />  
 24 | </p>
 25 | 
 26 | ---
 27 | 
 28 | **Documentation**: <a href="https://supervised.mljar.com/" target="_blank">https://supervised.mljar.com/</a>
 29 | 
 30 | **Source Code**: <a href="https://github.com/mljar/mljar-supervised" target="_blank">https://github.com/mljar/mljar-supervised</a>
 31 | 
 32 | **Looking for commercial support**: Please contact us by [email](https://mljar.com/contact/) for details
 33 | 
 34 | 
 35 | <p align="center">
 36 |   <img src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/pipeline_AutoML.png" width="100%" />
 37 | </p>
 38 | 
 39 | ---
 40 | 
 41 | Watch full AutoML training in Python under 2 minutes. The training is done in [MLJAR Studio](https://mljar.com).
 42 | 
 43 | [![](https://github.com/mljar/studio/blob/main/media/mljar-studio-automl-get-started.jpg?raw=true)](https://youtu.be/t_opxR5dbPU) 
 44 | 
 45 | ## Table of Contents
 46 | 
 47 |  - [Automated Machine Learning](https://github.com/mljar/mljar-supervised#automated-machine-learning)
 48 |  - [What's good in it?](https://github.com/mljar/mljar-supervised#whats-good-in-it)
 49 |  - [AutoML Web App with GUI](https://github.com/mljar/mljar-supervised#automl-web-app-with-user-interface)
 50 |  - [Automatic Documentation](https://github.com/mljar/mljar-supervised#automatic-documentation)
 51 |  - [Available Modes](https://github.com/mljar/mljar-supervised#available-modes)
 52 |  - [Fairness Aware Training](https://github.com/mljar/mljar-supervised#fairness-aware-training)
 53 |  - [Examples](https://github.com/mljar/mljar-supervised#examples)
 54 |  - [FAQ](https://github.com/mljar/mljar-supervised#faq)
 55 |  - [Documentation](https://github.com/mljar/mljar-supervised#documentation)
 56 |  - [Installation](https://github.com/mljar/mljar-supervised#installation)
 57 |  - [Demo](https://github.com/mljar/mljar-supervised#demo)
 58 |  - [Contributing](https://github.com/mljar/mljar-supervised#contributing)
 59 |  - [Cite](https://github.com/mljar/mljar-supervised#cite)
 60 |  - [License](https://github.com/mljar/mljar-supervised#license)
 61 |  - [Commercial support](https://github.com/mljar/mljar-supervised#commercial-support)
 62 |  - [MLJAR](https://github.com/mljar/mljar-supervised#mljar)
 63 |  
 64 | 
 65 | 
 66 | 
 67 | 
 68 | # Automated Machine Learning 
 69 | 
 70 | The `mljar-supervised` is an Automated Machine Learning Python package that works with tabular data. It is designed to save time for a data scientist. It abstracts the common way to preprocess the data, construct the machine learning models, and perform hyper-parameters tuning to find the best model :trophy:. It is no black box, as you can see exactly how the ML pipeline is constructed (with a detailed Markdown report for each ML model). 
 71 | 
 72 | The `mljar-supervised` will help you with:
 73 |  - explaining and understanding your data (Automatic Exploratory Data Analysis),
 74 |  - trying many different machine learning models (Algorithm Selection and Hyper-Parameters tuning),
 75 |  - creating Markdown reports from analysis with details about all models (Automatic-Documentation),
 76 |  - saving, re-running, and loading the analysis and ML models.
 77 | 
 78 | It has four built-in modes of work:
 79 |  - `Explain` mode, which is ideal for explaining and understanding the data, with many data explanations, like decision trees visualization, linear models coefficients display, permutation importance, and SHAP explanations of data,
 80 |  - `Perform` for building ML pipelines to use in production,
 81 |  - `Compete` mode that trains highly-tuned ML models with ensembling and stacking, with the purpose to use in ML competitions.
  82 |  - `Optuna` mode, which searches for highly-tuned ML models and should be used when performance is the most important and computation time is not limited (it is available from version `0.10.0`).
 83 | 
 84 | Of course, you can further customize the details of each `mode` to meet the requirements.
 85 | 
 86 | ## What's good in it? 
 87 | 
 88 | - It uses many algorithms: `Baseline`, `Linear`, `Random Forest`, `Extra Trees`, `LightGBM`, `Xgboost`, `CatBoost`, `Neural Networks`, and `Nearest Neighbors`.
 89 | - It can compute Ensemble based on a greedy algorithm from [Caruana paper](http://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf).
 90 | - It can stack models to build a level 2 ensemble (available in `Compete` mode or after setting the `stack_models` parameter).
 91 | - It can do features preprocessing, like missing values imputation and converting categoricals. What is more, it can also handle target values preprocessing.
 92 | - It can do advanced features engineering, like [Golden Features](https://supervised.mljar.com/features/golden_features/), [Features Selection](https://supervised.mljar.com/features/features_selection/), Text and Time Transformations.
 93 | - It can tune hyper-parameters with a `not-so-random-search` algorithm (random-search over a defined set of values) and hill climbing to fine-tune final models.
 94 | - It can compute the `Baseline` for your data so that you will know if you need Machine Learning or not!
 95 | - It has extensive explanations. This package is training simple `Decision Trees` with `max_depth <= 5`, so you can easily visualize them with amazing [dtreeviz](https://github.com/parrt/dtreeviz) to better understand your data.
 96 | - The `mljar-supervised` uses simple linear regression and includes its coefficients in the summary report, so you can check which features are used the most in the linear model.
 97 | - It cares about the explainability of models: for every algorithm, the feature importance is computed based on permutation. Additionally, for every algorithm, the SHAP explanations are computed: feature importance, dependence plots, and decision plots (explanations can be switched off with the `explain_level` parameter).
 98 | - There is automatic documentation for every ML experiment run with AutoML. The `mljar-supervised` creates markdown reports from AutoML training full of ML details, metrics, and charts. 
 99 | 
100 | <p align="center">
101 |   <img src="https://raw.githubusercontent.com/mljar/visual-identity/main/media/infograph.png" width="100%" />
102 | </p>
103 | 
104 | # AutoML Web App with User Interface
105 | 
106 | We created a Web App with GUI, so you don't need to write any code 🐍. Just upload your data. Please check the Web App at [github.com/mljar/automl-app](https://github.com/mljar/automl-app). You can run this Web App locally on your computer, so your data is safe and secure :cat:
107 | 
108 | <kbd>
109 | <img src="https://github.com/mljar/automl-app/blob/main/media/web-app.gif" alt="AutoML training in Web App"></img>
110 | </kbd>
111 | 
112 | # Automatic Documentation
113 | 
114 | ## The AutoML Report
115 | 
116 | The report from running AutoML will contain the table with information about each model score and the time needed to train the model. There is a link for each model, which you can click to see the model's details. The performance of all ML models is presented as scatter and box plots so you can visually inspect which algorithms perform the best :trophy:.
117 | 
118 | ![AutoML leaderboard](https://github.com/mljar/mljar-examples/blob/master/media/automl_summary.gif)
119 | 
120 | ## The `Decision Tree` Report
121 | 
122 | The example for `Decision Tree` summary with trees visualization. For classification tasks, additional metrics are provided:
123 | - confusion matrix
124 | - threshold (optimized in the case of binary classification task)
125 | - F1 score
126 | - Accuracy
127 | - Precision, Recall, MCC
128 | 
129 | ![Decision Tree summary](https://github.com/mljar/mljar-examples/blob/master/media/decision_tree_summary.gif)
130 | 
131 | ## The `LightGBM` Report
132 | 
133 | The example for `LightGBM` summary:
134 | 
 135 | ![LightGBM summary](https://github.com/mljar/mljar-examples/blob/master/media/lightgbm_summary.gif)
136 | 
137 | 
138 | ## Available Modes
139 | 
140 | In the [docs](https://supervised.mljar.com/features/modes/) you can find details about AutoML modes that are presented in the table.
141 | 
142 | <p align="center">
143 |   <img src="https://raw.githubusercontent.com/mljar/visual-identity/main/media/mljar_modes.png" width="100%" />
144 | </p>
145 | 
146 | ### Explain 
147 | 
148 | ```py
149 | automl = AutoML(mode="Explain")
150 | ```
151 | 
152 | It is aimed to be used when the user wants to explain and understand the data.
153 |  - It is using 75%/25% train/test split. 
 154 |  - It uses: `Baseline`, `Linear`, `Decision Tree`, `Random Forest`, `Xgboost`, `Neural Network` algorithms, and ensemble. 
155 |  - It has full explanations: learning curves, importance plots, and SHAP plots.
156 | 
157 | ### Perform
158 | 
159 | ```py
160 | automl = AutoML(mode="Perform")
161 | ```
162 | 
163 | It should be used when the user wants to train a model that will be used in real-life use cases.
164 |  - It uses a 5-fold CV.
165 |  - It uses: `Linear`, `Random Forest`, `LightGBM`, `Xgboost`, `CatBoost`, and `Neural Network`. It uses ensembling. 
166 |  - It has learning curves and importance plots in reports.
167 | 
168 | ### Compete
169 | 
170 | ```py
171 | automl = AutoML(mode="Compete")
172 | ```
173 | 
174 | It should be used for machine learning competitions.
175 |  - It adapts the validation strategy depending on dataset size and `total_time_limit`. It can be: a train/test split (80/20), 5-fold CV or 10-fold CV. 
176 |  - It is using: `Linear`, `Decision Tree`, `Random Forest`, `Extra Trees`, `LightGBM`, `Xgboost`, `CatBoost`, `Neural Network`, and `Nearest Neighbors`. It uses ensemble and **stacking**. 
177 |  - It has only learning curves in the reports.
178 | 
179 | ### Optuna
180 | 
181 | ```py
182 | automl = AutoML(mode="Optuna", optuna_time_budget=3600)
183 | ```
184 | 
185 | It should be used when the performance is the most important and time is not limited.
186 | - It uses a 10-fold CV
187 | - It uses: `Random Forest`, `Extra Trees`, `LightGBM`, `Xgboost`, and `CatBoost`. Those algorithms are tuned by `Optuna` framework for `optuna_time_budget` seconds, each. Algorithms are tuned with original data, without advanced feature engineering.
188 | - It uses advanced feature engineering, stacking and ensembling. The hyperparameters found for original data are reused with those steps.
189 | - It produces learning curves in the reports.
190 | 
191 | 
192 | 
193 | ## How to save and load AutoML?
194 | 
195 | All models in the AutoML are saved and loaded automatically. No need to call `save()` or `load()`.
196 | 
197 | ### Example:
198 | 
199 | #### Train AutoML
200 | 
201 | ```python
202 | automl = AutoML(results_path="AutoML_classifier")
203 | automl.fit(X, y)
204 | ```
205 | 
206 | You will have all models saved in the `AutoML_classifier` directory. Each model will have a separate directory with the `README.md` file with all details from the training.
207 | 
208 | #### Compute predictions
209 | ```python
210 | automl = AutoML(results_path="AutoML_classifier")
211 | automl.predict(X)
212 | ```
213 | 
 214 | The AutoML automatically loads models from the `results_path` directory. If you call `fit()` on an already trained AutoML, you will get a warning message that AutoML is already fitted.
215 | 
216 | 
217 | ### Why do you automatically save all models?
218 | 
219 | All models are automatically saved to be able to restore the training after interruption. For example, you are training AutoML for 48 hours, and after 47 hours, there is some unexpected interruption. In MLJAR AutoML you just call the same training code after the interruption and AutoML reloads already trained models and finishes the training.
220 | 
221 | ## Supported evaluation metrics (`eval_metric` argument in `AutoML()`)
222 | 
 223 | - for binary classification: `logloss`, `auc`, `f1`, `average_precision`, `accuracy` - default is `logloss`
224 | - for multiclass classification: `logloss`, `f1`, `accuracy` - default is `logloss`
225 | - for regression: `rmse`, `mse`, `mae`, `r2`, `mape`, `spearman`, `pearson` - default is `rmse`
226 | 
227 | If you don't find the `eval_metric` that you need, please add a new issue. We will add it.
228 | 
229 | 
230 | ## Fairness Aware Training
231 | 
232 | Starting from version `1.0.0` AutoML can optimize the Machine Learning pipeline with sensitive features. There are the following fairness related arguments in the AutoML constructor:
233 |  - `fairness_metric` - metric which will be used to decide if the model is fair,
234 |  - `fairness_threshold` - threshold used in decision about model fairness,
235 |  - `privileged_groups` - privileged groups used in fairness metrics computation,
236 |  - `underprivileged_groups` - underprivileged groups used in fairness metrics computation.
237 | 
238 | The `fit()` method accepts `sensitive_features`. When sensitive features are passed to AutoML, the best model will be selected among fair models only. In the AutoML reports, additional information about fairness metrics will be added. The MLJAR AutoML supports two methods for bias mitigation:
239 |  - Sample Weighting - assigns weights to samples to treat samples equally,
240 |  - Smart Grid Search - similar to Sample Weighting, where different weights are checked to optimize fairness metric.
241 | 
242 | The fair ML building can be used with all algorithms, including `Ensemble` and `Stacked Ensemble`. We support three Machine Learning tasks:
243 |  - binary classification,
 244 |  - multiclass classification,
245 |  - regression.
246 | 
247 | Example code:
248 | 
249 | 
250 | ```python
251 | from sklearn.model_selection import train_test_split
252 | from sklearn.datasets import fetch_openml
253 | from supervised.automl import AutoML
254 | 
255 | data = fetch_openml(data_id=1590, as_frame=True)
256 | X = data.data
257 | y = (data.target == ">50K") * 1
258 | sensitive_features = X[["sex"]]
259 | 
260 | X_train, X_test, y_train, y_test, S_train, S_test = train_test_split(
261 |     X, y, sensitive_features, stratify=y, test_size=0.75, random_state=42
262 | )
263 | 
264 | automl = AutoML(
265 |     algorithms=[
266 |         "Xgboost"
267 |     ],
268 |     train_ensemble=False,
269 |     fairness_metric="demographic_parity_ratio",  
270 |     fairness_threshold=0.8,
271 |     privileged_groups = [{"sex": "Male"}],
272 |     underprivileged_groups = [{"sex": "Female"}],
273 | )
274 | 
275 | automl.fit(X_train, y_train, sensitive_features=S_train)
276 | ```
277 | 
278 | You can read more about fairness aware AutoML training in our article https://mljar.com/blog/fairness-machine-learning/
279 | 
280 | ![Fairness aware AutoML](https://raw.githubusercontent.com/mljar/visual-identity/main/automl/fairness-automl.gif)
281 | 
282 | 
283 | 
284 | # Examples
285 | 
286 | ## :point_right: Binary Classification Example
287 | 
288 | There is a simple interface available with `fit` and `predict` methods.
289 | 
290 | ```python
291 | import pandas as pd
292 | from sklearn.model_selection import train_test_split
293 | from supervised.automl import AutoML
294 | 
295 | df = pd.read_csv(
296 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
297 |     skipinitialspace=True,
298 | )
299 | X_train, X_test, y_train, y_test = train_test_split(
300 |     df[df.columns[:-1]], df["income"], test_size=0.25
301 | )
302 | 
303 | automl = AutoML()
304 | automl.fit(X_train, y_train)
305 | 
306 | predictions = automl.predict(X_test)
307 | ```
308 | 
309 | AutoML `fit` will print:
310 | ```py
311 | Create directory AutoML_1
312 | AutoML task to be solved: binary_classification
313 | AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
314 | AutoML will optimize for metric: logloss
315 | 1_Baseline final logloss 0.5519845471086654 time 0.08 seconds
316 | 2_DecisionTree final logloss 0.3655910192804364 time 10.28 seconds
317 | 3_Linear final logloss 0.38139916864708445 time 3.19 seconds
318 | 4_Default_RandomForest final logloss 0.2975204390214936 time 79.19 seconds
319 | 5_Default_Xgboost final logloss 0.2731086827200411 time 5.17 seconds
320 | 6_Default_NeuralNetwork final logloss 0.319812276905242 time 21.19 seconds
321 | Ensemble final logloss 0.2731086821194617 time 1.43 seconds
322 | ```
323 | 
324 | - the AutoML results in [Markdown report](https://github.com/mljar/mljar-examples/tree/master/Income_classification/AutoML_1#automl-leaderboard)
325 | - the Xgboost [Markdown report](https://github.com/mljar/mljar-examples/blob/master/Income_classification/AutoML_1/5_Default_Xgboost/README.md), please take a look at amazing dependence plots produced by SHAP package :sparkling_heart:
326 | - the Decision Tree [Markdown report](https://github.com/mljar/mljar-examples/blob/master/Income_classification/AutoML_1/2_DecisionTree/README.md), please take a look at beautiful tree visualization :sparkles:
327 | - the Logistic Regression [Markdown report](https://github.com/mljar/mljar-examples/blob/master/Income_classification/AutoML_1/3_Linear/README.md), please take a look at coefficients table, and you can compare the SHAP plots between (Xgboost, Decision Tree and Logistic Regression) :coffee:
328 | 
329 | 
330 | ## :point_right: Multi-Class Classification Example
331 | 
332 | The example code for classification of the optical recognition of handwritten digits dataset. Running this code in less than 30 minutes will result in test accuracy ~98%.
333 | 
334 | ```python
335 | import pandas as pd 
 336 | # scikit-learn utilities
337 | from sklearn.datasets import load_digits
338 | from sklearn.metrics import accuracy_score
339 | from sklearn.model_selection import train_test_split
340 | # mljar-supervised package
341 | from supervised.automl import AutoML
342 | 
343 | # load the data
344 | digits = load_digits()
345 | X_train, X_test, y_train, y_test = train_test_split(
346 |     pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25,
347 |     random_state=123
348 | )
349 | 
350 | # train models with AutoML
351 | automl = AutoML(mode="Perform")
352 | automl.fit(X_train, y_train)
353 | 
354 | # compute the accuracy on test data
355 | predictions = automl.predict_all(X_test)
356 | print(predictions.head())
357 | print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int)))
358 | ```
359 | 
360 | ## :point_right: Regression Example
361 | 
362 | Regression example on `California Housing` house prices data.
363 | 
364 | ```python
365 | import numpy as np
366 | import pandas as pd
367 | from sklearn.datasets import fetch_california_housing
368 | from sklearn.model_selection import train_test_split
369 | from sklearn.metrics import mean_squared_error
370 | from supervised.automl import AutoML # mljar-supervised
371 | 
372 | # Load the data
373 | housing = fetch_california_housing()
374 | X_train, X_test, y_train, y_test = train_test_split(
375 |     pd.DataFrame(housing.data, columns=housing.feature_names),
376 |     housing.target,
377 |     test_size=0.25,
378 |     random_state=123,
379 | )
380 | 
381 | # train models with AutoML
382 | automl = AutoML(mode="Explain")
383 | automl.fit(X_train, y_train)
384 | 
385 | # compute the MSE on test data
386 | predictions = automl.predict(X_test)
387 | print("Test MSE:", mean_squared_error(y_test, predictions))
388 | ```
389 | 
390 | ## :point_right: More Examples
391 | 
392 | - [**Income classification**](https://github.com/mljar/mljar-examples/tree/master/Income_classification) - it is a binary classification task on census data
393 | - [**Iris classification**](https://github.com/mljar/mljar-examples/tree/master/Iris_classification) - it is a multiclass classification on Iris flowers data
394 | - [**House price regression**](https://github.com/mljar/mljar-examples/tree/master/House_price_regression) - it is a regression task on Boston houses data
395 | 
396 | # FAQ
397 | 
398 | <details><summary>What method is used for hyperparameters optimization?</summary>
399 |   - For modes: `Explain`, `Perform`, and `Compete`, a random search method combined with hill climbing is used. In this approach, all checked models are saved and used for building the Ensemble.
400 |   - For mode: `Optuna` the Optuna framework is used. It uses the TPE sampler for tuning. Models checked during the Optuna hyperparameters search are not saved; only the best model is saved (the final model from tuning). You can check the details about checked hyperparameters from Optuna by checking study files in the `optuna` directory in your AutoML `results_path`.
401 | </details>
402 | 
403 | <details><summary>How to save and load AutoML?</summary>
404 | 
405 | Saving and loading of AutoML models is automatic. All models created during AutoML training are saved in the directory set in `results_path` (argument of the `AutoML()` constructor). If no `results_path` is set, the directory is created based on the following naming convention: `AutoML_{number}`, where `number` is a number from 1 to 1000 (depending on which directory name is free).
406 | 
407 | Example save and load:
408 | 
409 | ```python
410 | automl = AutoML(results_path='AutoML_1')
411 | automl.fit(X, y)
412 | ```
413 | 
414 | All models from AutoML are saved in the `AutoML_1` directory.
415 | 
416 | To load models:
417 | 
418 | ```python
419 | automl = AutoML(results_path='AutoML_1')
420 | automl.predict(X)
421 | ```
422 | 
423 | </details>
424 | 
425 | <details><summary>How to set ML task (select between classification or regression)?</summary>
426 | 
427 | The MLJAR AutoML can work with:
428 | - binary classification
429 | - multi-class classification
430 | - regression
431 | 
432 | The ML task detection is automatic based on target values. If you want to manually force AutoML to use a specific ML task, set the `ml_task` parameter. It can be set to `'binary_classification'`, `'multiclass_classification'`, or `'regression'`.
433 | 
434 | Example:
435 | ```python
436 | automl = AutoML(ml_task='regression')
437 | automl.fit(X, y)
438 | ```
439 | In the above example the regression model will be fitted.
440 | 
441 | </details>
442 | 
443 | <details><summary>How to reuse Optuna hyperparameters?</summary>
444 |   
445 |   You can reuse Optuna hyperparameters that were found in another AutoML training. You need to pass them in the `optuna_init_params` argument. All hyperparameters found during Optuna tuning are saved in the `optuna/optuna.json` file (inside the `results_path` directory).
446 |   
447 |  Example:
448 |  
449 |  ```python
450 |  optuna_init = json.loads(open('previous_AutoML_training/optuna/optuna.json').read())
451 |  
452 |  automl = AutoML(
453 |      mode='Optuna',
454 |      optuna_init_params=optuna_init
455 |  )
456 |  automl.fit(X, y)
457 |  ```
458 |   
459 |  When reusing Optuna hyperparameters the Optuna tuning is simply skipped. The model will be trained with hyperparameters set in `optuna_init_params`. Right now there is no option to continue Optuna tuning with seed parameters.
460 |   
461 |   
462 | </details>
463 | 
464 | 
465 | <details><summary>How to know the order of classes for binary or multiclass problem when using predict_proba?</summary>
466 | 
467 | To get predicted probabilities with information about class labels, please use the `predict_all()` method. It returns a pandas DataFrame with class names in the columns. The order of predicted columns is the same in the `predict_proba()` and `predict_all()` methods. The `predict_all()` method will additionally have a column with the predicted class label.
468 | 
469 | </details>
470 | 
471 | # Documentation  
472 | 
473 | For details please check [mljar-supervised docs](https://supervised.mljar.com).
474 | 
475 | # Installation  
476 | 
477 | From PyPi repository:
478 | 
479 | ```
480 | pip install mljar-supervised
481 | ```
482 | 
483 | To install this package with conda run:
484 | ```
485 | conda install -c conda-forge mljar-supervised
486 | ```
487 | 
488 | From source code:
489 | 
490 | ```
491 | git clone https://github.com/mljar/mljar-supervised.git
492 | cd mljar-supervised
493 | python setup.py install
494 | ```
495 | 
496 | Installation for development
497 | ```
498 | git clone https://github.com/mljar/mljar-supervised.git
499 | virtualenv venv --python=python3.6
500 | source venv/bin/activate
501 | pip install -r requirements.txt
502 | pip install -r requirements_dev.txt
503 | ```
504 | 
505 | Running in the docker:
506 | ```
507 | FROM python:3.7-slim-buster
508 | RUN apt-get update && apt-get -y update
509 | RUN apt-get install -y build-essential python3-pip python3-dev
510 | RUN pip3 -q install pip --upgrade
511 | RUN pip3 install mljar-supervised jupyter
512 | CMD ["jupyter", "notebook", "--port=8888", "--no-browser", "--ip=0.0.0.0", "--allow-root"]
513 | ```
514 | 
515 | Install from GitHub with pip:
516 | ```
517 | pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master
518 | ```
519 | # Demo
520 | 
521 | In the below demo GIF you will see:
522 | - MLJAR AutoML trained in Jupyter Notebook on the Titanic dataset
523 | - overview of created files
524 | - a showcase of selected plots created during AutoML training
525 | - algorithm comparison report along with their plots
526 | - example of README file and CSV file with results
527 | 
528 | ![](https://github.com/mljar/mljar-examples/raw/master/media/mljar_files.gif)
529 | 
530 | # Contributing
531 | 
532 | To get started take a look at our [Contribution Guide](https://supervised.mljar.com/contributing/) for information about our process and where you can fit in!
533 | 
534 | ### Contributors
535 | <a href="https://github.com/mljar/mljar-supervised/graphs/contributors">
536 |   <img src="https://contributors-img.web.app/image?repo=mljar/mljar-supervised" />
537 | </a>
538 | 
539 | # Cite
540 | 
541 | Would you like to cite MLJAR? Great! :)
542 | 
543 | You can cite MLJAR as follows:
544 | 
545 | ```
546 | @misc{mljar,
547 |   author    = {Aleksandra P\l{}o\'{n}ska and Piotr P\l{}o\'{n}ski},
548 |   year      = {2021},
549 |   publisher = {MLJAR},
550 |   address   = {\L{}apy, Poland},
551 |   title     = {MLJAR: State-of-the-art Automated Machine Learning Framework for Tabular Data.  Version 0.10.3},
552 |   url       = {https://github.com/mljar/mljar-supervised}
553 | }
554 | ```
555 | 
556 | We would love to hear from you about how you have used MLJAR AutoML in your project. 
557 | Please feel free to let us know at 
558 | ![image](https://user-images.githubusercontent.com/6959032/118103228-f5ea9a00-b3d9-11eb-87ed-8cfb1f873f91.png)
559 | 
560 | 
561 | # License  
562 | 
563 | The `mljar-supervised` is provided with [MIT license](https://github.com/mljar/mljar-supervised/blob/master/LICENSE).
564 | 
565 | # Commercial support
566 | 
567 | Looking for commercial support? Do you need a new feature implemented? Please contact us by [email](https://mljar.com/contact/) for details.
568 | 
569 | # MLJAR 
570 | <p align="center">
571 |   <img src="https://github.com/mljar/mljar-examples/blob/master/media/large_logo.png" width="314" />
572 | </p>
573 | 
574 | The `mljar-supervised` is an open-source project created by [MLJAR](https://mljar.com). We care about ease of use in Machine Learning. 
575 | The [mljar.com](https://mljar.com) provides a beautiful and simple user interface for building machine learning models.
576 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/supervised/fairness/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/supervised/preprocessing/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/supervised/validation/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/checks/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_callbacks/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_ensemble/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_fairness/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_tuner/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/tests/tests_validation/__init__.py:
--------------------------------------------------------------------------------

```python
1 | 
```

--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------

```
1 | [pytest]
2 | addopts = -p no:warnings
```

--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------

```
1 | pytest
2 | black
3 | pytest-cov
4 | coveralls
```

--------------------------------------------------------------------------------
/supervised/__init__.py:
--------------------------------------------------------------------------------

```python
1 | __version__ = "1.1.18"
2 | 
3 | from supervised.automl import AutoML
4 | 
```

--------------------------------------------------------------------------------
/tests/checks/run_performance_tests.py:
--------------------------------------------------------------------------------

```python
1 | import unittest
2 | 
3 | from tests.tests_bin_class.test_performance import *
4 | 
5 | if __name__ == "__main__":
6 |     unittest.main()
7 | 
```

--------------------------------------------------------------------------------
/tests/checks/run_ml_tests.py:
--------------------------------------------------------------------------------

```python
1 | import unittest
2 | 
3 | from tests.tests_bin_class.run import *
4 | from tests.tests_multi_class.run import *
5 | 
6 | if __name__ == "__main__":
7 |     unittest.main()
8 | 
```

--------------------------------------------------------------------------------
/supervised/utils/constants.py:
--------------------------------------------------------------------------------

```python
1 | # tasks that can be handled by the package
2 | BINARY_CLASSIFICATION = "binary_classification"
3 | MULTICLASS_CLASSIFICATION = "multiclass_classification"
4 | REGRESSION = "regression"
5 | 
```

--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------

```python
 1 | from pathlib import Path
 2 | 
 3 | import pytest
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def data_folder(request) -> Path:
 8 |     folder_path = Path(__file__).parent / 'data'
 9 |     assert folder_path.exists()
10 |     request.cls.data_folder = folder_path
11 |     return folder_path
12 | 
```

--------------------------------------------------------------------------------
/supervised/utils/__init__.py:
--------------------------------------------------------------------------------

```python
 1 | import json
 2 | 
 3 | from supervised.utils.jsonencoder import MLJSONEncoder
 4 | 
 5 | 
 6 | def json_loads(data, *args, **kwargs):
 7 |     return json.loads(data, *args, **kwargs)
 8 | 
 9 | 
10 | def json_dumps(data, *args, **kwargs):
11 |     return json.dumps(data, cls=MLJSONEncoder, *args, **kwargs)
12 | 
```

--------------------------------------------------------------------------------
/supervised/validation/validator_base.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | 
 6 | class BaseValidator(object):
 7 |     def __init__(self, params):
 8 |         self.params = params
 9 | 
10 |     def split(self):
11 |         pass
12 | 
13 |     def get_n_splits(self):
14 |         pass
15 | 
16 |     def get_repeats(self):
17 |         return 1
18 | 
```

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------

```
 1 | numpy>=1.19.5,<2
 2 | pandas>=2.0.0
 3 | scipy>=1.6.1
 4 | scikit-learn>=1.5.0
 5 | xgboost>=2.0.0
 6 | lightgbm>=3.0.0
 7 | catboost>=0.24.4
 8 | joblib>=1.0.1
 9 | tabulate>=0.8.7
10 | matplotlib>=3.2.2
11 | dtreeviz>=2.2.2
12 | shap>=0.42.1
13 | seaborn>=0.11.1
14 | optuna-integration>=3.6.0
15 | mljar-scikit-plot>=0.3.11
16 | markdown
17 | typing-extensions
18 | ipython
19 | 
```

--------------------------------------------------------------------------------
/examples/scripts/regression.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | df = pd.read_csv("./tests/data/housing_regression_missing_values_missing_target.csv")
 6 | x_cols = [c for c in df.columns if c != "MEDV"]
 7 | X = df[x_cols]
 8 | y = df["MEDV"]
 9 | 
10 | automl = AutoML()
11 | automl.fit(X, y)
12 | 
13 | df["predictions"] = automl.predict(X)
14 | print("Predictions")
15 | print(df[["MEDV", "predictions"]].head())
16 | 
```

--------------------------------------------------------------------------------
/supervised/utils/subsample.py:
--------------------------------------------------------------------------------

```python
 1 | from sklearn.model_selection import train_test_split
 2 | 
 3 | from supervised.algorithms.registry import REGRESSION
 4 | 
 5 | 
 6 | def subsample(X, y, ml_task, train_size):
 7 |     shuffle = True
 8 |     stratify = None
 9 | 
10 |     if ml_task != REGRESSION:
11 |         stratify = y
12 | 
13 |     X_train, X_test, y_train, y_test = train_test_split(
14 |         X, y, train_size=train_size, shuffle=shuffle, stratify=stratify
15 |     )
16 | 
17 |     return X_train, X_test, y_train, y_test
18 | 
```

--------------------------------------------------------------------------------
/examples/scripts/regression_law_school_fairness.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | df = pd.read_csv("tests/data/LawSchool/bar_pass_prediction.csv")
 6 | df["race1"][df["race1"] != "white"] = "non-white"  # keep it as binary feature
 7 | 
 8 | X = df[["gender", "lsat", "race1", "pass_bar"]]
 9 | y = df["gpa"]
10 | 
11 | sensitive_features = df["race1"]
12 | 
13 | automl = AutoML(
14 |     algorithms=["Xgboost", "LightGBM", "Extra Trees"],
15 |     train_ensemble=True,
16 |     fairness_threshold=0.9,
17 | )
18 | automl.fit(X, y, sensitive_features=sensitive_features)
19 | 
```

--------------------------------------------------------------------------------
/supervised/utils/config.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | LOG_LEVEL = logging.ERROR
 4 | 
 5 | # from guppy import hpy
 6 | # from pympler import summary
 7 | # from pympler import muppy
 8 | import time
 9 | 
10 | import numpy as np
11 | 
12 | 
13 | def mem(msg=""):
14 |     """Memory usage in MB"""
15 | 
16 |     time.sleep(5)
17 | 
18 |     with open("/proc/self/status") as f:
19 |         memusage = f.read().split("VmRSS:")[1].split("\n")[0][:-3]
20 | 
21 |     print(msg, "- memory:", np.round(float(memusage.strip()) / 1024.0), "MB")
22 | 
23 |     # all_objects = muppy.get_objects()
24 |     # sum1 = summary.summarize(all_objects)
25 |     # summary.print_(sum1)
26 | 
```

--------------------------------------------------------------------------------
/supervised/exceptions.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | from supervised.utils.config import LOG_LEVEL
 4 | 
 5 | logging.basicConfig(
 6 |     format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR
 7 | )
 8 | logger = logging.getLogger(__name__)
 9 | logger.setLevel(LOG_LEVEL)
10 | 
11 | 
12 | class AutoMLException(Exception):
13 |     def __init__(self, message):
14 |         super(AutoMLException, self).__init__(message)
15 |         logger.error(message)
16 | 
17 | 
18 | class NotTrainedException(Exception):
19 |     def __init__(self, message):
20 |         super(NotTrainedException, self).__init__(message)
21 |         logger.debug(message)
22 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/random_parameters.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | 
 3 | 
 4 | class RandomParameters:
 5 | 
 6 |     """
 7 |     Example params are in JSON format:
 8 |     {
 9 |         "booster": ["gbtree", "gblinear"],
10 |         "objective": ["binary:logistic"],
11 |         "eval_metric": ["auc", "logloss"],
12 |         "eta": [0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
13 |     }
14 |     """
15 | 
16 |     @staticmethod
17 |     def get(params, seed=1):
18 |         np.random.seed(seed)
19 |         generated_params = {"seed": seed}
20 |         for k in params:
21 |             generated_params[k] = np.random.permutation(params[k])[0].item()
22 |         return generated_params
23 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/max_iters_constraint.py:
--------------------------------------------------------------------------------

```python
 1 | from supervised.callbacks.callback import Callback
 2 | 
 3 | 
 4 | class MaxItersConstraint(Callback):
 5 |     def __init__(self, params):
 6 |         super(MaxItersConstraint, self).__init__(params)
 7 |         self.name = params.get("name", "max_iters_constraint")
 8 |         self.max_iters = params.get("max_iters", 10)
 9 | 
10 |     def add_and_set_learner(self, learner):
11 |         self.learner = learner
12 | 
13 |     def on_iteration_end(self, logs, predictions):
14 |         # iters are computed starting from 0
15 |         if logs.get("iter_cnt") + 1 >= self.max_iters:
16 |             self.learner.stop_training = True
17 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/test_registry.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | from supervised.algorithms.registry import AlgorithmsRegistry
 4 | 
 5 | 
 6 | class AlgorithmsRegistryTest(unittest.TestCase):
 7 |     def test_add_to_registry(self):
 8 |         class Model1:
 9 |             algorithm_short_name = ""
10 | 
11 |         model1 = {
12 |             "task_name": "binary_classification",
13 |             "model_class": Model1,
14 |             "model_params": {},
15 |             "required_preprocessing": {},
16 |             "additional": {},
17 |             "default_params": {},
18 |         }
19 |         AlgorithmsRegistry.add(**model1)
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     unittest.main()
24 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/test_factory.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | from supervised.algorithms.factory import AlgorithmFactory
 4 | from supervised.algorithms.xgboost import XgbAlgorithm
 5 | 
 6 | 
 7 | class AlgorithmFactoryTest(unittest.TestCase):
 8 |     def test_fit(self):
 9 |         params = {
10 |             "learner_type": "Xgboost",
11 |             "objective": "binary:logistic",
12 |             "eval_metric": "logloss",
13 |         }
14 |         learner = AlgorithmFactory.get_algorithm(params)
15 |         self.assertEqual(
16 |             learner.algorithm_short_name, XgbAlgorithm.algorithm_short_name
17 |         )
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     unittest.main()
22 | 
```

--------------------------------------------------------------------------------
/supervised/utils/utils.py:
--------------------------------------------------------------------------------

```python
 1 | import copy
 2 | 
 3 | 
 4 | class Store:
 5 |     data = {}
 6 | 
 7 |     def set(self, key, value):
 8 |         Store.data[key] = value
 9 | 
10 |     def get(self, key):
11 |         return copy.deepcopy(Store.data[key])
12 | 
13 | 
14 | def dump_data(file_path, df):
15 |     store = Store()
16 |     store.set(file_path, df)
17 |     # try:
18 |     #    df.to_parquet(file_path, index=False)
19 |     # except Exception as e:
20 |     #    df.to_csv(file_path, index=False)
21 | 
22 | 
23 | def load_data(file_path):
24 |     store = Store()
25 |     return store.get(file_path)
26 |     # try:
27 |     #    return pd.read_parquet(file_path)
28 |     # except Exception as e:
29 |     #    return pd.read_csv(file_path)
30 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/callback.py:
--------------------------------------------------------------------------------

```python
 1 | class Callback(object):
 2 |     def __init__(self, params):
 3 |         self.params = params
 4 |         self.learners = []
 5 |         self.learner = None  # current learner
 6 |         self.name = "callback"
 7 | 
 8 |     def add_and_set_learner(self, learner):
 9 |         self.learners += [learner]
10 |         self.learner = learner
11 | 
12 |     def on_learner_train_start(self, logs):
13 |         pass
14 | 
15 |     def on_learner_train_end(self, logs):
16 |         pass
17 | 
18 |     def on_iteration_start(self, logs):
19 |         pass
20 | 
21 |     def on_iteration_end(self, logs, predictions):
22 |         pass
23 | 
24 |     def on_framework_train_end(self, logs):
25 |         pass
26 | 
```

--------------------------------------------------------------------------------
/tests/tests_tuner/test_tuner.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | from supervised.tuner.mljar_tuner import MljarTuner
 4 | 
 5 | 
 6 | class TunerTest(unittest.TestCase):
 7 |     def test_key_params(self):
 8 |         params1 = {
 9 |             "preprocessing": {"p1": 1, "p2": 2},
10 |             "learner": {"p1": 1, "p2": 2},
11 |             "validation_strategy": {},
12 |         }
13 |         params2 = {
14 |             "preprocessing": {"p1": 1, "p2": 2},
15 |             "learner": {"p2": 2, "p1": 1},
16 |             "validation_strategy": {},
17 |         }
18 |         key1 = MljarTuner.get_params_key(params1)
19 |         key2 = MljarTuner.get_params_key(params2)
20 |         self.assertEqual(key1, key2)
21 | 
```

--------------------------------------------------------------------------------
/examples/scripts/multi_class_classifier.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | import numpy as np
 3 | from supervised.automl import AutoML
 4 | import supervised
 5 | 
 6 | 
 7 | import warnings
 8 | 
 9 | from sklearn import datasets
10 | from sklearn.pipeline import make_pipeline
11 | from sklearn.decomposition import PCA
12 | 
13 | from supervised import AutoML
14 | from supervised.exceptions import AutoMLException
15 | 
16 | df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
17 | X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
18 | y = df["class"]
19 | 
20 | automl = AutoML()
21 | 
22 | automl.fit(X, y)
23 | 
24 | predictions = automl.predict_all(X)
25 | 
26 | print(predictions.head())
27 | print(predictions.tail())
28 | 
29 | print(X.shape)
30 | print(predictions.shape)
31 | 
```

--------------------------------------------------------------------------------
/examples/scripts/regression_crime_fairness.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | # data source http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized
 6 | 
 7 | df = pd.read_csv("tests/data/CrimeData/crimedata.csv", na_values=["?"])
 8 | 
 9 | X = df[df.columns[5:129]]
10 | y = df["ViolentCrimesPerPop"]
11 | 
12 | sensitive_features = (df["racePctWhite"] > 84).astype(str)
13 | 
14 | automl = AutoML(
15 |     #algorithms=["Decision Tree", "Neural Network", "Xgboost", "Linear", "CatBoost"],
16 |     algorithms=["Xgboost", "Linear", "CatBoost"],
17 |     train_ensemble=True,
18 |     fairness_threshold=0.5,
19 | )
20 | automl.fit(X, y, sensitive_features=sensitive_features)
21 | 
```

--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_Titanic.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | import numpy as np
 3 | from sklearn.metrics import accuracy_score
 4 | from supervised import AutoML
 5 | 
 6 | train = pd.read_csv(
 7 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv"
 8 | )
 9 | print(train.head())
10 | 
11 | X = train[train.columns[2:]]
12 | y = train["Survived"]
13 | 
14 | automl = AutoML()  # default mode is Explain
15 | 
16 | automl.fit(X, y)
17 | 
18 | test = pd.read_csv(
19 |     "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv"
20 | )
21 | predictions = automl.predict(test)
22 | print(predictions)
23 | print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%")
24 | 
```

--------------------------------------------------------------------------------
/examples/scripts/regression_housing_fairness.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | df = pd.read_csv("./tests/data/boston_housing.csv")
 6 | x_cols = [c for c in df.columns if c != "MEDV"]
 7 | 
 8 | df["large_B"] = (df["B"] > 380) * 1
 9 | df["large_B"] = df["large_B"].astype(str)
10 | 
11 | 
12 | print(df["large_B"].dtype.name)
13 | sensitive_features = df["large_B"]
14 | 
15 | X = df[x_cols]
16 | y = df["MEDV"]
17 | 
18 | automl = AutoML(
19 |     algorithms=["Xgboost", "LightGBM"],
20 |     train_ensemble=True,
21 |     fairness_threshold=0.9,
22 | )
23 | automl.fit(X, y, sensitive_features=sensitive_features)
24 | 
25 | df["predictions"] = automl.predict(X)
26 | print("Predictions")
27 | print(df[["MEDV", "predictions"]].head())
28 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_encoding_selector.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from supervised.preprocessing.encoding_selector import EncodingSelector
 6 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
 7 | 
 8 | 
 9 | class CategoricalIntegersTest(unittest.TestCase):
10 |     def test_selector(self):
11 |         d = {"col1": [f"{i}" for i in range(31)], "col2": ["a"] * 31}
12 |         df = pd.DataFrame(data=d)
13 | 
14 |         self.assertEqual(
15 |             EncodingSelector.get(df, None, "col1"),
16 |             PreprocessingCategorical.MANY_CATEGORIES,
17 |         )
18 |         self.assertEqual(
19 |             EncodingSelector.get(df, None, "col2"),
20 |             PreprocessingCategorical.FEW_CATEGORIES,
21 |         )
22 | 
```

--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_marketing.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | from supervised.automl import AutoML
 3 | import os
 4 | 
 5 | from sklearn.metrics import accuracy_score
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | df = pd.read_csv("tests/data/PortugeseBankMarketing/Data_FinalProject.csv")
 9 | 
10 | X = df[df.columns[:-1]]
11 | y = df["y"]
12 | 
13 | 
14 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)
15 | 
16 | 
17 | automl = AutoML(
18 |     # results_path="AutoML_22",
19 |     total_time_limit=30 * 60,
20 |     start_random_models=10,
21 |     hill_climbing_steps=3,
22 |     top_models_to_improve=3,
23 |     train_ensemble=True,
24 | )
25 | 
26 | automl.fit(X_train, y_train)
27 | 
28 | 
29 | pred = automl.predict(X_test)
30 | print("Test accuracy", accuracy_score(y_test, pred))
31 | 
```

--------------------------------------------------------------------------------
/examples/scripts/regression_acs_fairness.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | 
 5 | # to get data
 6 | # from fairlearn.datasets import fetch_acs_income
 7 | # df = fetch_acs_income(as_frame=True)
 8 | # df["frame"].to_csv("acs_income.csv", index=False)
 9 | 
df = pd.read_csv("tests/data/acs_income_1k.csv")
print(df)

# PINCP is the regression target; every other column is a feature.
x_cols = [name for name in df.columns if name != "PINCP"]
X = df[x_cols]
y = df["PINCP"]

# The sensitive feature is passed to AutoML as strings.
sensitive_features = df["SEX"].astype(str)

automl_settings = dict(
    algorithms=["Xgboost", "LightGBM"],
    train_ensemble=True,
    fairness_threshold=0.91,
    # underprivileged_groups=[{"SEX": "1.0"}],
    # privileged_groups=[{"SEX": "2.0"}]
)
automl = AutoML(**automl_settings)
automl.fit(X, y, sensitive_features=sensitive_features)
29 | 
```

--------------------------------------------------------------------------------
/examples/scripts/multi_class_classifier_digits.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | 
 3 | # scikit learn utilites
 4 | from sklearn.datasets import load_digits
 5 | from sklearn.metrics import accuracy_score
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | # mljar-supervised package
 9 | from supervised.automl import AutoML
10 | 
# Load the digits data set and keep a stratified 25% of it for testing
digits = load_digits()
features = pd.DataFrame(digits.data)
X_train, X_test, y_train, y_test = train_test_split(
    features, digits.target, stratify=digits.target, test_size=0.25
)

# Train models in "Perform" mode
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# Score the predicted labels against the held-out targets
predictions = automl.predict(X_test)
print(predictions.head())
predicted_labels = predictions["label"].astype(int)
print("Test accuracy:", accuracy_score(y_test, predicted_labels))
25 | 
```

--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_random.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | from sklearn.metrics import accuracy_score
 5 | import os
 6 | 
 7 | nrows = 100
 8 | ncols = 3
 9 | X = np.random.rand(nrows, ncols)
10 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(ncols)])
11 | y = np.random.randint(0, 2, nrows)
12 | # y = np.random.permutation(["a", "B"] * 50)
13 | 
14 | automl = AutoML(model_time_limit=10)  # , algorithms=["Decision Tree"])
15 | automl.fit(X, y)
16 | print("Train accuracy", accuracy_score(y, automl.predict_all(X)["label"]))
17 | 
18 | # X = np.random.rand(1000, 10)
19 | # X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
20 | # y = np.random.randint(0, 2, 1000)
21 | # print("Test accuracy", accuracy_score(y, automl.predict(X)["label"]))
22 | 
```

--------------------------------------------------------------------------------
/supervised/fairness/utils.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | 
 3 | 
 4 | def accuracy(t, y):
 5 |     return np.round(np.sum(t == y) / t.shape[0], 4)
 6 | 
 7 | 
 8 | def selection_rate(y):
 9 |     return np.round(
10 |         np.sum((y == 1)) / y.shape[0],
11 |         4,
12 |     )
13 | 
14 | 
def true_positive_rate(t, y):
    """Among entries where ``t == 1``, return the fraction with ``y == 1`` (4 decimals).

    Presumably ``t`` holds the true labels and ``y`` the predictions.
    """
    positives = t == 1
    hits = np.sum((y == 1) & positives)
    return np.round(hits / np.sum(positives), 4)
20 | 
21 | 
def false_positive_rate(t, y):
    """Among entries where ``t == 0``, return the fraction with ``y == 1`` (4 decimals).

    Presumably ``t`` holds the true labels and ``y`` the predictions.
    """
    negatives = t == 0
    false_alarms = np.sum((y == 1) & negatives)
    return np.round(false_alarms / np.sum(negatives), 4)
27 | 
28 | 
def true_negative_rate(t, y):
    """Among entries where ``t == 0``, return the fraction with ``y == 0`` (4 decimals).

    Presumably ``t`` holds the true labels and ``y`` the predictions.
    """
    negatives = t == 0
    correct_rejections = np.sum((y == 0) & negatives)
    return np.round(correct_rejections / np.sum(negatives), 4)
34 | 
35 | 
def false_negative_rate(t, y):
    """Among entries where ``t == 1``, return the fraction with ``y == 0`` (4 decimals).

    Presumably ``t`` holds the true labels and ``y`` the predictions.
    """
    positives = t == 1
    misses = np.sum((y == 0) & positives)
    return np.round(misses / np.sum(positives), 4)
41 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/test_learning_curves.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import unittest
 3 | 
 4 | from supervised.utils.learning_curves import LearningCurves
 5 | 
 6 | 
 7 | class LearningCurvesTest(unittest.TestCase):
 8 |     def test_plot_close(self):
 9 |         """
10 |         Test if we close plots. To avoid following warning:
11 |         RuntimeWarning: More than 20 figures have been opened.
12 |         Figures created through the pyplot interface (`matplotlib.pyplot.figure`)
13 |         are retained until explicitly closed and may consume too much memory.
14 |         """
15 |         for _ in range(
16 |             1
17 |         ):  # you can increase the range, for tests speed reason I keep it low
18 |             LearningCurves.plot_for_ensemble([3, 2, 1], "random_metrics", ".")
19 | 
20 |         os.remove(LearningCurves.output_file_name)
21 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_update_errors_report.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class AutoMLUpdateErrorsReportTest(unittest.TestCase):
    """Checks that reported model errors end up in ``errors.md``."""

    automl_dir = "automl_testing"

    def tearDown(self):
        # Remove the results directory; it may not exist if the test failed early.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_custom_init(self):
        """Reporting an error creates ``errors.md`` containing the error text."""
        # No training data is needed: the report is written without fitting.
        automl = AutoML(results_path=self.automl_dir)
        automl._update_errors_report("model_1", "bad error")

        errors_filename = os.path.join(self.automl_dir, "errors.md")
        self.assertTrue(os.path.exists(errors_filename))
        with open(errors_filename) as file:
            # assertIn prints both operands when it fails.
            self.assertIn("bad error", file.read())
27 | 
```

--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_adult_fairness.py:
--------------------------------------------------------------------------------

```python
 1 | 
 2 | from sklearn.model_selection import train_test_split
 3 | from sklearn.datasets import fetch_openml
 4 | from supervised.automl import AutoML
 5 | 
 6 | data = fetch_openml(data_id=1590, as_frame=True)
 7 | X = data.data
 8 | # data.target #
 9 | y = data.target # (data.target == ">50K") * 1
10 | sensitive_features = X[["sex"]]
11 | 
12 | X_train, X_test, y_train, y_test, S_train, S_test = train_test_split(
13 |     X, y, sensitive_features, stratify=y, test_size=0.75, random_state=42
14 | )
15 | 
16 | automl = AutoML(
17 |     algorithms=[
18 |         "Xgboost"
19 |     ],
20 |     train_ensemble=False,
21 |     fairness_metric="demographic_parity_ratio",  
22 |     fairness_threshold=0.8,
23 |     privileged_groups = [{"sex": "Male"}],
24 |     underprivileged_groups = [{"sex": "Female"}],
25 | )
26 | 
27 | automl.fit(X_train, y_train, sensitive_features=S_train)
28 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/test_subsample.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.algorithms.registry import REGRESSION
 7 | from supervised.utils.subsample import subsample
 8 | 
 9 | 
class SubsampleTest(unittest.TestCase):
    """Checks the split sizes produced by ``subsample``."""

    def test_subsample_regression_10k(self):
        """Subsampling 10k regression rows with train_size=1000 leaves 9000 test rows."""
        rows = 10000
        cols = 51
        train_size = 1000
        X = np.random.rand(rows, cols)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(cols)])
        y = pd.Series(np.random.rand(rows), name="target")

        X_train, X_test, y_train, y_test = subsample(
            X, y, train_size=train_size, ml_task=REGRESSION
        )

        # assertEqual, not assertTrue: assertTrue(a, b) treats b as the failure
        # message and passes for any non-zero a, so sizes were never compared.
        self.assertEqual(X_train.shape[0], train_size)
        self.assertEqual(X_test.shape[0], rows - train_size)
        self.assertEqual(y_train.shape[0], train_size)
        self.assertEqual(y_test.shape[0], rows - train_size)
26 | 
```

--------------------------------------------------------------------------------
/examples/scripts/tabular_mar_2021.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | from supervised import AutoML
 3 | 
 4 | train = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/train.csv")
 5 | test = pd.read_csv("~/Downloads/tabular-playground-series-mar-2021/test.csv")
 6 | 
 7 | X_train = train.drop(["id", "target"], axis=1)
 8 | y_train = train.target
 9 | X_test = test.drop(["id"], axis=1)
10 | 
11 | automl = AutoML(
12 |     mode="Optuna",
13 |     eval_metric="auc",
14 |     algorithms=["CatBoost"],
15 |     optuna_time_budget=1800,  # tune each algorithm for 30 minutes
16 |     total_time_limit=48
17 |     * 3600,  # total time limit, set large enough to have time to compute all steps
18 |     features_selection=False,
19 | )
20 | automl.fit(X_train, y_train)
21 | 
22 | preds = automl.predict_proba(X_test)
23 | submission = pd.DataFrame({"id": test.id, "target": preds[:, 1]})
24 | submission.to_csv("1_submission.csv", index=False)
25 | 
```

--------------------------------------------------------------------------------
/supervised/utils/jsonencoder.py:
--------------------------------------------------------------------------------

```python
 1 | import json
 2 | from datetime import date
 3 | 
 4 | import numpy as np
 5 | 
 6 | 
 7 | class MLJSONEncoder(json.JSONEncoder):
 8 |     def default(self, o):
 9 |         if isinstance(
10 |             o,
11 |             (
12 |                 np.int_,
13 |                 np.intc,
14 |                 np.intp,
15 |                 np.int8,
16 |                 np.int16,
17 |                 np.int32,
18 |                 np.int64,
19 |                 np.uint8,
20 |                 np.uint16,
21 |                 np.uint32,
22 |                 np.uint64,
23 |             ),
24 |         ):
25 |             return int(o)
26 |         elif isinstance(o, (np.float_, np.float16, np.float32, np.float64)):
27 |             return float(o)
28 |         elif isinstance(o, np.ndarray):
29 |             return o.tolist()
30 |         elif isinstance(obj, date):
31 |             return obj.strftime("%Y-%m-%d")
32 | 
33 |         return super(MLJSONEncoder, self).default(o)
34 | 
```

--------------------------------------------------------------------------------
/examples/scripts/multi_class_classifier_MNIST.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | import numpy as np
 3 | from supervised.automl import AutoML
 4 | 
 5 | 
 6 | from supervised.utils.config import mem
 7 | 
 8 | 
 9 | df = pd.read_csv("tests/data/MNIST/train.csv")
10 | 
11 | X = df[[f for f in df.columns if "pixel" in f]]
12 | y = df["label"]
13 | 
14 | for _ in range(4):
15 |     X = pd.concat([X, X], axis=0)
16 |     y = pd.concat([y, y], axis=0)
17 | 
18 | 
19 | mem()
20 | 
21 | 
22 | automl = AutoML(
23 |     # results_path="AutoML_12",
24 |     total_time_limit=60 * 60,
25 |     start_random_models=5,
26 |     hill_climbing_steps=2,
27 |     top_models_to_improve=3,
28 |     train_ensemble=True,
29 | )
30 | 
31 | mem()
32 | print("Start fit")
33 | automl.fit(X, y)
34 | 
35 | test = pd.read_csv("tests/data/MNIST/test.csv")
36 | predictions = automl.predict(test)
37 | 
38 | print(predictions.head())
39 | print(predictions.tail())
40 | 
41 | sub = pd.DataFrame({"ImageId": 0, "Label": predictions["label"]})
42 | sub["ImageId"] = sub.index + 1
43 | sub.to_csv("sub1.csv", index=False)
44 | 
```

--------------------------------------------------------------------------------
/supervised/preprocessing/encoding_selector.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
 5 | 
 6 | 
 7 | class EncodingSelector:
 8 | 
 9 |     """
10 |     EncodingSelector object decides which method should be used for categorical encoding.
11 | 
12 |     Please keep it fast and simple. Thank you.
13 |     """
14 | 
15 |     @staticmethod
16 |     def get(X, y, column):
17 |         try:
18 |             unique_cnt = len(np.unique(X.loc[~pd.isnull(X[column]), column]))
19 |             if unique_cnt <= 20:
20 |                 return PreprocessingCategorical.FEW_CATEGORIES
21 |         except Exception as e:
22 |             pass
23 | 
24 |         return PreprocessingCategorical.MANY_CATEGORIES
25 |         """
26 |         if unique_cnt <= 2 or unique_cnt > 25:
27 |             return PreprocessingCategorical.CONVERT_INTEGER
28 | 
29 |         return PreprocessingCategorical.CONVERT_ONE_HOT
30 |         """
31 | 
```

--------------------------------------------------------------------------------
/.github/workflows/test-installation-with-pip-on-windows.yml:
--------------------------------------------------------------------------------

```yaml
 1 | name: Test installation with pip on Windows
 2 | 
 3 | on: 
 4 |   schedule:
 5 |     - cron:  '0 8 * * 1'
 6 |   workflow_dispatch:
 7 |   
 8 | jobs:
 9 |   build:
10 |     name: Run (${{ matrix.python-version }}, ${{ matrix.os }})
11 |     runs-on: ${{ matrix.os }}
12 |     strategy:
13 |       fail-fast: false
14 |       matrix:
15 |         os: [windows-latest]
16 |         python-version: ['3.9']
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v4
20 | 
21 |       - name: Set up Python
22 |         uses: actions/setup-python@v5
23 |         with:
24 |           python-version: ${{ matrix.python-version }}
25 | 
26 |       - name: Check Python version
27 |         run: python --version
28 | 
29 |       - name: Upgrade pip
30 |         run: python -m pip install --upgrade pip
31 | 
32 |       - name: Install MLJAR AutoML
33 |         run: pip install mljar-supervised
34 | 
35 |       - name: Try to import
36 |         run: python -c "import supervised; print(supervised.__version__)"
37 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/test_shap.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.utils.shap import PlotSHAP
 7 | 
 8 | 
 9 | class PlotSHAPTest(unittest.TestCase):
10 |     def test_get_sample_data_larger_1k(self):
11 |         """Get sample when data is larger than 1k"""
12 |         X = pd.DataFrame(np.random.uniform(size=(5763, 31)))
13 |         y = pd.Series(np.random.randint(0, 2, size=(5763,)))
14 | 
15 |         X_, y_ = PlotSHAP.get_sample(X, y)
16 | 
17 |         self.assertEqual(X_.shape[0], 1000)
18 |         self.assertEqual(y_.shape[0], 1000)
19 | 
20 |     def test_get_sample_data_smaller_1k(self):
21 |         """Get sample when data is smaller than 1k"""
22 |         SAMPLES = 100
23 |         X = pd.DataFrame(np.random.uniform(size=(SAMPLES, 31)))
24 |         y = pd.Series(np.random.randint(0, 2, size=(SAMPLES,)))
25 | 
26 |         X_, y_ = PlotSHAP.get_sample(X, y)
27 | 
28 |         self.assertEqual(X_.shape[0], SAMPLES)
29 |         self.assertEqual(y_.shape[0], SAMPLES)
30 | 
```

--------------------------------------------------------------------------------
/.github/workflows/test-installation-with-conda.yml:
--------------------------------------------------------------------------------

```yaml
 1 | name: Test installation with conda
 2 | 
 3 | on: 
 4 |   schedule:
 5 |     - cron:  '0 8 * * 1'
 6 |   # run workflow manually
 7 |   workflow_dispatch:
 8 |   
 9 | jobs:
10 |   build:
11 |     name: Run (${{ matrix.python-version }}, ${{ matrix.os }})
12 |     runs-on: ${{ matrix.os }}
13 |     strategy:
14 |       fail-fast: false
15 |       matrix:
16 |         os: [windows-latest] 
17 |         python-version: ['3.9']
18 |     
19 |     steps:
20 |       - uses: conda-incubator/setup-miniconda@v2
21 |         with:
22 |           activate-environment: test
23 |           auto-update-conda: false
24 |           python-version: ${{ matrix.python-version }}
25 |       - name: Activate conda and check versions
26 |         run: |
27 |           conda activate test
28 |           conda --version
29 |           python --version
30 |       - name: Install MLJAR AutoML
31 |         run: conda install -c conda-forge mljar-supervised
32 |       - name: Try to import
33 |         run: python -c "import supervised;print(supervised.__version__)"
34 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/factory.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | from supervised.algorithms.registry import BINARY_CLASSIFICATION, AlgorithmsRegistry
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | from supervised.exceptions import AutoMLException
 8 | 
 9 | 
class AlgorithmFactory(object):
    """Creates and restores algorithm (learner) objects from their parameters."""

    @classmethod
    def get_algorithm(cls, params):
        """Instantiate the algorithm described by ``params``.

        ``params["model_type"]`` defaults to ``"Xgboost"`` and
        ``params["ml_task"]`` to ``BINARY_CLASSIFICATION``. The class is looked
        up in ``AlgorithmsRegistry`` and constructed with ``params``.

        Raises:
            AutoMLException: if the lookup or the construction fails; the
                original error is attached as ``__cause__``.
        """
        alg_type = params.get("model_type", "Xgboost")
        ml_task = params.get("ml_task", BINARY_CLASSIFICATION)

        try:
            Algorithm = AlgorithmsRegistry.get_algorithm_class(ml_task, alg_type)
            return Algorithm(params)
        except Exception as e:
            # Chain explicitly so the traceback shows the underlying failure as the cause.
            raise AutoMLException(f"Cannot get algorithm class. {str(e)}") from e

    @classmethod
    def load(cls, json_desc, learner_path, lazy_load):
        """Rebuild a learner from its JSON description.

        The learner is created from ``json_desc["params"]``, then receives
        ``set_params(json_desc, learner_path)``. Unless ``lazy_load`` is true,
        ``reload()`` is called before returning it.
        """
        learner = AlgorithmFactory.get_algorithm(json_desc.get("params"))
        learner.set_params(json_desc, learner_path)
        if not lazy_load:
            learner.reload()
        return learner
29 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/terminate_on_nan.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | import numpy as np
 6 | 
 7 | from supervised.callbacks.callback import Callback
 8 | 
 9 | 
class TerminateOnNan(Callback):
    """Callback that asks the learner to stop when a loss is NaN or infinite."""

    def __init__(self, learner, params):
        super(TerminateOnNan, self).__init__(learner, params)
        # Local import: Metric is not imported at module level in this file,
        # so referencing it directly raises NameError.
        from supervised.utils.metric import Metric

        self.metric = Metric(params.get("metric_name"))

    def on_iteration_end(self, iter_cnt, data):
        """Compute train/validation loss and flag the learner on invalid values.

        ``data`` is read through ``.get`` for the keys ``y_train_true``,
        ``y_train_predicted``, ``y_validation_true`` and
        ``y_validation_predicted``. The train loss is only computed when train
        predictions are present; otherwise it counts as 0.
        """
        loss_train = 0
        if data.get("y_train_predicted") is not None:
            loss_train = self.metric(
                data.get("y_train_true"), data.get("y_train_predicted")
            )
        loss_validation = self.metric(
            data.get("y_validation_true"), data.get("y_validation_predicted")
        )

        for loss in [loss_train, loss_validation]:
            # isfinite is False for NaN, +inf and -inf.
            if not np.isfinite(loss):
                self.learner.stop_training = True
                log.info("Terminating learning, invalid loss value")
29 | 
```

--------------------------------------------------------------------------------
/examples/scripts/binary_classifier.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | from supervised.automl import AutoML
 4 | from sklearn.model_selection import train_test_split
 5 | import os
 6 | from sklearn.metrics import log_loss
 7 | import warnings
 8 | 
 9 | # warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)
10 | 
DATA_URL = "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv"
df = pd.read_csv(DATA_URL, skipinitialspace=True)

# Every column but the last is a feature; "income" is the target.
feature_columns = df.columns[:-1]
X = df[feature_columns]
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl_settings = dict(
    algorithms=["LightGBM"],
    mode="Compete",
    explain_level=0,
    train_ensemble=True,
    golden_features=False,
    features_selection=False,
    eval_metric="auc",
)
automl = AutoML(**automl_settings)
automl.fit(X_train, y_train)

predictions = automl.predict_all(X_test)

print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)

# Log loss is computed from the "prediction_>50K" column of predict_all's output.
positive_class_scores = predictions["prediction_>50K"]
print("LogLoss", log_loss(y_test, positive_class_scores))
38 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_adjust_validation.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class AutoMLAdjustValidationTest(unittest.TestCase):
    automl_dir = "automl_testing"

    def tearDown(self):
        # Remove the results directory; it may be missing if the test failed early.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_custom_init(self):
        """An Xgboost-only Compete run on 60 random rows must not create "1_DecisionTree"."""
        n_rows = 60
        X = np.random.uniform(size=(n_rows, 2))
        y = np.random.randint(0, 2, size=(n_rows,))

        settings = dict(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            mode="Compete",
            explain_level=0,
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            kmeans_features=False,
            golden_features=False,
            features_selection=False,
            boost_on_errors=False,
        )
        automl = AutoML(**settings)
        automl.fit(X, y)

        decision_tree_dir = os.path.join(self.automl_dir, "1_DecisionTree")
        self.assertFalse(os.path.exists(decision_tree_dir))
39 | 
```

--------------------------------------------------------------------------------
/examples/scripts/multi_class_drug_fairness.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | from supervised import AutoML
 5 | 
 6 | 
 7 | df = pd.read_csv("tests/data/Drug/Drug_Consumption.csv")
 8 | 
 9 | 
10 | X = df[df.columns[1:13]]
11 | 
12 | # convert to 3 classes
13 | df = df.replace(
14 |     {
15 |         "Cannabis": {
16 |             "CL0": "never_used",
17 |             "CL1": "not_in_last_year",
18 |             "CL2": "not_in_last_year",
19 |             "CL3": "used_in_last_year",
20 |             "CL4": "used_in_last_year",
21 |             "CL5": "used_in_last_year",
22 |             "CL6": "used_in_last_year",
23 |         }
24 |     }
25 | )
26 | 
27 | y = df["Cannabis"]
28 | 
29 | # maybe should be 
30 | # The binary sensitive feature is education level (college degree or not).
31 | # like in 
32 | # Fairness guarantee in multi-class classification
33 | sensitive_features = df["Gender"]
34 | 
35 | 
36 | automl = AutoML(
37 |     algorithms=["Xgboost"],
38 |     train_ensemble=True,
39 |     start_random_models=3,
40 |     hill_climbing_steps=3,
41 |     top_models_to_improve=2,
42 |     fairness_threshold=0.8,
43 |     explain_level=1
44 | )
45 | automl.fit(X, y, sensitive_features=sensitive_features)
46 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_datetime_transformer.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from supervised.preprocessing.datetime_transformer import DateTimeTransformer
 6 | 
 7 | 
 8 | class DateTimeTransformerTest(unittest.TestCase):
 9 |     def test_transformer(self):
10 |         d = {
11 |             "col1": [
12 |                 "2020/06/01",
13 |                 "2020/06/02",
14 |                 "2020/06/03",
15 |                 "2021/06/01",
16 |                 "2022/06/01",
17 |             ]
18 |         }
19 |         df = pd.DataFrame(data=d)
20 |         df["col1"] = pd.to_datetime(df["col1"])
21 |         df_org = df.copy()
22 | 
23 |         transf = DateTimeTransformer()
24 |         transf.fit(df, "col1")
25 |         df = transf.transform(df)
26 | 
27 |         self.assertTrue(df.shape[0] == 5)
28 |         self.assertTrue("col1" not in df.columns)
29 |         self.assertTrue("col1_Year" in df.columns)
30 | 
31 |         transf2 = DateTimeTransformer()
32 |         transf2.from_json(transf.to_json())
33 |         df2 = transf2.transform(df_org)
34 |         self.assertTrue("col1" not in df2.columns)
35 |         self.assertTrue("col1_Year" in df2.columns)
36 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_text_transformer.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | from numpy.testing import assert_almost_equal
 5 | 
 6 | from supervised.preprocessing.text_transformer import TextTransformer
 7 | 
 8 | 
 9 | class TextTransformerTest(unittest.TestCase):
10 |     def test_transformer(self):
11 |         d = {
12 |             "col1": [
13 |                 "This is the first document.",
14 |                 "This document is the second document.",
15 |                 "And this is the third one.",
16 |                 None,
17 |                 "Is this the first document?",
18 |             ]
19 |         }
20 |         df = pd.DataFrame(data=d)
21 |         df_org = df.copy()
22 | 
23 |         transf = TextTransformer()
24 |         transf.fit(df, "col1")
25 |         df = transf.transform(df)
26 |         
27 |         self.assertTrue(df.shape[0] == 5)
28 |         self.assertTrue("col1" not in df.columns)
29 | 
30 |         transf2 = TextTransformer()
31 |         transf2.from_json(transf.to_json())
32 |         df2 = transf2.transform(df_org)
33 |         self.assertTrue("col1" not in df2.columns)
34 | 
35 |         assert_almost_equal(df.iloc[0, 0], df2.iloc[0, 0])
36 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/test_importance.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import tempfile
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | import pandas as pd
 7 | from sklearn.tree import DecisionTreeClassifier
 8 | 
 9 | from supervised.utils.importance import PermutationImportance
10 | 
11 | 
class PermutationImportanceTest(unittest.TestCase):
    def test_compute_and_plot(self):
        """compute_and_plot writes "<learner_name>_importance.csv" into the model directory."""
        rows, n_features = 20, 3
        feature_names = [f"f{i}" for i in range(n_features)]
        X = pd.DataFrame(np.random.rand(rows, n_features), columns=feature_names)
        y = np.random.randint(0, 2, rows)

        model = DecisionTreeClassifier(max_depth=1)
        model.fit(X, y)

        with tempfile.TemporaryDirectory() as tmpdir:
            PermutationImportance.compute_and_plot(
                model,
                X_validation=X,
                y_validation=y,
                model_file_path=tmpdir,
                learner_name="learner_test",
                metric_name=None,
                ml_task="binary_classification",
            )
            importance_csv = os.path.join(tmpdir, "learner_test_importance.csv")
            self.assertTrue(os.path.exists(importance_csv))
35 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/callback_list.py:
--------------------------------------------------------------------------------

```python
 1 | class CallbackList(object):
 2 |     def __init__(self, callbacks=[]):
 3 |         self.callbacks = callbacks
 4 | 
 5 |     def add_and_set_learner(self, learner):
 6 |         for cb in self.callbacks:
 7 |             cb.add_and_set_learner(learner)
 8 | 
 9 |     def on_learner_train_start(self, logs=None):
10 |         for cb in self.callbacks:
11 |             cb.on_learner_train_start(logs)
12 | 
13 |     def on_learner_train_end(self, logs=None):
14 |         for cb in self.callbacks:
15 |             cb.on_learner_train_end(logs)
16 | 
17 |     def on_iteration_start(self, logs=None):
18 |         for cb in self.callbacks:
19 |             cb.on_iteration_start(logs)
20 | 
21 |     def on_iteration_end(self, logs=None, predictions=None):
22 |         for cb in self.callbacks:
23 |             cb.on_iteration_end(logs, predictions)
24 | 
25 |     def on_framework_train_end(self, logs=None):
26 |         for cb in self.callbacks:
27 |             cb.on_framework_train_end(logs)
28 | 
29 |     def get(self, callback_name):
30 |         for cb in self.callbacks:
31 |             if cb.name == callback_name:
32 |                 return cb
33 |         return None
34 | 
```

--------------------------------------------------------------------------------
/supervised/utils/common.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | 
 3 | 
 4 | def construct_learner_name(fold, repeat, repeats):
 5 |     repeat_str = f"_repeat_{repeat}" if repeats > 1 else ""
 6 |     return f"learner_fold_{fold}{repeat_str}"
 7 | 
 8 | 
 9 | def learner_name_to_fold_repeat(name):
10 |     fold, repeat = None, None
11 |     arr = name.split("_")
12 |     fold = int(arr[2])
13 |     if "repeat" in name:
14 |         repeat = int(arr[4])
15 |     return fold, repeat
16 | 
17 | 
def get_fold_repeat_cnt(model_path):
    """Count folds and repeats from the ``*_training.log`` files in ``model_path``.

    Returns ``(fold_cnt, repeat_cnt)``: the highest fold and repeat index found
    in the file names, each plus one because indices start at 0. With no
    matching files the result is ``(1, 1)``.
    """
    log_files = [
        entry for entry in os.listdir(model_path) if "_training.log" in entry
    ]

    highest_fold, highest_repeat = 0, 0
    for log_file in log_files:
        fold, repeat = learner_name_to_fold_repeat(log_file)
        if fold is not None and fold > highest_fold:
            highest_fold = fold
        if repeat is not None and repeat > highest_repeat:
            highest_repeat = repeat

    # Indices are 0-based, so the count is the highest index plus one.
    return highest_fold + 1, highest_repeat + 1
32 | 
33 | 
def get_learners_names(model_path):
    """Return the learner names found in ``model_path``.

    Every directory entry containing ``_training.log`` contributes its file
    name with that postfix removed. Order follows ``os.listdir``.
    """
    postfix = "_training.log"
    # str.replace; the misspelled ``repleace`` raised AttributeError on the first match.
    learner_names = [
        f.replace(postfix, "") for f in os.listdir(model_path) if postfix in f
    ]
    return learner_names
40 | 
```

--------------------------------------------------------------------------------
/tests/tests_ensemble/test_save_load.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import pandas as pd
 5 | from sklearn import datasets
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class EnsembleSaveLoadTest(unittest.TestCase):
    """Round-trip test: a fitted AutoML and one reloaded from disk must agree."""

    automl_dir = "EnsembleSaveLoadTest"

    def tearDown(self):
        # Remove the results directory; errors (e.g. missing dir) are ignored.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_save_load(self):
        """Fit with ensembling enabled, reload from ``results_path``, compare predictions."""
        trained = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            explain_level=0,
            mode="Explain",
            train_ensemble=True,
            start_random_models=1,
        )

        dataset_options = dict(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        features, target = datasets.make_classification(**dataset_options)
        column_names = [f"f_{i}" for i in range(features.shape[1])]
        features = pd.DataFrame(features, columns=column_names)

        trained.fit(features, target)
        expected = trained.predict(features)

        # A new instance pointed at the same results directory.
        restored = AutoML(results_path=self.automl_dir)
        actual = restored.predict(features)

        self.assertTrue((expected == actual).all())
46 | 
```

--------------------------------------------------------------------------------
/supervised/validation/validation_step.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | from supervised.exceptions import AutoMLException
 6 | from supervised.validation.validator_custom import CustomValidator
 7 | from supervised.validation.validator_kfold import KFoldValidator
 8 | from supervised.validation.validator_split import SplitValidator
 9 | 
10 | 
class ValidationStep:
    """Thin facade that picks a validator by ``validation_type`` and delegates to it."""

    def __init__(self, params):
        """Create the validator selected by ``params["validation_type"]``.

        Supported values are ``"kfold"`` (the default when the key is
        missing), ``"split"`` and ``"custom"``.

        Raises:
            AutoMLException: if the validation type is none of the above.
        """
        self.validation_type = params.get("validation_type", "kfold")

        known_validators = (
            ("kfold", KFoldValidator),
            ("split", SplitValidator),
            ("custom", CustomValidator),
        )
        for type_name, validator_cls in known_validators:
            if self.validation_type == type_name:
                self.validator = validator_cls(params)
                break
        else:
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

    def get_split(self, k, repeat=0):
        """Delegate to the validator's ``get_split(k, repeat)``."""
        return self.validator.get_split(k, repeat)

    def split(self):
        """Delegate to the validator's ``split()``."""
        return self.validator.split()

    def get_n_splits(self):
        """Delegate to the validator's ``get_n_splits()``."""
        return self.validator.get_n_splits()

    def get_repeats(self):
        """Delegate to the validator's ``get_repeats()``."""
        return self.validator.get_repeats()
38 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_automl_report.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | from pathlib import Path
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | import pytest
 9 | from sklearn import datasets
10 | from sklearn.decomposition import PCA
11 | from sklearn.pipeline import make_pipeline
12 | 
13 | from supervised import AutoML
14 | from supervised.exceptions import AutoMLException
15 | 
16 | iris = datasets.load_iris()
17 | 
class AutoMLReportTest(unittest.TestCase):
    """Checks the HTML report written by ``AutoML.report()``."""

    automl_dir = "AutoMLTest"

    def tearDown(self):
        # Remove the results directory after each test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def setUp(self):
        # Start every test without a leftover results directory.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_report(self):
        """Fit a Baseline model on iris and inspect the generated README.html."""
        automl = AutoML(
            algorithms=["Baseline"],
            explain_level=0,
            verbose=0,
            random_state=1,
            results_path=self.automl_dir,
        )
        automl.fit(iris.data, iris.target)
        automl.report()

        report_path = os.path.join(self.automl_dir, "README.html")
        self.assertTrue(os.path.exists(report_path))

        with open(report_path, "r") as report_file:
            html = report_file.read()

        # The main report is expected NOT to contain a link to the model's own page.
        baseline_link = '<a href="1_Baseline/README.html">'
        self.assertNotIn(baseline_link, html)
47 | 
48 | 
49 | 
50 | 
```

--------------------------------------------------------------------------------
/tests/checks/check_automl_with_regression.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import pandas as pd
 4 | import sklearn.model_selection
 5 | 
 6 | from supervised.automl import AutoML
 7 | 
 8 | 
 9 | class AutoMLWithRegressionTest(unittest.TestCase):
10 |     def test_fit_and_predict(self):
11 |         seed = 1709
12 | 
13 |         df = pd.read_csv(
14 |             "./tests/data/housing_regression_missing_values_missing_target.csv"
15 |         )
16 |         print(df.columns)
17 |         x_cols = [c for c in df.columns if c != "MEDV"]
18 |         X = df[x_cols]
19 |         y = df["MEDV"]
20 | 
21 |         X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
22 |             X, y, test_size=0.3, random_state=seed
23 |         )
24 |         automl = AutoML(
25 |             total_time_limit=10,
26 |             algorithms=["Xgboost"],  # ["LightGBM", "RF", "NN", "CatBoost", "Xgboost"],
27 |             start_random_models=1,
28 |             hill_climbing_steps=0,
29 |             top_models_to_improve=0,
30 |             train_ensemble=True,
31 |             verbose=True,
32 |         )
33 |         automl.fit(X_train, y_train)
34 | 
35 |         response = automl.predict(X_test)  # ["p_1"]
36 |         print("Response", response)
37 | 
38 |         # Compute the logloss on test dataset
39 |         # ll = log_loss(y_test, response)
40 |         # print("(*) Dataset id {} logloss {}".format(dataset_id, ll))
41 | 
42 | 
# Run the test case through unittest when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
45 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_data_types.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class AutoMLDataTypesTest(unittest.TestCase):
    """Smoke tests: AutoML must fit on unusual column and target data types."""

    automl_dir = "automl_tests"
    rows = 250

    def tearDown(self):
        # Remove the results directory after each test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_category_data_type(self):
        """Fit CatBoost when one feature column has pandas ``category`` dtype."""
        column_names = [f"f{i}" for i in range(3)]
        features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
        target = np.random.randint(0, 2, self.rows)

        features["f1"] = features["f1"].astype("category")

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["CatBoost"],
            train_ensemble=False,
            explain_level=0,
            start_random_models=1,
        )
        AutoML(**settings).fit(features, target)

    def test_encoding_strange_characters(self):
        """Fit Baseline when the class labels are non-ASCII strings."""
        column_names = [f"f{i}" for i in range(3)]
        features = pd.DataFrame(np.random.rand(self.rows, 3), columns=column_names)
        labels = ["ɛ", "🂲"] * int(self.rows / 2)
        target = np.random.permutation(labels)

        settings = dict(
            results_path=self.automl_dir,
            total_time_limit=1,
            algorithms=["Baseline"],
            train_ensemble=False,
            explain_level=0,
            start_random_models=1,
        )
        AutoML(**settings).fit(features, target)
48 | 
```

--------------------------------------------------------------------------------
/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------

```yaml
 1 | name: Tests
 2 | 
 3 | on: [ push,pull_request ]
 4 | 
 5 | jobs:
 6 |   build:
 7 | 
 8 |     runs-on: ${{ matrix.os }}
 9 |     strategy:
10 |       matrix:
11 |         os: [ ubuntu-latest ]
12 |         python-version: [ '3.10']
13 |         #os: [ ubuntu-latest, macos-latest, windows-latest ]
14 |         #python-version: [ '3.8', '3.9', '3.10', '3.11' ]
15 | 
16 |     steps:
17 |       - name: Install OS Dependencies
18 |         if: matrix.os == 'ubuntu-latest'
19 |         run: |
20 |           sudo apt-get update
21 |           sudo apt-get -y install graphviz
22 | 
23 |       - name: Install OS Dependencies
24 |         if: matrix.os == 'macos-latest'
25 |         run: |
26 |           brew install graphviz
27 | 
28 |       - name: Install OS Dependencies
29 |         if: matrix.os == 'windows-latest'
30 |         run: |
31 |           choco install graphviz
32 |       - uses: actions/checkout@v2
33 |       - name: Set up Python ${{ matrix.python-version }}
34 |         uses: actions/setup-python@v2
35 |         with:
36 |           python-version: ${{ matrix.python-version }}
37 |       - name: Install Python Dependencies
38 |         run: |
39 |           python -m pip install --upgrade pip
40 |           pip install --upgrade setuptools
41 |           pip install -U importlib-metadata>=1.7.0
42 |           pip install -U -r requirements.txt
43 |           pip install -U -r requirements_dev.txt
44 |           pip install ipython
45 |           python setup.py install
46 |       - name: Test with pytest
47 |         run: |
48 |           pytest tests --cov=supervised/
49 |     continue-on-error: true
50 | 
```

--------------------------------------------------------------------------------
/supervised/utils/data_validation.py:
--------------------------------------------------------------------------------

```python
 1 | def check_greater_than_zero_integer(value, original_var_name):
 2 |     if not isinstance(value, int):
 3 |         raise ValueError(
 4 |             f"'{original_var_name}' must be an integer, got '{type(value)}'."
 5 |         )
 6 | 
 7 |     if value <= 0:
 8 |         raise ValueError(
 9 |             f"'{original_var_name}' must be greater than zero, got '{value}'."
10 |         )
11 | 
12 | 
def check_positive_integer(value, original_var_name):
    """Validate that ``value`` is an ``int`` greater than or equal to zero.

    Args:
        value: the value to validate.
        original_var_name: name of the variable, used in the error message.

    Raises:
        ValueError: if ``value`` is not an ``int`` or is negative.
    """
    if not isinstance(value, int):
        problem = f"'{original_var_name}' must be an integer, got '{type(value)}'."
    elif value < 0:
        problem = (
            f"'{original_var_name}' must be equal or greater than zero, got '{value}'."
        )
    else:
        return
    raise ValueError(problem)
23 | 
24 | 
def check_integer(value, original_var_name):
    """Validate that ``value`` is an ``int``.

    Args:
        value: the value to validate.
        original_var_name: name of the variable, used in the error message.

    Raises:
        ValueError: if ``value`` is not an ``int``.
    """
    if isinstance(value, int):
        return
    problem = f"'{original_var_name}' must be an integer, got '{type(value)}'."
    raise ValueError(problem)
30 | 
31 | 
def check_bool(value, original_var_name):
    """Validate that ``value`` is a ``bool``.

    Args:
        value: the value to validate.
        original_var_name: name of the variable, used in the error message.

    Raises:
        ValueError: if ``value`` is not a ``bool``.
    """
    if isinstance(value, bool):
        return
    problem = f"'{original_var_name}' must be a boolean, got '{type(value)}'."
    raise ValueError(problem)
37 | 
38 | 
def check_greater_than_zero_integer_or_float(value, original_var_name):
    """Validate that ``value`` is an ``int`` or ``float`` strictly greater than zero.

    Args:
        value: the value to validate.
        original_var_name: name of the variable, used in the error message.

    Raises:
        ValueError: if ``value`` is neither ``int`` nor ``float``, or is ``<= 0``.
    """
    if not isinstance(value, (int, float)):
        problem = (
            f"'{original_var_name}' must be an integer or float, got '{type(value)}'."
        )
    elif value <= 0:
        problem = f"'{original_var_name}' must be greater than zero, got '{value}'."
    else:
        return
    raise ValueError(problem)
49 | 
```

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------

```python
 1 | from setuptools import setup, find_packages
 2 | from codecs import open
 3 | from os import path
 4 | 
 5 | here = path.abspath(path.dirname(__file__))
 6 | 
 7 | # Get the long description from the README file
 8 | with open(path.join(here, "README.md"), encoding="utf-8") as f:
 9 |     long_description = f.read()
10 | 
11 | setup(
12 |     name="mljar-supervised",
13 |     version="1.1.18",
14 |     description="Automated Machine Learning for Humans",
15 |     long_description=long_description,
16 |     long_description_content_type="text/markdown",
17 |     url="https://github.com/mljar/mljar-supervised",
18 |     author="MLJAR, Sp. z o.o.",
19 |     author_email="[email protected]",
20 |     license="MIT",
21 |     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
22 |     install_requires=open("requirements.txt").readlines(),
23 |     include_package_data=True,
24 |     python_requires='>=3.8',
25 |     classifiers=[
26 |         "Programming Language :: Python",
27 |         "Programming Language :: Python :: 3.8",
28 |         "Programming Language :: Python :: 3.9",
29 |         "Programming Language :: Python :: 3.10",
30 |         "Programming Language :: Python :: 3.11",
31 |     ],
32 |     keywords=[
33 |         "automated machine learning",
34 |         "automl",
35 |         "machine learning",
36 |         "data science",
37 |         "data mining",
38 |         "mljar",
39 |         "random forest",
40 |         "decision tree",
41 |         "xgboost",
42 |         "lightgbm",
43 |         "catboost",
44 |         "neural network",
45 |         "extra trees",
46 |         "linear model",
47 |         "features selection",
48 |         "features engineering"
49 |     ],
50 | )
51 | 
```

--------------------------------------------------------------------------------
/supervised/preprocessing/exclude_missing_target.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | import warnings
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised.utils.config import LOG_LEVEL
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | logger.setLevel(LOG_LEVEL)
11 | 
12 | 
class ExcludeRowsMissingTarget(object):
    """Drops the rows whose target value is missing from all passed objects."""

    @staticmethod
    def transform(
        X=None, y=None, sample_weight=None, sensitive_features=None, warn=False
    ):
        """Remove rows with a missing target from ``X``, ``y`` and companions.

        When ``y`` is ``None`` or has no missing values, the inputs are
        returned untouched. Otherwise the rows flagged by ``pd.isnull(y)``
        are dropped from every argument that is not ``None`` and the index of
        each result is reset.

        Args:
            X: features, or ``None``.
            y: target values, or ``None``.
            sample_weight: per-row weights, or ``None``.
            sensitive_features: sensitive features, or ``None``.
            warn: when true, emit a ``UserWarning`` if rows are excluded.

        Returns:
            tuple: ``(X, y, sample_weight, sensitive_features)``.
        """
        if y is None:
            return X, y, sample_weight, sensitive_features

        y_missing = pd.isnull(y)
        if np.sum(np.array(y_missing)) == 0:
            return X, y, sample_weight, sensitive_features

        logger.debug("Exclude rows with missing target values")
        if warn:
            warnings.warn(
                "There are samples with missing target values in the data which will be excluded for further analysis",
                UserWarning
            )

        def _without_missing(data):
            # Drop the flagged rows by index label, then renumber from zero.
            if data is None:
                return None
            kept = data.drop(data.index[y_missing])
            kept.reset_index(drop=True, inplace=True)
            return kept

        y = _without_missing(y)
        X = _without_missing(X)
        sample_weight = _without_missing(sample_weight)
        sensitive_features = _without_missing(sensitive_features)

        return X, y, sample_weight, sensitive_features
47 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_prediction_after_load.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | from numpy.testing import assert_almost_equal
 5 | from sklearn import datasets
 6 | from sklearn.model_selection import train_test_split
 7 | 
 8 | from supervised import AutoML
 9 | 
10 | 
class AutoMLPredictionAfterLoadTest(unittest.TestCase):
    """Integration test: predictions must match after reloading AutoML from disk."""

    automl_dir = "AutoMLPredictionAfterLoadTest"

    def tearDown(self):
        # Remove the results directory after the test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_integration(self):
        """Train in Compete mode with stacking, reload, compare ``predict_all`` output."""
        validation = {
            "validation_type": "kfold",
            "k_folds": 3,
            "shuffle": True,
            "stratify": True,
            "random_seed": 123,
        }
        trained = AutoML(
            results_path=self.automl_dir,
            mode="Compete",
            algorithms=["Baseline", "CatBoost", "LightGBM", "Xgboost"],
            stack_models=True,
            total_time_limit=60,
            validation_strategy=validation,
        )

        # Synthetic 8-class problem with 30 features.
        features, target = datasets.make_classification(
            n_samples=1000,
            n_features=30,
            n_informative=29,
            n_redundant=1,
            n_classes=8,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, train_size=0.8
        )

        trained.fit(X_train, y_train)
        expected = trained.predict_all(X_test)

        restored = AutoML(results_path=self.automl_dir)
        actual = restored.predict_all(X_test)

        # Only the first row of the first and last class columns is compared.
        for column in ("prediction_0", "prediction_7"):
            assert_almost_equal(expected[column].iloc[0], actual[column].iloc[0])
54 | 
```

--------------------------------------------------------------------------------
/examples/scripts/binary_classifier_ensemble.py:
--------------------------------------------------------------------------------

```python
 1 | import pandas as pd
 2 | from supervised.automl import AutoML
 3 | from supervised.ensemble import Ensemble
 4 | import os
 5 | 
# Example script: build an Ensemble by hand from the out-of-fold predictions
# stored in an AutoML results directory, save it, reload it and predict.

# Load the "adult" dataset; the last column ("income") is the target.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

X = df[df.columns[:-1]]
y = df["income"]

# NOTE(review): `automl.fit` is never called in this script; presumably the
# "AutoML_2" directory already holds a finished run that gets loaded — confirm.
results_path = "AutoML_2"
automl = AutoML(
    results_path=results_path,
    total_time_limit=400,
    start_random_models=10,
    hill_climbing_steps=0,
    top_models_to_improve=0,
    train_ensemble=False,
)


# Map model name -> model object for the models known to the AutoML instance.
models_map = {m.get_name(): m for m in automl._models}

ensemble = Ensemble("logloss", "binary_classification")
ensemble.models_map = models_map

# Collect the out-of-fold predictions of model_1 .. model_29 and take the
# target columns from the first file that is read.
# NOTE(review): assumes directories model_1 .. model_29 exist under
# `results_path` — verify against the actual run.
oofs = {}
target = None
for i in range(1, 30):
    oof = pd.read_csv(
        os.path.join(results_path, f"model_{i}", "predictions_out_of_folds.csv")
    )
    prediction_cols = [c for c in oof.columns if "prediction" in c]
    oofs[f"model_{i}"] = oof[prediction_cols]
    if target is None:
        target_columns = [c for c in oof.columns if "target" in c]
        target = oof[target_columns]

# Fit the ensemble on the collected predictions and save it to disk.
ensemble.target = target
ensemble.target_columns = "target"
ensemble.fit(oofs, target)
ensemble.save(os.path.join(results_path, "ensemble"))


predictions = ensemble.predict(X)
print(predictions.head())

# Sample output recorded by the author for the print above.
"""
    p_<=50K    p_>50K
0  0.982940  0.017060
1  0.722781  0.277219
2  0.972687  0.027313
3  0.903021  0.096979
4  0.591373  0.408627
"""


# Reload the saved ensemble and predict again; the recorded output below is
# identical to the one above.
ensemble2 = Ensemble.load(os.path.join(results_path, "ensemble"), models_map)
predictions2 = ensemble2.predict(X)
print(predictions2.head())

"""
    p_<=50K    p_>50K
0  0.982940  0.017060
1  0.722781  0.277219
2  0.972687  0.027313
3  0.903021  0.096979
4  0.591373  0.408627
"""
73 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/learner_time_constraint.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | import time
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised.callbacks.callback import Callback
 7 | from supervised.utils.config import LOG_LEVEL
 8 | 
 9 | log = logging.getLogger(__name__)
10 | log.setLevel(LOG_LEVEL)
11 | 
12 | 
class LearnerTimeConstraint(Callback):
    """Callback that requests a training stop once a learner's time limit is hit.

    Recognised ``params`` keys:
        name: callback name (default ``"learner_time_constraint"``).
        min_steps: minimum number of iterations before the time limit is
            checked; ``None`` disables this guard.
        learner_time_limit: time limit in seconds; ``None`` disables the limit.
    """

    def __init__(self, params=None):
        # ``None`` replaces the former mutable ``{}`` default so that no dict
        # object is shared between instances created without arguments.
        params = {} if params is None else params
        super(LearnerTimeConstraint, self).__init__(params)
        self.name = params.get("name", "learner_time_constraint")
        self.min_steps = params.get("min_steps")
        self.learner_time_limit = params.get("learner_time_limit")  # in seconds
        self.iterations_count = 0

    def on_learner_train_start(self, logs):
        """Remember when the learner started and reset the iteration counter."""
        self.train_start_time = time.time()
        self.iterations_count = 0

    def on_iteration_start(self, logs):
        """Remember when the current iteration started."""
        self.iter_start_time = time.time()

    def on_iteration_end(self, logs, predictions):
        """Log timings and set ``stop_training`` when the time limit is reached."""
        self.iterations_count += 1
        iteration_elapsed_time = np.round(time.time() - self.iter_start_time, 2)
        learner_elapsed_time = np.round(time.time() - self.train_start_time, 2)
        log.debug(
            "Iteration {0} took {1} seconds, learner training time {2} seconds".format(
                self.iterations_count, iteration_elapsed_time, learner_elapsed_time
            )
        )

        if self.min_steps is not None:
            if self.iterations_count < self.min_steps:
                # Too few iterations so far: skip the time-limit check.
                return

        if self.learner_time_limit is not None:
            if learner_elapsed_time >= self.learner_time_limit:
                # NOTE(review): ``self.learner`` is not set in this class;
                # presumably the Callback base class provides it — confirm.
                self.learner.stop_training = True
                log.info("Terminating learning, time limit reached")
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_restore.py:
--------------------------------------------------------------------------------

```python
 1 | import json
 2 | import os
 3 | import shutil
 4 | import unittest
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | 
 9 | from supervised import AutoML
10 | from supervised.algorithms.xgboost import additional
11 | 
12 | additional["max_rounds"] = 1
13 | 
14 | 
class AutoMLRestoreTest(unittest.TestCase):
    """Checks that a restored AutoML run ignores changed constructor parameters."""

    automl_dir = "automl_tests"
    rows = 50

    def tearDown(self):
        # Remove the results directory after the test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_tune_only_default(self):
        """Re-fitting with extra algorithms must not add models to a restored run."""
        features = pd.DataFrame(
            np.random.rand(self.rows, 3), columns=[f"f{i}" for i in range(3)]
        )
        target = np.random.randint(0, 2, self.rows)

        def count_model_dirs():
            # Entries whose name starts with a digit are counted as models.
            return sum(
                1 for entry in os.listdir(self.automl_dir) if entry[0].isdigit()
            )

        first_run = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            algorithms=["Decision Tree"],
            explain_level=0,
            train_ensemble=False,
        )
        first_run.fit(features, target)

        # Get number of starting models
        models_before = count_model_dirs()

        # Rewind the saved progress so the second fit continues from there.
        progress_path = os.path.join(self.automl_dir, "progress.json")
        with open(progress_path, "r") as fin:
            progress = json.load(fin)
        progress["fit_level"] = "default_algorithms"

        with open(progress_path, "w") as fout:
            fout.write(json.dumps(progress, indent=4))

        second_run = AutoML(
            results_path=self.automl_dir,
            total_time_limit=3,
            algorithms=["Decision Tree", "Xgboost"],
            explain_level=0,
            train_ensemble=False,
        )
        second_run.fit(features, target)

        # Get number of models after second fit
        models_after = count_model_dirs()
        # number of models should be equal
        # user cannot overwrite parameters
        self.assertEqual(models_after, models_before)
59 | 
```

--------------------------------------------------------------------------------
/supervised/preprocessing/label_encoder.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | from decimal import Decimal
 3 | 
 4 | import numpy as np
 5 | from sklearn import preprocessing as sk_preproc
 6 | 
 7 | from supervised.utils.config import LOG_LEVEL
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | logger.setLevel(LOG_LEVEL)
11 | 
12 | 
class LabelEncoder(object):
    """Wrapper around scikit-learn's ``LabelEncoder`` with JSON (de)serialisation.

    Args:
        try_to_fit_numeric: when true, ``fit`` tries to order the classes by
            their numeric value instead of the order produced by scikit-learn.
    """

    def __init__(self, try_to_fit_numeric=False):
        self.lbl = sk_preproc.LabelEncoder()
        self._try_to_fit_numeric = try_to_fit_numeric

    def fit(self, x):
        """Fit the encoder on ``x``; optionally re-order classes numerically."""
        self.lbl.fit(x)  # list(x.values))
        if not self._try_to_fit_numeric:
            return

        logger.debug("Try to fit numeric in LabelEncoder")
        try:
            # Sort with ``Decimal`` as the key instead of building a dict
            # keyed by Decimal: labels with equal numeric value (e.g. "1" and
            # "1.0") used to collapse into one dict entry, dropping a class.
            # The sort is stable, so such labels keep their relative order.
            ordered = sorted(self.lbl.classes_, key=Decimal)
            self.lbl.classes_ = np.array(ordered, dtype=self.lbl.classes_.dtype)
        except Exception:
            # Best effort only: when a class cannot be converted to Decimal
            # (or compared), the classes found by scikit-learn are kept.
            logger.debug("Classes are not numeric, keeping the fitted order")

    def transform(self, x):
        """Encode ``x``; labels unseen during fit are appended as new classes."""
        try:
            return self.lbl.transform(x)  # list(x.values))
        except ValueError:
            # rescue: extend the known classes with the unseen labels and retry
            classes = np.unique(x)  # list(x.values))
            diff = np.setdiff1d(classes, self.lbl.classes_)
            self.lbl.classes_ = np.concatenate((self.lbl.classes_, diff))
            return self.lbl.transform(x)  # list(x.values))

    def inverse_transform(self, x):
        """Map encoded values back to the original labels."""
        return self.lbl.inverse_transform(x)  # (list(x.values))

    def to_json(self):
        """Return a dict mapping ``str(class_label)`` to its integer code."""
        return {str(cl): i for i, cl in enumerate(self.lbl.classes_)}

    def from_json(self, data_json):
        """Restore the classes from the keys of a dict made by ``to_json``.

        Keys are strings; the pair ``"False"``/``"True"`` is restored as the
        booleans ``False``/``True``.
        """
        keys = np.array(list(data_json.keys()))
        if len(keys) == 2 and "False" in keys and "True" in keys:
            keys = np.array([False, True])
        self.lbl.classes_ = keys
55 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_exclude_missing.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget
 7 | 
 8 | 
 9 | class ExcludeRowsMissingTargetTest(unittest.TestCase):
10 |     def test_transform(self):
11 |         d_test = {
12 |             "col1": [1, 1, np.nan, 3],
13 |             "col2": ["a", "a", np.nan, "a"],
14 |             "col3": [1, 1, 1, 3],
15 |             "col4": ["a", "a", "b", "c"],
16 |             "y": [np.nan, 1, np.nan, 2],
17 |         }
18 |         df_test = pd.DataFrame(data=d_test)
19 |         X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
20 |         y = df_test.loc[:, "y"]
21 | 
22 |         self.assertEqual(X.shape[0], 4)
23 |         self.assertEqual(y.shape[0], 4)
24 |         X, y, _, _ = ExcludeRowsMissingTarget.transform(X, y)
25 |         self.assertEqual(X.shape[0], 2)
26 |         self.assertEqual(y.shape[0], 2)
27 | 
28 |         self.assertEqual(y[0], 1)
29 |         self.assertEqual(y[1], 2)
30 | 
31 |     def test_transform_with_sample_weight(self):
32 |         d_test = {
33 |             "col1": [1, 1, np.nan, 3],
34 |             "col2": ["a", "a", np.nan, "a"],
35 |             "col3": [1, 1, 1, 3],
36 |             "col4": ["a", "a", "b", "c"],
37 |             "sample_weight": [1, 2, 3, 4],
38 |             "y": [np.nan, 1, np.nan, 2],
39 |         }
40 |         df_test = pd.DataFrame(data=d_test)
41 |         X = df_test.loc[:, ["col1", "col2", "col3", "col4"]]
42 |         y = df_test.loc[:, "y"]
43 |         sample_weight = df_test.loc[:, "sample_weight"]
44 | 
45 |         self.assertEqual(X.shape[0], 4)
46 |         self.assertEqual(y.shape[0], 4)
47 |         X, y, sw, _ = ExcludeRowsMissingTarget.transform(X, y, sample_weight)
48 |         self.assertEqual(X.shape[0], 2)
49 |         self.assertEqual(y.shape[0], 2)
50 |         self.assertEqual(sw.shape[0], 2)
51 | 
52 |         self.assertEqual(y[0], 1)
53 |         self.assertEqual(y[1], 2)
54 |         self.assertEqual(sw[0], 2)
55 |         self.assertEqual(sw[1], 4)
56 | 
```

--------------------------------------------------------------------------------
/tests/tests_fairness/test_multi_class_classification.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
class FairnessInMultiClassClassificationTest(unittest.TestCase):
    """Fairness smoke test for a three-class problem with one sensitive feature."""

    automl_dir = "automl_fairness_testing"

    def tearDown(self):
        # Remove the results directory after the test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_init(self):
        """Fit with ``sensitive_features`` and check the per-class fairness data."""
        features = np.random.uniform(size=(30, 2))
        target = np.array(["A", "B", "C"] * 10)
        sensitive = pd.DataFrame({"sensitive": ["D", "E"] * 15})

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            explain_level=0,
            train_ensemble=False,
            stack_models=False,
            validation_strategy={"validation_type": "split"},
            start_random_models=1,
        )

        automl.fit(features, target, sensitive_features=sensitive)

        self.assertGreater(len(automl._models), 0)
        first_model = automl._models[0]

        # One name per target class is expected: sensitive__A, __B and __C.
        names = first_model.get_sensitive_features_names()
        self.assertEqual(len(names), 3)

        expected_names = ("sensitive__A", "sensitive__B", "sensitive__C")
        for expected_name in expected_names:
            self.assertIn(expected_name, names)

        for expected_name in expected_names:
            self.assertIsNotNone(first_model.get_fairness_metric(expected_name))

        self.assertGreater(len(first_model.get_fairness_optimization()), 1)
        self.assertIsNotNone(first_model.get_worst_fairness())
        self.assertIsNotNone(first_model.get_best_fairness())
56 | 
```

--------------------------------------------------------------------------------
/supervised/callbacks/metric_logger.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | log = logging.getLogger(__name__)
 4 | 
 5 | from supervised.callbacks.callback import Callback
 6 | from supervised.utils.metric import Metric
 7 | 
 8 | 
 9 | class MetricLogger(Callback):
10 |     def __init__(self, params):
11 |         super(MetricLogger, self).__init__(params)
12 |         self.name = params.get("name", "metric_logger")
13 |         self.loss_values = {}
14 |         self.metrics = []
15 |         for metric_name in params.get("metric_names"):
16 |             self.metrics += [Metric({"name": metric_name})]
17 | 
18 |     def add_and_set_learner(self, learner):
19 |         self.loss_values[learner.uid] = {"train": {}, "validation": {}, "iters": []}
20 |         for metric in self.metrics:
21 |             self.loss_values[learner.uid]["train"][metric.name] = []
22 |             self.loss_values[learner.uid]["validation"][metric.name] = []
23 | 
24 |         self.current_learner_uid = learner.uid
25 | 
26 |     def on_iteration_end(self, logs, predictions):
27 |         for metric in self.metrics:
28 |             train_loss = 0
29 |             if predictions.get("y_train_predicted") is not None:
30 |                 train_loss = metric(
31 |                     predictions.get("y_train_true"),
32 |                     predictions.get("y_train_predicted"),
33 |                 )
34 |             validation_loss = metric(
35 |                 predictions.get("y_validation_true"),
36 |                 predictions.get("y_validation_predicted"),
37 |             )
38 |             self.loss_values[self.current_learner_uid]["train"][metric.name] += [
39 |                 train_loss
40 |             ]
41 |             self.loss_values[self.current_learner_uid]["validation"][metric.name] += [
42 |                 validation_loss
43 |             ]
44 |             # keep information about iter number only once :)
45 |             if metric == self.metrics[0]:
46 |                 self.loss_values[self.current_learner_uid]["iters"] += [
47 |                     logs.get("iter_cnt")
48 |                 ]
49 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/knn.py:
--------------------------------------------------------------------------------

```python
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.knn import KNeighborsAlgorithm, KNeighborsRegressorAlgorithm
 4 | from supervised.algorithms.registry import (
 5 |     REGRESSION,
 6 | )
 7 | from supervised.utils.metric import Metric
 8 | 
 9 | 
10 | class KNNObjective:
11 |     def __init__(
12 |         self,
13 |         ml_task,
14 |         X_train,
15 |         y_train,
16 |         sample_weight,
17 |         X_validation,
18 |         y_validation,
19 |         sample_weight_validation,
20 |         eval_metric,
21 |         n_jobs,
22 |         random_state,
23 |     ):
24 |         self.ml_task = ml_task
25 |         self.X_train = X_train
26 |         self.y_train = y_train
27 |         self.sample_weight = sample_weight
28 |         self.X_validation = X_validation
29 |         self.y_validation = y_validation
30 |         self.eval_metric = eval_metric
31 |         self.n_jobs = n_jobs
32 |         self.seed = random_state
33 | 
34 |     def __call__(self, trial):
35 |         try:
36 |             params = {
37 |                 "n_neighbors": trial.suggest_int("n_neighbors", 1, 128),
38 |                 "weights": trial.suggest_categorical(
39 |                     "weights", ["uniform", "distance"]
40 |                 ),
41 |                 "n_jobs": self.n_jobs,
42 |                 "rows_limit": 100000,
43 |                 "ml_task": self.ml_task,
44 |             }
45 |             Algorithm = (
46 |                 KNeighborsRegressorAlgorithm
47 |                 if self.ml_task == REGRESSION
48 |                 else KNeighborsAlgorithm
49 |             )
50 |             model = Algorithm(params)
51 |             model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)
52 |             preds = model.predict(self.X_validation)
53 | 
54 |             score = self.eval_metric(self.y_validation, preds)
55 |             if Metric.optimize_negative(self.eval_metric.name):
56 |                 score *= -1.0
57 | 
58 |         except optuna.exceptions.TrialPruned as e:
59 |             raise e
60 |         except Exception as e:
61 |             print("Exception in KNNObjective", str(e))
62 |             return None
63 | 
64 |         return score
65 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_automl_init.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised import AutoML
 7 | 
 8 | 
 9 | class AutoMLInitTest(unittest.TestCase):
10 |     automl_dir = "AutoMLInitTest"
11 | 
12 |     def tearDown(self):
13 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
14 | 
15 |     def test_custom_init(self):
16 |         X = np.random.uniform(size=(30, 2))
17 |         y = np.random.randint(0, 2, size=(30,))
18 | 
19 |         automl = AutoML(
20 |             results_path=self.automl_dir,
21 |             model_time_limit=1,
22 |             algorithms=["Xgboost"],
23 |             explain_level=0,
24 |             train_ensemble=False,
25 |             stack_models=False,
26 |             validation_strategy={"validation_type": "split"},
27 |             start_random_models=3,
28 |             hill_climbing_steps=1,
29 |             top_models_to_improve=1,
30 |         )
31 | 
32 |         automl.fit(X, y)
33 |         self.assertGreater(len(automl._models), 3)
34 | 
35 |     def test_get_results_path(self):
36 |         automl = AutoML(algorithms=["Baseline"], total_time_limit=1)
37 |         first_path = automl._get_results_path()
38 |         self.assertEqual(first_path, automl._get_results_path())
39 |         shutil.rmtree(first_path, ignore_errors=True)
40 | 
41 |         automl = AutoML(
42 |             algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir
43 |         )
44 |         self.assertEqual(self.automl_dir, automl._get_results_path())
45 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
46 | 
47 |         # get results path after save
48 |         automl = AutoML(
49 |             algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir
50 |         )
51 |         X = np.random.uniform(size=(30, 2))
52 |         y = np.random.randint(0, 2, size=(30,))
53 |         automl.fit(X, y)
54 |         self.assertEqual(self.automl_dir, automl._get_results_path())
55 | 
56 |         automl2 = AutoML(
57 |             algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir
58 |         )
59 |         self.assertEqual(self.automl_dir, automl2._get_results_path())
60 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_stack_models_constraints.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | 
 6 | from supervised import AutoML
 7 | 
 8 | 
 9 | class AutoMLStackModelsConstraintsTest(unittest.TestCase):
10 |     automl_dir = "AutoMLStackModelsConstraintsTest"
11 | 
12 |     def tearDown(self):
13 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
14 | 
15 |     def test_allow_stack_models(self):
16 |         X = np.random.uniform(size=(100, 2))
17 |         y = np.random.randint(0, 2, size=(100,))
18 |         X[:, 0] = y
19 |         X[:, 1] = -y
20 | 
21 |         automl = AutoML(
22 |             results_path=self.automl_dir,
23 |             total_time_limit=5,
24 |             mode="Compete",
25 |             validation_strategy={"validation_type": "kfold", "k_folds": 5},
26 |         )
27 |         automl.fit(X, y)
28 |         self.assertTrue(automl._stack_models)
29 |         self.assertTrue(automl.tuner._stack_models)
30 |         self.assertTrue(automl._time_ctrl._is_stacking)
31 | 
32 |     def test_disable_stack_models(self):
33 |         X = np.random.uniform(size=(100, 2))
34 |         y = np.random.randint(0, 2, size=(100,))
35 |         X[:, 0] = y
36 |         X[:, 1] = -y
37 | 
38 |         automl = AutoML(
39 |             results_path=self.automl_dir,
40 |             total_time_limit=5,
41 |             mode="Compete",
42 |             validation_strategy={"validation_type": "split"},
43 |         )
44 |         automl.fit(X, y)
45 |         self.assertFalse(automl._stack_models)
46 |         self.assertFalse(automl.tuner._stack_models)
47 |         self.assertFalse(automl._time_ctrl._is_stacking)
48 | 
49 |     def test_disable_stack_models_adjusted_validation(self):
50 |         X = np.random.uniform(size=(100, 2))
51 |         y = np.random.randint(0, 2, size=(100,))
52 |         X[:, 0] = y
53 |         X[:, 1] = -y
54 | 
55 |         automl = AutoML(
56 |             results_path=self.automl_dir, total_time_limit=5, mode="Compete"
57 |         )
58 |         automl.fit(X, y)
59 |         # the stacking should be disabled
60 |         # because of small time limit
61 |         self.assertFalse(automl._stack_models)
62 |         self.assertFalse(automl.tuner._stack_models)
63 |         self.assertFalse(automl._time_ctrl._is_stacking)
64 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/test_decision_tree.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import tempfile
 3 | import unittest
 4 | 
 5 | from numpy.testing import assert_almost_equal
 6 | from sklearn import datasets
 7 | 
 8 | from supervised.algorithms.decision_tree import (
 9 |     DecisionTreeRegressorAlgorithm,
10 | )
11 | from supervised.utils.metric import Metric
12 | 
13 | 
14 | class DecisionTreeTest(unittest.TestCase):
15 |     @classmethod
16 |     def setUpClass(cls):
17 |         cls.X, cls.y = datasets.make_regression(
18 |             n_samples=100,
19 |             n_features=5,
20 |             n_informative=4,
21 |             n_targets=1,
22 |             shuffle=False,
23 |             random_state=0,
24 |         )
25 | 
26 |     def test_reproduce_fit_regression(self):
27 |         metric = Metric({"name": "rmse"})
28 |         params = {"max_depth": 1, "seed": 1, "ml_task": "regression"}
29 |         prev_loss = None
30 |         for _ in range(3):
31 |             model = DecisionTreeRegressorAlgorithm(params)
32 |             model.fit(self.X, self.y)
33 |             y_predicted = model.predict(self.X)
34 |             loss = metric(self.y, y_predicted)
35 |             if prev_loss is not None:
36 |                 assert_almost_equal(prev_loss, loss)
37 |             prev_loss = loss
38 | 
39 |     def test_save_and_load(self):
40 |         metric = Metric({"name": "rmse"})
41 |         dt = DecisionTreeRegressorAlgorithm({"ml_task": "regression"})
42 |         dt.fit(self.X, self.y)
43 |         y_predicted = dt.predict(self.X)
44 |         loss = metric(self.y, y_predicted)
45 | 
46 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
47 | 
48 |         dt.save(filename)
49 |         dt2 = DecisionTreeRegressorAlgorithm({"ml_task": "regression"})
50 |         dt2.load(filename)
51 | 
52 |         y_predicted = dt2.predict(self.X)
53 |         loss2 = metric(self.y, y_predicted)
54 |         assert_almost_equal(loss, loss2)
55 | 
56 |         # Finished with temp file, delete it
57 |         os.remove(filename)
58 | 
59 |     def test_is_fitted(self):
60 |         params = {"max_depth": 1, "seed": 1, "ml_task": "regression"}
61 |         model = DecisionTreeRegressorAlgorithm(params)
62 |         self.assertFalse(model.is_fitted())
63 |         model.fit(self.X, self.y)
64 |         self.assertTrue(model.is_fitted())
65 | 
```

--------------------------------------------------------------------------------
/tests/tests_callbacks/test_total_time_constraint.py:
--------------------------------------------------------------------------------

```python
 1 | import time
 2 | import unittest
 3 | 
 4 | from supervised.callbacks.total_time_constraint import TotalTimeConstraint
 5 | from supervised.exceptions import NotTrainedException
 6 | 
 7 | 
 8 | class TotalTimeConstraintTest(unittest.TestCase):
 9 |     def test_stop_on_first_learner(self):
10 |         params = {
11 |             "total_time_limit": 100,
12 |             "total_time_start": time.time(),
13 |             "expected_learners_cnt": 1001,
14 |         }
15 |         callback = TotalTimeConstraint(params)
16 |         callback.add_and_set_learner(learner={})
17 |         callback.on_learner_train_start(logs=None)
18 |         time.sleep(0.1)
19 |         with self.assertRaises(NotTrainedException) as context:
20 |             callback.on_learner_train_end(logs=None)
21 |         self.assertTrue("Stop training after the first fold" in str(context.exception))
22 | 
23 |     def test_stop_on_not_first_learner(self):
24 |         params = {
25 |             "total_time_limit": 100,
26 |             "total_time_start": time.time(),
27 |             "expected_learners_cnt": 10,
28 |         }
29 |         callback = TotalTimeConstraint(params)
30 |         callback.add_and_set_learner(learner={})
31 |         callback.on_learner_train_start(logs=None)
32 |         callback.on_learner_train_end(logs=None)
33 |         with self.assertRaises(NotTrainedException) as context:
34 |             #
35 |             # hardcoded change just for tests!
36 |             callback.total_time_start = time.time() - 600 - 100 - 1
37 |             #
38 |             callback.add_and_set_learner(learner={})
39 |             callback.on_learner_train_start(logs=None)
40 |             callback.on_learner_train_end(logs=None)
41 |         self.assertTrue("Force to stop" in str(context.exception))
42 | 
43 |     def test_dont_stop(self):
44 |         params = {
45 |             "total_time_limit": 100,
46 |             "total_time_start": time.time(),
47 |             "expected_learners_cnt": 10,
48 |         }
49 |         callback = TotalTimeConstraint(params)
50 | 
51 |         for i in range(10):
52 |             callback.add_and_set_learner(learner={})
53 |             callback.on_learner_train_start(logs=None)
54 |             callback.on_learner_train_end(logs=None)
55 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_preprocessing_utils.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
 7 | 
 8 | 
 9 | class PreprocessingUtilsTest(unittest.TestCase):
10 |     def test_get_type_numpy_number(self):
11 |         tmp = np.array([1, 2, 3])
12 |         tmp_type = PreprocessingUtils.get_type(tmp)
13 |         self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
14 | 
15 |     def test_get_type_numpy_categorical(self):
16 |         tmp = np.array(["a", "b", "c"])
17 |         tmp_type = PreprocessingUtils.get_type(tmp)
18 |         self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)
19 | 
20 |     def test_get_type_pandas_bug(self):
21 |         d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
22 |         df = pd.DataFrame(data=d)
23 |         col1_type = PreprocessingUtils.get_type(df.loc[:, "col2"])
24 |         self.assertEqual(col1_type, PreprocessingUtils.CATEGORICAL)
25 | 
26 |     def test_get_type_pandas(self):
27 |         d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
28 |         df = pd.DataFrame(data=d)
29 |         col1_type = PreprocessingUtils.get_type(df["col1"])
30 |         self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
31 |         col2_type = PreprocessingUtils.get_type(df["col2"])
32 |         self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
33 | 
34 |     def test_get_stats(self):
35 |         tmp = np.array([1, np.nan, 2, 3, np.nan, np.nan])
36 |         self.assertEqual(1, PreprocessingUtils.get_min(tmp))
37 |         self.assertEqual(2, PreprocessingUtils.get_mean(tmp))
38 |         self.assertEqual(2, PreprocessingUtils.get_median(tmp))
39 |         d = {"col1": [1, 2, 1, 3, 1, np.nan], "col2": ["a", np.nan, "b", "a", "c", "a"]}
40 |         df = pd.DataFrame(data=d)
41 |         self.assertEqual(1, PreprocessingUtils.get_min(df["col1"]))
42 |         self.assertEqual(8.0 / 5.0, PreprocessingUtils.get_mean(df["col1"]))
43 |         self.assertEqual(1, PreprocessingUtils.get_median(df["col1"]))
44 | 
45 |         self.assertEqual(1, PreprocessingUtils.get_most_frequent(df["col1"]))
46 |         self.assertEqual("a", PreprocessingUtils.get_most_frequent(df["col2"]))
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     unittest.main()
51 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/nn.py:
--------------------------------------------------------------------------------

```python
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.nn import MLPAlgorithm, MLPRegressorAlgorithm
 4 | from supervised.algorithms.registry import (
 5 |     REGRESSION,
 6 | )
 7 | from supervised.utils.metric import Metric
 8 | 
 9 | 
10 | class NeuralNetworkObjective:
11 |     def __init__(
12 |         self,
13 |         ml_task,
14 |         X_train,
15 |         y_train,
16 |         sample_weight,
17 |         X_validation,
18 |         y_validation,
19 |         sample_weight_validation,
20 |         eval_metric,
21 |         n_jobs,
22 |         random_state,
23 |     ):
24 |         self.ml_task = ml_task
25 |         self.X_train = X_train
26 |         self.y_train = y_train
27 |         self.sample_weight = sample_weight
28 |         self.X_validation = X_validation
29 |         self.y_validation = y_validation
30 |         self.eval_metric = eval_metric
31 |         self.seed = random_state
32 | 
33 |     def __call__(self, trial):
34 |         try:
35 |             Algorithm = (
36 |                 MLPRegressorAlgorithm if self.ml_task == REGRESSION else MLPAlgorithm
37 |             )
38 |             params = {
39 |                 "dense_1_size": trial.suggest_int("dense_1_size", 4, 100),
40 |                 "dense_2_size": trial.suggest_int("dense_2_size", 2, 100),
41 |                 "learning_rate": trial.suggest_categorical(
42 |                     "learning_rate", [0.005, 0.01, 0.05, 0.1, 0.2]
43 |                 ),
44 |                 "learning_rate_type": trial.suggest_categorical(
45 |                     "learning_rate_type", ["constant", "adaptive"]
46 |                 ),
47 |                 "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
48 |                 "seed": self.seed,
49 |                 "ml_task": self.ml_task,
50 |             }
51 |             model = Algorithm(params)
52 |             model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)
53 | 
54 |             preds = model.predict(self.X_validation)
55 | 
56 |             score = self.eval_metric(self.y_validation, preds)
57 |             if Metric.optimize_negative(self.eval_metric.name):
58 |                 score *= -1.0
59 | 
60 |         except optuna.exceptions.TrialPruned as e:
61 |             raise e
62 |         except Exception as e:
63 |             print("Exception in NeuralNetworkObjective", str(e))
64 |             return None
65 | 
66 |         return score
67 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/test_compute_additional_metrics.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | 
 5 | from supervised.algorithms.registry import BINARY_CLASSIFICATION, REGRESSION
 6 | from supervised.utils.additional_metrics import AdditionalMetrics
 7 | 
 8 | 
 9 | class ComputeAdditionalMetricsTest(unittest.TestCase):
10 |     def test_compute(self):
11 |         target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
12 |         pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8])
13 |         info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
14 |         details = info["metric_details"]
15 |         max_metrics = info["max_metrics"]
16 |         conf = info["confusion_matrix"]
17 |         self.assertEqual(conf.iloc[0, 0], 3)
18 |         self.assertEqual(conf.iloc[1, 1], 3)
19 |         self.assertTrue(details is not None)
20 |         self.assertTrue(max_metrics is not None)
21 | 
22 |     def test_compute_f1(self):
23 |         target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
24 |         pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
25 |         info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
26 |         details = info["metric_details"]
27 |         max_metrics = info["max_metrics"]
28 |         conf = info["confusion_matrix"]
29 |         self.assertEqual(max_metrics["f1"]["score"], 1)
30 |         self.assertTrue(details is not None)
31 |         self.assertTrue(conf is not None)
32 | 
33 |     def test_compute_for_regression(self):
34 |         target = np.array([0, 0, 0, 0, 1, 1, 1, 1])
35 |         pred = np.array([0.01, 0.2, 0.1, 0.1, 0.8, 0.8, 0.8, 0.8])
36 |         info = AdditionalMetrics.compute(target, pred, None, REGRESSION)
37 |         all_metrics = list(info["max_metrics"]["Metric"].values)
38 |         for m in ["MAE", "MSE", "RMSE", "R2"]:
39 |             self.assertTrue(m in all_metrics)
40 | 
41 |     def test_compute_constant_preds(self):
42 |         target = np.array([0, 0, 1, 1, 0, 0, 0, 0])
43 |         pred = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
44 |         info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION)
45 |         details = info["metric_details"]
46 |         max_metrics = info["max_metrics"]
47 |         conf = info["confusion_matrix"]
48 |         self.assertTrue(max_metrics["f1"]["score"] < 1)
49 |         self.assertTrue(max_metrics["mcc"]["score"] < 1)
50 | 
```

--------------------------------------------------------------------------------
/tests/tests_fairness/test_regression.py:
--------------------------------------------------------------------------------

```python
 1 | import shutil
 2 | import unittest
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | from supervised import AutoML
 8 | 
 9 | 
10 | class FairnessInRegressionTest(unittest.TestCase):
11 |     automl_dir = "automl_fairness_testing"
12 | 
13 |     def tearDown(self):
14 |         shutil.rmtree(self.automl_dir, ignore_errors=True)
15 | 
16 |     def test_init(self):
17 |         X = np.random.uniform(size=(30, 2))
18 |         y = np.random.randint(0, 100, size=(30,))
19 |         S = pd.DataFrame({"sensitive": ["A", "B"] * 15})
20 | 
21 |         automl = AutoML(
22 |             results_path=self.automl_dir,
23 |             model_time_limit=10,
24 |             algorithms=["Xgboost"],
25 |             explain_level=0,
26 |             train_ensemble=False,
27 |             stack_models=False,
28 |             validation_strategy={"validation_type": "split"},
29 |             start_random_models=1,
30 |         )
31 | 
32 |         automl.fit(X, y, sensitive_features=S)
33 | 
34 |         self.assertGreater(len(automl._models), 0)
35 | 
36 |         sensitive_features_names = automl._models[0].get_sensitive_features_names()
37 |         self.assertEqual(len(sensitive_features_names), 1)
38 |         self.assertTrue("sensitive" in sensitive_features_names)
39 | 
40 |         self.assertTrue(automl._models[0].get_fairness_metric("sensitive") is not None)
41 |         self.assertTrue(len(automl._models[0].get_fairness_optimization()) > 1)
42 |         self.assertTrue(automl._models[0].get_worst_fairness() is not None)
43 |         self.assertTrue(automl._models[0].get_best_fairness() is not None)
44 | 
45 |     def test_two_sensitive_features(self):
46 |         X = np.random.uniform(size=(30, 2))
47 |         y = np.random.randint(0, 100, size=(30,))
48 |         S = pd.DataFrame(
49 |             {
50 |                 "sensitive_1": ["White", "Black"] * 15,
51 |                 "sensitive_2": ["Male", "Female"] * 15,
52 |             }
53 |         )
54 | 
55 |         automl = AutoML(
56 |             results_path=self.automl_dir,
57 |             model_time_limit=10,
58 |             algorithms=["Xgboost"],
59 |             explain_level=0,
60 |             train_ensemble=False,
61 |             stack_models=False,
62 |             start_random_models=1,
63 |         )
64 | 
65 |         automl.fit(X, y, sensitive_features=S)
66 | 
67 |         self.assertGreater(len(automl._models), 0)
68 | 
69 |         sensitive_features_names = automl._models[0].get_sensitive_features_names()
70 |         self.assertEqual(len(sensitive_features_names), 2)
71 | 
```

--------------------------------------------------------------------------------
/tests/tests_tuner/test_time_controller.py:
--------------------------------------------------------------------------------

```python
 1 | import time
 2 | import unittest
 3 | 
 4 | from numpy.testing import assert_almost_equal
 5 | 
 6 | from supervised.tuner.time_controller import TimeController
 7 | 
 8 | 
 9 | class TimeControllerTest(unittest.TestCase):
10 |     def test_to_and_from_json(self):
11 |         tc = TimeController(
12 |             start_time=time.time(),
13 |             total_time_limit=10,
14 |             model_time_limit=None,
15 |             steps=["simple_algorithms"],
16 |             algorithms=["Baseline"],
17 |         )
18 |         tc.log_time("1_Baseline", "Baseline", "simple_algorithms", 123.1)
19 | 
20 |         tc2 = TimeController.from_json(tc.to_json())
21 | 
22 |         assert_almost_equal(tc2.step_spend("simple_algorithms"), 123.1)
23 |         assert_almost_equal(tc2.model_spend("Baseline"), 123.1)
24 | 
25 |     def test_enough_time_for_stacking(self):
26 |         for t in [5, 10, 20]:
27 |             tc = TimeController(
28 |                 start_time=time.time(),
29 |                 total_time_limit=100,
30 |                 model_time_limit=None,
31 |                 steps=[
32 |                     "default_algorithms",
33 |                     "not_so_random",
34 |                     "golden_features",
35 |                     "insert_random_feature",
36 |                     "features_selection",
37 |                     "hill_climbing_1",
38 |                     "hill_climbing_3",
39 |                     "hill_climbing_5",
40 |                     "ensemble",
41 |                     "stack",
42 |                     "ensemble_stacked",
43 |                 ],
44 |                 algorithms=["Xgboost"],
45 |             )
46 |             tc.log_time("1_Xgboost", "Xgboost", "default_algorithms", t)
47 |             tc.log_time("2_Xgboost", "Xgboost", "not_so_random", t)
48 |             tc.log_time("3_Xgboost", "Xgboost", "insert_random_feature", t)
49 |             tc.log_time("4_Xgboost", "Xgboost", "features_selection", t)
50 |             tc.log_time("5_Xgboost", "Xgboost", "hill_climbing_1", t)
51 |             tc.log_time("6_Xgboost", "Xgboost", "hill_climbing_2", t)
52 |             tc.log_time("7_Xgboost", "Xgboost", "hill_climbing_3", t)
53 | 
54 |             tc._start_time = time.time() - 7 * t
55 |             assert_almost_equal(tc.already_spend(), 7 * t)
56 |             if t < 20:
57 |                 self.assertTrue(tc.enough_time("Xgboost", "stack"))
58 |             else:
59 |                 self.assertFalse(tc.enough_time("Xgboost", "stack"))
60 |             self.assertTrue(tc.enough_time("Ensemble_Stacked", "ensemble_stacked"))
61 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/registry.py:
--------------------------------------------------------------------------------

```python
 1 | # tasks that can be handled by the package
 2 | BINARY_CLASSIFICATION = "binary_classification"
 3 | MULTICLASS_CLASSIFICATION = "multiclass_classification"
 4 | REGRESSION = "regression"
 5 | 
 6 | class AlgorithmsRegistry:
 7 |     registry = {
 8 |         BINARY_CLASSIFICATION: {},
 9 |         MULTICLASS_CLASSIFICATION: {},
10 |         REGRESSION: {},
11 |     }
12 | 
13 |     @staticmethod
14 |     def add(
15 |         task_name,
16 |         model_class,
17 |         model_params,
18 |         required_preprocessing,
19 |         additional,
20 |         default_params,
21 |     ):
22 |         model_information = {
23 |             "class": model_class,
24 |             "params": model_params,
25 |             "required_preprocessing": required_preprocessing,
26 |             "additional": additional,
27 |             "default_params": default_params,
28 |         }
29 |         AlgorithmsRegistry.registry[task_name][
30 |             model_class.algorithm_short_name
31 |         ] = model_information
32 | 
33 |     @staticmethod
34 |     def get_supported_ml_tasks():
35 |         return AlgorithmsRegistry.registry.keys()
36 | 
37 |     @staticmethod
38 |     def get_algorithm_class(ml_task, algorithm_name):
39 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name]["class"]
40 | 
41 |     @staticmethod
42 |     def get_long_name(ml_task, algorithm_name):
43 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name][
44 |             "class"
45 |         ].algorithm_name
46 | 
47 |     @staticmethod
48 |     def get_max_rows_limit(ml_task, algorithm_name):
49 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
50 |             "max_rows_limit"
51 |         ]
52 | 
53 |     @staticmethod
54 |     def get_max_cols_limit(ml_task, algorithm_name):
55 |         return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
56 |             "max_cols_limit"
57 |         ]
58 | 
59 |     @staticmethod
60 |     def get_eval_metric(algorithm_name, ml_task, automl_eval_metric):
61 |         if algorithm_name == "Xgboost":
62 |             return xgboost_eval_metric(ml_task, automl_eval_metric)
63 | 
64 |         return automl_eval_metric
65 | 
66 | # Import algorithm to be registered
67 | import supervised.algorithms.baseline
68 | import supervised.algorithms.catboost
69 | import supervised.algorithms.decision_tree
70 | import supervised.algorithms.extra_trees
71 | import supervised.algorithms.knn
72 | import supervised.algorithms.lightgbm
73 | import supervised.algorithms.linear
74 | import supervised.algorithms.nn
75 | import supervised.algorithms.random_forest
76 | import supervised.algorithms.xgboost
```

--------------------------------------------------------------------------------
/supervised/tuner/hill_climbing.py:
--------------------------------------------------------------------------------

```python
 1 | import copy
 2 | 
 3 | import numpy as np
 4 | 
 5 | from supervised.algorithms.registry import AlgorithmsRegistry
 6 | 
 7 | 
 8 | class HillClimbing:
 9 | 
10 |     """
11 |     Example params are in JSON format:
12 |     {
13 |         "booster": ["gbtree", "gblinear"],
14 |         "objective": ["binary:logistic"],
15 |         "eval_metric": ["auc", "logloss"],
16 |         "eta": [0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1]
17 |     }
18 |     """
19 | 
20 |     @staticmethod
21 |     def get(params, ml_task, seed=1):
22 |         np.random.seed(seed)
23 |         keys = list(params.keys())
24 |         for k in [
25 |             "num_class",
26 |             "model_type",
27 |             "seed",
28 |             "ml_task",
29 |             "explain_level",
30 |             "model_architecture_json",
31 |             "n_jobs",
32 |             "metric",
33 |             "eval_metric",
34 |             "custom_eval_metric_name",
35 |             "eval_metric_name",
36 |         ]:
37 |             if k in keys:
38 |                 keys.remove(k)
39 | 
40 |         model_type = params["model_type"]
41 |         if model_type == "Baseline":
42 |             return [None, None]
43 |         model_info = AlgorithmsRegistry.registry[ml_task][model_type]
44 |         model_params = model_info["params"]
45 | 
46 |         permuted_keys = np.random.permutation(keys)
47 |         key_to_update = None
48 |         values = None
49 | 
50 |         for key_to_update in permuted_keys:
51 |             if key_to_update not in model_params:
52 |                 continue
53 |             values = model_params[key_to_update]
54 |             if len(values) > 1:
55 |                 break
56 |         if values is None:
57 |             return [None, None]
58 | 
59 |         left, right = None, None
60 |         for i, v in enumerate(values):
61 |             if v == params[key_to_update]:
62 |                 if i + 1 < len(values):
63 |                     right = values[i + 1]
64 |                 if i - 1 >= 0:
65 |                     left = values[i - 1]
66 | 
67 |         params_1, params_2 = None, None
68 |         if left is not None:
69 |             params_1 = copy.deepcopy(params)
70 |             params_1[key_to_update] = left
71 |         if right is not None:
72 |             params_2 = copy.deepcopy(params)
73 |             params_2[key_to_update] = right
74 | 
75 |         if params_1 is not None and "model_architecture_json" in params_1:
76 |             del params_1["model_architecture_json"]
77 |         if params_2 is not None and "model_architecture_json" in params_2:
78 |             del params_2["model_architecture_json"]
79 | 
80 |         return [params_1, params_2]
81 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/data_info.py:
--------------------------------------------------------------------------------

```python
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from supervised.algorithms.registry import (
 5 |     BINARY_CLASSIFICATION,
 6 |     MULTICLASS_CLASSIFICATION,
 7 |     REGRESSION,
 8 | )
 9 | from supervised.preprocessing.encoding_selector import EncodingSelector
10 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
11 | 
12 | 
class DataInfo:
    """Derives preprocessing hints for the feature columns and the target."""

    @staticmethod
    def _describe_column(X, y, col):
        """Return the list of tags describing column ``col`` of ``X``."""
        is_missing = pd.isnull(X[col])

        # a column without any value gets no further inspection
        if np.sum(is_missing) == X.shape[0]:
            return ["empty_column"]

        # exactly one distinct non-missing value
        if len(np.unique(X.loc[~is_missing, col])) == 1:
            return ["constant_column"]

        tags = []
        if PreprocessingUtils.is_na(X[col]):
            tags.append("missing_values")

        if PreprocessingUtils.is_categorical(X[col]):
            tags.append("categorical")
            tags.append(EncodingSelector.get(X, y, col))
        elif PreprocessingUtils.is_datetime(X[col]):
            tags.append("datetime_transform")
        elif PreprocessingUtils.is_text(X[col]):
            # the text transform overrides any tag collected so far
            return ["text_transform"]
        elif PreprocessingUtils.is_scale_needed(X[col]):
            # numeric type that needs scaling
            tags.append("scale")
        return tags

    @staticmethod
    def compute(X, y, machinelearning_task):
        """Build the data description used to plan preprocessing.

        Returns a dict with three keys: ``"columns_info"`` maps each column of
        ``X`` to its list of tags, ``"target_info"`` lists the conversions
        needed for ``y`` under the given task, and ``"num_class"`` is the class
        count for multiclass classification (``None`` for other tasks).
        """
        columns_info = {}
        for col in X.columns:
            columns_info[col] = DataInfo._describe_column(X, y, col)

        target_info = []
        if machinelearning_task == BINARY_CLASSIFICATION and not PreprocessingUtils.is_0_1(
            y
        ):
            target_info.append("convert_0_1")

        if machinelearning_task == REGRESSION:
            # log scaling takes precedence over plain scaling
            if PreprocessingUtils.is_log_scale_needed(y):
                target_info.append("scale_log")
            elif PreprocessingUtils.is_scale_needed(y):
                target_info.append("scale")

        num_class = (
            PreprocessingUtils.num_class(y)
            if machinelearning_task == MULTICLASS_CLASSIFICATION
            else None
        )

        return {
            "columns_info": columns_info,
            "target_info": target_info,
            "num_class": num_class,
        }
65 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_dir_change.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import shutil
 3 | import unittest
 4 | 
 5 | import numpy as np
 6 | from numpy.testing import assert_almost_equal
 7 | from sklearn import datasets
 8 | 
 9 | from supervised import AutoML
10 | 
11 | 
class AutoMLDirChangeTest(unittest.TestCase):
    """Checks that saved AutoML results still work after being moved.

    Regression tests for https://github.com/mljar/mljar-supervised/issues/384
    """

    automl_dir_a = "automl_testing_A"
    automl_dir_b = "automl_testing_B"
    automl_dir = "automl_testing"

    def tearDown(self):
        # remove both parent directories, whichever of them exist
        for parent in (self.automl_dir_a, self.automl_dir_b):
            shutil.rmtree(parent, ignore_errors=True)

    def create_dir(self, dir_path):
        """Create ``dir_path`` if it is missing; failures are ignored."""
        if os.path.exists(dir_path):
            return
        try:
            os.mkdir(dir_path)
        except Exception:
            # best effort only
            pass

    def _prepare_paths(self):
        """Create both parent directories and return the two result paths."""
        self.create_dir(self.automl_dir_a)
        self.create_dir(self.automl_dir_b)
        return (
            os.path.join(self.automl_dir_a, self.automl_dir),
            os.path.join(self.automl_dir_b, self.automl_dir),
        )

    def test_create_report_after_dir_change(self):
        source_path, moved_path = self._prepare_paths()

        X = np.random.uniform(size=(30, 2))
        y = np.random.randint(0, 2, size=(30,))

        trained = AutoML(
            results_path=source_path, algorithms=["Baseline"], explain_level=0
        )
        trained.fit(X, y)

        shutil.move(source_path, moved_path)

        # a report must be available from the new location
        reloaded = AutoML(results_path=moved_path)
        reloaded.report()

    def test_compute_predictions_after_dir_change(self):
        source_path, moved_path = self._prepare_paths()

        X, y = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )

        trained = AutoML(
            results_path=source_path,
            explain_level=0,
            ml_task="regression",
            total_time_limit=10,
        )
        trained.fit(X, y)
        expected = trained.predict(X[:3])

        shutil.move(source_path, moved_path)

        reloaded = AutoML(results_path=moved_path)
        actual = reloaded.predict(X[:3])

        # predictions before and after the move must agree row by row
        for row in range(3):
            assert_almost_equal(expected[row], actual[row])
88 | 
```

--------------------------------------------------------------------------------
/tests/tests_preprocessing/test_scale.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | from numpy.testing import assert_almost_equal
 6 | 
 7 | from supervised.preprocessing.scale import Scale
 8 | 
 9 | 
class ScaleTest(unittest.TestCase):
    """Tests for fitting, applying, inverting and serialising ``Scale``."""

    def test_fit_log_and_normal(self):
        # training data with a few very large values in col1 and col3
        frame = pd.DataFrame(
            data={
                "col1": [12, 13, 3, 4, 5, 6, 7, 8000, 9000, 10000.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
                "col3": [12, 2, 3, 4, 5, 6, 7, 8000, 9000, 10000.0],
            }
        )

        scaler = Scale(["col1", "col3"], scale_method=Scale.SCALE_LOG_AND_NORMAL)
        scaler.fit(frame)
        frame = scaler.transform(frame)
        first_scaled = float(frame["col1"][0])

        assert_almost_equal(np.mean(frame["col1"]), 0)
        # in case of wrong scaling the small values will be squeezed
        self.assertLess(frame["col1"][0] + 0.01, frame["col1"][1])

        frame = scaler.inverse_transform(frame)

        # a scaler restored from JSON must reproduce the same transformation
        restored = Scale()
        serialized = scaler.to_json()
        restored.from_json(serialized)

        frame = restored.transform(frame)
        assert_almost_equal(frame["col1"][0], first_scaled)

    def test_fit(self):
        # training data
        frame = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
            }
        )

        scaler = Scale(["col1"])
        scaler.fit(frame)
        frame = scaler.transform(frame)

        # only col1 is scaled; col2 keeps its original mean
        assert_almost_equal(np.mean(frame["col1"]), 0)
        assert_almost_equal(np.mean(frame["col2"]), 25.5)

        frame = scaler.inverse_transform(frame)
        for position, original_value in enumerate((1, 2)):
            assert_almost_equal(frame["col1"][position], original_value)

    def test_to_and_from_json(self):
        # training data
        frame = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8.0, 9, 10],
                "col2": [21, 22.0, 23, 24, 25, 26, 27, 28, 29, 30],
            }
        )

        scaler = Scale(["col1"])
        scaler.fit(frame)
        # fitting alone must not modify the data
        assert_almost_equal(np.mean(frame["col1"]), 5.5)
        assert_almost_equal(np.mean(frame["col2"]), 25.5)

        # round trip through JSON
        restored = Scale()
        restored.from_json(scaler.to_json())

        # transform with the loaded scaler
        frame = restored.transform(frame)
        assert_almost_equal(np.mean(frame["col1"]), 0)
        assert_almost_equal(np.mean(frame["col2"]), 25.5)
80 | 
```

--------------------------------------------------------------------------------
/tests/tests_utils/test_metric.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | import numpy as np
 4 | from numpy.testing import assert_almost_equal
 5 | 
 6 | from supervised.utils.metric import Metric
 7 | from supervised.utils.metric import UserDefinedEvalMetric
 8 | 
 9 | 
class MetricTest(unittest.TestCase):
    """Tests for ``Metric`` and for user-defined evaluation metrics."""

    def test_create(self):
        logloss = Metric({"name": "logloss"})
        labels = np.array([0, 0, 1, 1])

        # perfect predictions give a small loss
        perfect = np.array([0, 0, 1, 1])
        self.assertLess(logloss(labels, perfect), 0.1)

        # fully inverted predictions give a large loss
        inverted = np.array([1, 1, 0, 0])
        self.assertGreater(logloss(labels, inverted), 1.0)

    def test_metric_improvement(self):
        logloss = Metric({"name": "logloss"})
        labels = np.array([0, 0, 1, 1])

        worse_score = logloss(labels, np.array([0, 0, 0, 1]))
        better_score = logloss(labels, np.array([0, 0, 1, 1]))
        self.assertTrue(logloss.improvement(worse_score, better_score))

    def test_sample_weight(self):
        # uniform weights must give the same score as no weights at all
        for metric_name in (
            "logloss",
            "auc",
            "acc",
            "rmse",
            "mse",
            "mae",
            "r2",
            "mape",
        ):
            metric = Metric({"name": metric_name})
            labels = np.array([0, 0, 1, 1])
            predictions = np.array([0, 0, 0, 1])
            uniform_weights = np.array([1, 1, 1, 1])

            unweighted = metric(labels, predictions)
            weighted = metric(labels, predictions, uniform_weights)
            assert_almost_equal(unweighted, weighted)

    def test_r2_metric(self):
        r2 = Metric({"name": "r2"})
        labels = np.array([0, 0, 1, 1])
        predictions = np.array([0, 0, 1, 1])
        # negative r2
        self.assertEqual(r2(labels, predictions), -1.0)

    def test_mape_metric(self):
        mape = Metric({"name": "mape"})
        labels = np.array([0, 0, 1, 1])
        predictions = np.array([0, 0, 1, 1])
        self.assertEqual(mape(labels, predictions), 0.0)

    def test_user_defined_metric(self):
        def custom(x, y, sample_weight=None):
            return np.sum(x + y)

        UserDefinedEvalMetric().set_metric(custom)

        metric = Metric({"name": "user_defined_metric"})
        ones = np.array([1, 1, 1])

        # sum of (1 + 1) over three elements
        self.assertEqual(metric(ones, ones), 6)
75 | 
```

--------------------------------------------------------------------------------
/tests/tests_automl/test_joblib_version.py:
--------------------------------------------------------------------------------

```python
 1 | import json
 2 | import os
 3 | import shutil
 4 | import unittest
 5 | 
 6 | import joblib
 7 | import numpy as np
 8 | 
 9 | from supervised import AutoML
10 | from supervised.exceptions import AutoMLException
11 | 
12 | 
class TestJoblibVersion(unittest.TestCase):
    """Checks the joblib version stored in, and validated from, framework.json."""

    automl_dir = "TestJoblibVersion"

    def tearDown(self):
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _train_single_xgboost(self):
        """Train one default Xgboost model; return ``(X, framework_json_path)``."""
        X = np.random.uniform(size=(60, 2))
        y = np.random.randint(0, 2, size=(60,))

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            mode="Explain",
            explain_level=0,
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            kmeans_features=False,
            golden_features=False,
            features_selection=False,
            boost_on_errors=False,
        )
        automl.fit(X, y)

        framework_path = os.path.join(
            self.automl_dir, "1_Default_Xgboost", "framework.json"
        )
        return X, framework_path

    def test_joblib_good_version(self):
        _, framework_path = self._train_single_xgboost()

        # the saved framework description must carry the installed version
        with open(framework_path) as file:
            framework = json.load(file)

        self.assertEqual(joblib.__version__, framework["joblib_version"])

    def test_joblib_wrong_version(self):
        X, framework_path = self._train_single_xgboost()

        with open(framework_path) as file:
            framework = json.load(file)

        # Injection of wrong joblib version
        framework["joblib_version"] = "0.2.0"

        with open(framework_path, "w") as file:
            json.dump(framework, file)

        # loading and predicting with the tampered file must be rejected
        with self.assertRaises(AutoMLException):
            reloaded = AutoML(results_path=self.automl_dir)
            reloaded.predict(X)
84 | 
85 | 
86 | if __name__ == "__main__":
87 |     unittest.main()
88 | 
```

--------------------------------------------------------------------------------
/supervised/algorithms/baseline.py:
--------------------------------------------------------------------------------

```python
 1 | import logging
 2 | 
 3 | import sklearn
 4 | from sklearn.base import ClassifierMixin, RegressorMixin
 5 | from sklearn.dummy import DummyClassifier, DummyRegressor
 6 | 
 7 | from supervised.algorithms.registry import (
 8 |     BINARY_CLASSIFICATION,
 9 |     MULTICLASS_CLASSIFICATION,
10 |     REGRESSION,
11 |     AlgorithmsRegistry,
12 | )
13 | from supervised.algorithms.sklearn import SklearnAlgorithm
14 | from supervised.utils.config import LOG_LEVEL
15 | 
16 | logger = logging.getLogger(__name__)
17 | logger.setLevel(LOG_LEVEL)
18 | 
19 | 
class BaselineClassifierAlgorithm(ClassifierMixin, SklearnAlgorithm):
    """Baseline classifier backed by ``DummyClassifier(strategy="prior")``."""

    algorithm_name = "Baseline Classifier"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        """Create the dummy model; ``params["seed"]`` (default 1) seeds it."""
        super().__init__(params)
        logger.debug("BaselineClassifierAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # `additional` is the module-level dict defined below the classes
        self.max_iters = additional.get("max_steps", 1)
        seed = params.get("seed", 1)
        self.model = DummyClassifier(strategy="prior", random_state=seed)

    def file_extension(self):
        """Return the file extension used for saved baseline models."""
        return "baseline"

    def is_fitted(self):
        """Return True when the model has a positive ``n_outputs_`` attribute."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0
43 | 
44 | 
class BaselineRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
    """Baseline regressor backed by ``DummyRegressor(strategy="mean")``."""

    algorithm_name = "Baseline Regressor"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        """Create the dummy model from the learner ``params``."""
        super().__init__(params)
        logger.debug("BaselineRegressorAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # `additional` is the module-level dict defined below the classes
        self.max_iters = additional.get("max_steps", 1)
        self.model = DummyRegressor(strategy="mean")

    def file_extension(self):
        """Return the file extension used for saved baseline models."""
        return "baseline"

    def is_fitted(self):
        """Return True when the model has a positive ``n_outputs_`` attribute."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0
66 | 
67 | 
# Registry metadata shared by both baseline learners; the classes above read
# "max_steps" from `additional` when they are instantiated.
additional = {"max_steps": 1, "max_rows_limit": None, "max_cols_limit": None}
required_preprocessing = ["target_as_integer"]

# Binary and multiclass classification are served by the same classifier.
for _classification_task in (BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION):
    AlgorithmsRegistry.add(
        _classification_task,
        BaselineClassifierAlgorithm,
        {},
        required_preprocessing,
        additional,
        {},
    )
del _classification_task

# The regressor is registered without required preprocessing.
AlgorithmsRegistry.add(REGRESSION, BaselineRegressorAlgorithm, {}, {}, additional, {})
91 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/extra_trees.py:
--------------------------------------------------------------------------------

```python
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.extra_trees import (
 4 |     ExtraTreesAlgorithm,
 5 |     ExtraTreesRegressorAlgorithm,
 6 | )
 7 | from supervised.algorithms.registry import (
 8 |     REGRESSION,
 9 | )
10 | from supervised.utils.metric import Metric
11 | 
12 | EPS = 1e-8
13 | 
14 | 
class ExtraTreesObjective:
    """Optuna objective that trains and scores one Extra Trees model per trial."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        """Store the data splits and settings shared by all trials.

        ``sample_weight`` is passed to ``fit``; ``sample_weight_validation``
        (may be ``None``) is passed to ``eval_metric`` when scoring the
        validation predictions. ``eval_metric`` is called as
        ``eval_metric(y_true, y_pred[, sample_weight])`` and must expose
        ``.name``.
        """
        self.ml_task = ml_task
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        self.X_validation = X_validation
        self.y_validation = y_validation
        # previously dropped, which left the validation score unweighted
        self.sample_weight_validation = sample_weight_validation
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.objective = "squared_error" if ml_task == REGRESSION else "gini"
        self.max_steps = 10  # ET is trained in steps 100 trees each
        self.seed = random_state

    def __call__(self, trial):
        """Train one model with trial-suggested parameters and return its score.

        The score is multiplied by -1 when ``Metric.optimize_negative`` is
        true for the metric. ``optuna.exceptions.TrialPruned`` propagates to
        Optuna; any other exception is printed and ``None`` is returned.
        """
        try:
            Algorithm = (
                ExtraTreesRegressorAlgorithm
                if self.ml_task == REGRESSION
                else ExtraTreesAlgorithm
            )
            # the split criterion is tuned only for classification
            self.objective = (
                "squared_error"
                if self.ml_task == REGRESSION
                else trial.suggest_categorical("criterion", ["gini", "entropy"])
            )
            params = {
                "max_steps": self.max_steps,
                "criterion": self.objective,
                "max_depth": trial.suggest_int("max_depth", 2, 32),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
                "max_features": trial.suggest_float("max_features", 0.01, 1),
                "n_jobs": self.n_jobs,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            model = Algorithm(params)

            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            preds = model.predict(self.X_validation)

            # weight the validation score only when weights were supplied, so
            # the unweighted call stays exactly as before
            if self.sample_weight_validation is None:
                score = self.eval_metric(self.y_validation, preds)
            else:
                score = self.eval_metric(
                    self.y_validation, preds, self.sample_weight_validation
                )
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned:
            # let Optuna handle pruning; bare raise keeps the traceback
            raise
        except Exception as e:
            print("Exception in ExtraTreesObjective", str(e))
            return None

        return score
81 | 
```

--------------------------------------------------------------------------------
/supervised/tuner/optuna/random_forest.py:
--------------------------------------------------------------------------------

```python
 1 | import optuna
 2 | 
 3 | from supervised.algorithms.random_forest import (
 4 |     RandomForestAlgorithm,
 5 |     RandomForestRegressorAlgorithm,
 6 | )
 7 | from supervised.algorithms.registry import (
 8 |     REGRESSION,
 9 | )
10 | from supervised.utils.metric import Metric
11 | 
12 | 
class RandomForestObjective:
    """Optuna objective that trains and scores one Random Forest per trial."""

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        """Store the data splits and settings shared by all trials.

        ``sample_weight`` is passed to ``fit``; ``sample_weight_validation``
        (may be ``None``) is passed to ``eval_metric`` when scoring the
        validation predictions. ``eval_metric`` is called as
        ``eval_metric(y_true, y_pred[, sample_weight])`` and must expose
        ``.name``.
        """
        self.ml_task = ml_task
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        self.X_validation = X_validation
        self.y_validation = y_validation
        # previously dropped, which left the validation score unweighted
        self.sample_weight_validation = sample_weight_validation
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.objective = "squared_error" if ml_task == REGRESSION else "gini"
        self.max_steps = 10  # RF is trained in steps 100 trees each
        self.seed = random_state

    def __call__(self, trial):
        """Train one model with trial-suggested parameters and return its score.

        The score is multiplied by -1 when ``Metric.optimize_negative`` is
        true for the metric. ``optuna.exceptions.TrialPruned`` propagates to
        Optuna; any other exception is printed and ``None`` is returned.
        """
        try:
            Algorithm = (
                RandomForestRegressorAlgorithm
                if self.ml_task == REGRESSION
                else RandomForestAlgorithm
            )
            # the split criterion is tuned only for classification
            self.objective = (
                "squared_error"
                if self.ml_task == REGRESSION
                else trial.suggest_categorical("criterion", ["gini", "entropy"])
            )
            params = {
                "max_steps": self.max_steps,
                "criterion": self.objective,
                "max_depth": trial.suggest_int("max_depth", 2, 32),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
                "max_features": trial.suggest_float("max_features", 0.01, 1),
                "n_jobs": self.n_jobs,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            model = Algorithm(params)
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            preds = model.predict(self.X_validation)

            # weight the validation score only when weights were supplied, so
            # the unweighted call stays exactly as before
            if self.sample_weight_validation is None:
                score = self.eval_metric(self.y_validation, preds)
            else:
                score = self.eval_metric(
                    self.y_validation, preds, self.sample_weight_validation
                )
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned:
            # let Optuna handle pruning; bare raise keeps the traceback
            raise
        except Exception as e:
            print("Exception in RandomForestObjective", str(e))
            return None

        return score
78 | 
```

--------------------------------------------------------------------------------
/tests/tests_tuner/test_hill_climbing.py:
--------------------------------------------------------------------------------

```python
 1 | import unittest
 2 | 
 3 | from supervised.tuner.mljar_tuner import MljarTuner
 4 | 
 5 | 
 6 | class ModelMock:
 7 |     def __init__(self, name, model_type, final_loss, params):
 8 |         self.name = name
 9 |         self.model_type = model_type
10 |         self.final_loss = final_loss
11 |         self.params = params
12 | 
13 |     def get_name(self):
14 |         return self.name
15 | 
16 |     def get_type(self):
17 |         return self.model_type
18 | 
19 |     def get_final_loss(self):
20 |         return self.final_loss
21 | 
22 |     def get_train_time(self):
23 |         return 0.1
24 | 
25 | 
class TunerHillClimbingTest(unittest.TestCase):
    """Checks the names of models generated by hill climbing."""

    def test_hill_climbing(self):
        # two already trained models with identical learner parameters
        models = [
            ModelMock(
                model_name,
                "Random Forest",
                0.1,
                {
                    "learner": {"max_features": 0.4, "model_type": "Random Forest"},
                    "preprocessing": {},
                    "validation_strategy": {},
                },
            )
            for model_name in ("121_RandomForest", "1_RandomForest")
        ]
        tuner_params = {
            "start_random_models": 0,
            "hill_climbing_steps": 1,
            "top_models_to_improve": 2,
        }
        tuner = MljarTuner(
            tuner_params,
            algorithms=["Random Foresrt"],
            ml_task="binary_classification",
            eval_metric="logloss",
            validation_strategy={},
            explain_level=2,
            data_info={"columns_info": [], "target_info": []},
            golden_features=False,
            features_selection=False,
            train_ensemble=False,
            stack_models=False,
            adjust_validation=False,
            boost_on_errors=False,
            kmeans_features=False,
            mix_encoding=False,
            optuna_time_budget=None,
            optuna_init_params={},
            optuna_verbose=True,
            n_jobs=1,
            seed=12,
        )

        # every new model must get a numeric prefix above all earlier ones
        highest_index = 121
        loss = 0.1
        for _ in range(5):
            for proposal in tuner.get_hill_climbing_params(models):
                models.append(
                    ModelMock(proposal["name"], "Random Forest", loss, proposal)
                )
                loss *= 0.1
                name_prefix = int(proposal["name"].split("_")[0])
                self.assertGreater(name_prefix, highest_index)
                highest_index += 1
87 | 
```

--------------------------------------------------------------------------------
/supervised/preprocessing/text_transformer.py:
--------------------------------------------------------------------------------

```python
 1 | import warnings
 2 | import numpy as np
 3 | import pandas as pd
 4 | from sklearn.feature_extraction.text import TfidfVectorizer
 5 | 
 6 | 
 7 | class TextTransformer(object):
 8 |     def __init__(self):
 9 |         self._new_columns = []
10 |         self._old_column = None
11 |         self._max_features = 100
12 |         self._vectorizer = None
13 | 
14 |     def fit(self, X, column):
15 |         self._old_column = column
16 |         self._vectorizer = TfidfVectorizer(
17 |             analyzer="word",
18 |             stop_words="english",
19 |             lowercase=True,
20 |             max_features=self._max_features,
21 |         )
22 | 
23 |         x = X[column][~pd.isnull(X[column])]
24 |         self._vectorizer.fit(x)
25 |         for f in list(self._vectorizer.get_feature_names_out()):
26 |             new_col = self._old_column + "_" + f
27 |             self._new_columns += [new_col]
28 | 
29 |     def transform(self, X):
30 |         with warnings.catch_warnings():
31 |             warnings.simplefilter(
32 |                 action="ignore", category=pd.errors.PerformanceWarning
33 |             )
34 |             ii = ~pd.isnull(X[self._old_column])
35 |             x = X[self._old_column][ii]
36 |             vect = self._vectorizer.transform(x)
37 | 
38 |             for f in self._new_columns:
39 |                 X[f] = 0.0
40 | 
41 |             X.loc[ii, self._new_columns] = vect.toarray()
42 |             X.drop(self._old_column, axis=1, inplace=True)
43 |         return X
44 | 
45 |     def to_json(self):
46 |         for k in self._vectorizer.vocabulary_.keys():
47 |             self._vectorizer.vocabulary_[k] = int(self._vectorizer.vocabulary_[k])
48 | 
49 |         data_json = {
50 |             "new_columns": list(self._new_columns),
51 |             "old_column": self._old_column,
52 |             "vocabulary": self._vectorizer.vocabulary_,
53 |             "fixed_vocabulary": self._vectorizer.fixed_vocabulary_,
54 |             "idf": list(self._vectorizer.idf_),
55 |         }
56 |         return data_json
57 | 
58 |     def from_json(self, data_json):
59 |         self._new_columns = data_json.get("new_columns", None)
60 |         self._old_column = data_json.get("old_column", None)
61 |         vocabulary = data_json.get("vocabulary")
62 |         fixed_vocabulary = data_json.get("fixed_vocabulary")
63 |         idf = data_json.get("idf")
64 |         if vocabulary is not None and fixed_vocabulary is not None and idf is not None:
65 |             self._vectorizer = TfidfVectorizer(
66 |                 analyzer="word",
67 |                 stop_words="english",
68 |                 lowercase=True,
69 |                 max_features=self._max_features,
70 |             )
71 |             self._vectorizer.vocabulary_ = vocabulary
72 |             self._vectorizer.fixed_vocabulary_ = fixed_vocabulary
73 |             self._vectorizer.idf_ = np.array(idf)
74 | 
```

--------------------------------------------------------------------------------
/tests/tests_algorithms/test_baseline.py:
--------------------------------------------------------------------------------

```python
 1 | import os
 2 | import tempfile
 3 | import unittest
 4 | 
 5 | from numpy.testing import assert_almost_equal
 6 | from sklearn import datasets
 7 | 
 8 | from supervised.algorithms.baseline import (
 9 |     BaselineClassifierAlgorithm,
10 |     BaselineRegressorAlgorithm,
11 | )
12 | from supervised.utils.metric import Metric
13 | 
14 | 
15 | class BaselineTest(unittest.TestCase):
16 |     @classmethod
17 |     def setUpClass(cls):
18 |         cls.X, cls.y = datasets.make_regression(
19 |             n_samples=100,
20 |             n_features=5,
21 |             n_informative=4,
22 |             n_targets=1,
23 |             shuffle=False,
24 |             random_state=0,
25 |         )
26 | 
27 |     def test_reproduce_fit_regression(self):
28 |         metric = Metric({"name": "rmse"})
29 |         prev_loss = None
30 |         for _ in range(3):
31 |             model = BaselineRegressorAlgorithm({"ml_task": "regression"})
32 |             model.fit(self.X, self.y)
33 |             y_predicted = model.predict(self.X)
34 |             loss = metric(self.y, y_predicted)
35 |             if prev_loss is not None:
36 |                 assert_almost_equal(prev_loss, loss)
37 |             prev_loss = loss
38 | 
39 |     def test_reproduce_fit_bin_class(self):
40 |         X, y = datasets.make_classification(
41 |             n_samples=100,
42 |             n_features=5,
43 |             n_informative=4,
44 |             n_redundant=1,
45 |             n_classes=2,
46 |             n_clusters_per_class=3,
47 |             n_repeated=0,
48 |             shuffle=False,
49 |             random_state=0,
50 |         )
51 |         metric = Metric({"name": "logloss"})
52 |         prev_loss = None
53 |         for _ in range(3):
54 |             model = BaselineClassifierAlgorithm({"ml_task": "binary_classification"})
55 |             model.fit(X, y)
56 |             y_predicted = model.predict(X)
57 |             loss = metric(y, y_predicted)
58 |             if prev_loss is not None:
59 |                 assert_almost_equal(prev_loss, loss)
60 |             prev_loss = loss
61 | 
62 |     def test_save_and_load(self):
63 |         metric = Metric({"name": "rmse"})
64 |         dt = BaselineRegressorAlgorithm({"ml_task": "regression"})
65 |         dt.fit(self.X, self.y)
66 |         y_predicted = dt.predict(self.X)
67 |         loss = metric(self.y, y_predicted)
68 | 
69 |         filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
70 | 
71 |         dt.save(filename)
72 |         dt2 = BaselineRegressorAlgorithm({"ml_task": "regression"})
73 |         dt2.load(filename)
74 |         # Finished with the file, delete it
75 |         os.remove(filename)
76 | 
77 |         y_predicted = dt2.predict(self.X)
78 |         loss2 = metric(self.y, y_predicted)
79 |         assert_almost_equal(loss, loss2)
80 | 
81 |     def test_is_fitted(self):
82 |         model = BaselineRegressorAlgorithm({"ml_task": "regression"})
83 |         self.assertFalse(model.is_fitted())
84 |         model.fit(self.X, self.y)
85 |         self.assertTrue(model.is_fitted())
86 | 
```