This is page 1 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ 
├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── 
PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── 
test_importance.py │ ├── test_learning_curves.py │ ├── test_metric.py │ ├── test_shap.py │ └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py ``` # Files -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` 1 | AutoML_* 2 | .vscode 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # 
Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | ``` -------------------------------------------------------------------------------- /tests/data/LawSchool/README.md: -------------------------------------------------------------------------------- ```markdown 1 | Source: https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage ``` -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- ```markdown 1 | # Running tests 2 | 3 | 4 | To run all tests: 5 | 6 | ``` 7 | pytest tests -v -x 8 | ``` 9 | 10 | To run tests for `algorithms`: 11 | 12 | ``` 13 | pytest tests/tests_algorithms -v -x -s 14 | ``` ``` -------------------------------------------------------------------------------- /tests/data/CrimeData/README.md: -------------------------------------------------------------------------------- ```markdown 1 | Source: https://www.kaggle.com/datasets/kkanda/communities%20and%20crime%20unnormalized%20data%20set?select=crimedata.csv 2 | Description: http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized ``` -------------------------------------------------------------------------------- /tests/data/Drug/README.md: -------------------------------------------------------------------------------- ```markdown 1 | Source https://www.kaggle.com/datasets/obeykhadija/drug-consumptions-uci 2 | 3 | 4 | Rating's for Drug Use: 5 | 6 | CL0 Never Used 7 | 8 | CL1 Used over a Decade Ago 9 | 10 | CL2 Used in Last Decade 11 | 12 | CL3 Used in Last Year 59 13 | 14 | CL4 Used in Last Month 15 | 16 | CL5 Used in Last Week 17 | 18 | CL6 Used in Last Day ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown 1 | 2 | 3 
| # MLJAR Automated Machine Learning for Humans 4 | 5 | [](https://github.com/mljar/mljar-supervised/actions/workflows/run-tests.yml) 6 | [](https://badge.fury.io/py/mljar-supervised) 7 | [](https://anaconda.org/conda-forge/mljar-supervised) 8 | [](https://pypi.python.org/pypi/mljar-supervised/) 9 | 10 | 11 | [](https://anaconda.org/conda-forge/mljar-supervised) 12 | [](https://anaconda.org/conda-forge/mljar-supervised) 13 | [](https://pepy.tech/project/mljar-supervised) 14 | 15 | <p align="center"> 16 | <img 17 | alt="mljar AutoML" 18 | src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/AutoML_white.png#gh-light-mode-only" width="50%" /> 19 | </p> 20 | <p align="center"> 21 | <img 22 | alt="mljar AutoML" 23 | src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/AutoML_black.png#gh-dark-mode-only" width="50%" /> 24 | </p> 25 | 26 | --- 27 | 28 | **Documentation**: <a href="https://supervised.mljar.com/" target="_blank">https://supervised.mljar.com/</a> 29 | 30 | **Source Code**: <a href="https://github.com/mljar/mljar-supervised" target="_blank">https://github.com/mljar/mljar-supervised</a> 31 | 32 | **Looking for commercial support**: Please contact us by [email](https://mljar.com/contact/) for details 33 | 34 | 35 | <p align="center"> 36 | <img src="https://raw.githubusercontent.com/mljar/mljar-examples/master/media/pipeline_AutoML.png" width="100%" /> 37 | </p> 38 | 39 | --- 40 | 41 | Watch full AutoML training in Python under 2 minutes. The training is done in [MLJAR Studio](https://mljar.com). 
42 | 43 | [](https://youtu.be/t_opxR5dbPU) 44 | 45 | ## Table of Contents 46 | 47 | - [Automated Machine Learning](https://github.com/mljar/mljar-supervised#automated-machine-learning) 48 | - [What's good in it?](https://github.com/mljar/mljar-supervised#whats-good-in-it) 49 | - [AutoML Web App with GUI](https://github.com/mljar/mljar-supervised#automl-web-app-with-user-interface) 50 | - [Automatic Documentation](https://github.com/mljar/mljar-supervised#automatic-documentation) 51 | - [Available Modes](https://github.com/mljar/mljar-supervised#available-modes) 52 | - [Fairness Aware Training](https://github.com/mljar/mljar-supervised#fairness-aware-training) 53 | - [Examples](https://github.com/mljar/mljar-supervised#examples) 54 | - [FAQ](https://github.com/mljar/mljar-supervised#faq) 55 | - [Documentation](https://github.com/mljar/mljar-supervised#documentation) 56 | - [Installation](https://github.com/mljar/mljar-supervised#installation) 57 | - [Demo](https://github.com/mljar/mljar-supervised#demo) 58 | - [Contributing](https://github.com/mljar/mljar-supervised#contributing) 59 | - [Cite](https://github.com/mljar/mljar-supervised#cite) 60 | - [License](https://github.com/mljar/mljar-supervised#license) 61 | - [Commercial support](https://github.com/mljar/mljar-supervised#commercial-support) 62 | - [MLJAR](https://github.com/mljar/mljar-supervised#mljar) 63 | 64 | 65 | 66 | 67 | 68 | # Automated Machine Learning 69 | 70 | The `mljar-supervised` is an Automated Machine Learning Python package that works with tabular data. It is designed to save time for a data scientist. It abstracts the common way to preprocess the data, construct the machine learning models, and perform hyper-parameters tuning to find the best model :trophy:. It is no black box, as you can see exactly how the ML pipeline is constructed (with a detailed Markdown report for each ML model). 
71 | 72 | The `mljar-supervised` will help you with: 73 | - explaining and understanding your data (Automatic Exploratory Data Analysis), 74 | - trying many different machine learning models (Algorithm Selection and Hyper-Parameters tuning), 75 | - creating Markdown reports from analysis with details about all models (Automatic-Documentation), 76 | - saving, re-running, and loading the analysis and ML models. 77 | 78 | It has four built-in modes of work: 79 | - `Explain` mode, which is ideal for explaining and understanding the data, with many data explanations, like decision trees visualization, linear models coefficients display, permutation importance, and SHAP explanations of data, 80 | - `Perform` for building ML pipelines to use in production, 81 | - `Compete` mode that trains highly-tuned ML models with ensembling and stacking, with the purpose to use in ML competitions, 82 | - `Optuna` mode, which searches for highly-tuned ML models; it should be used when the performance is the most important and computation time is not limited (it is available from version `0.10.0`). 83 | 84 | Of course, you can further customize the details of each `mode` to meet the requirements. 85 | 86 | ## What's good in it? 87 | 88 | - It uses many algorithms: `Baseline`, `Linear`, `Random Forest`, `Extra Trees`, `LightGBM`, `Xgboost`, `CatBoost`, `Neural Networks`, and `Nearest Neighbors`. 89 | - It can compute Ensemble based on a greedy algorithm from [Caruana paper](http://www.cs.cornell.edu/~alexn/papers/shotgun.icml04.revised.rev2.pdf). 90 | - It can stack models to build a level 2 ensemble (available in `Compete` mode or after setting the `stack_models` parameter). 91 | - It can do features preprocessing, like missing values imputation and converting categoricals. What is more, it can also handle target values preprocessing. 
92 | - It can do advanced features engineering, like [Golden Features](https://supervised.mljar.com/features/golden_features/), [Features Selection](https://supervised.mljar.com/features/features_selection/), Text and Time Transformations. 93 | - It can tune hyper-parameters with a `not-so-random-search` algorithm (random-search over a defined set of values) and hill climbing to fine-tune final models. 94 | - It can compute the `Baseline` for your data so that you will know if you need Machine Learning or not! 95 | - It has extensive explanations. This package is training simple `Decision Trees` with `max_depth <= 5`, so you can easily visualize them with amazing [dtreeviz](https://github.com/parrt/dtreeviz) to better understand your data. 96 | - The `mljar-supervised` uses simple linear regression and includes its coefficients in the summary report, so you can check which features are used the most in the linear model. 97 | - It cares about the explainability of models: for every algorithm, the feature importance is computed based on permutation. Additionally, for every algorithm, the SHAP explanations are computed: feature importance, dependence plots, and decision plots (explanations can be switched off with the `explain_level` parameter). 98 | - There is automatic documentation for every ML experiment run with AutoML. The `mljar-supervised` creates markdown reports from AutoML training full of ML details, metrics, and charts. 99 | 100 | <p align="center"> 101 | <img src="https://raw.githubusercontent.com/mljar/visual-identity/main/media/infograph.png" width="100%" /> 102 | </p> 103 | 104 | # AutoML Web App with User Interface 105 | 106 | We created a Web App with GUI, so you don't need to write any code 🐍. Just upload your data. Please check the Web App at [github.com/mljar/automl-app](https://github.com/mljar/automl-app). 
You can run this Web App locally on your computer, so your data is safe and secure :cat: 107 | 108 | <kbd> 109 | <img src="https://github.com/mljar/automl-app/blob/main/media/web-app.gif" alt="AutoML training in Web App"></img> 110 | </kbd> 111 | 112 | # Automatic Documentation 113 | 114 | ## The AutoML Report 115 | 116 | The report from running AutoML will contain the table with information about each model score and the time needed to train the model. There is a link for each model, which you can click to see the model's details. The performance of all ML models is presented as scatter and box plots so you can visually inspect which algorithms perform the best :trophy:. 117 | 118 |  119 | 120 | ## The `Decision Tree` Report 121 | 122 | The example for `Decision Tree` summary with trees visualization. For classification tasks, additional metrics are provided: 123 | - confusion matrix 124 | - threshold (optimized in the case of binary classification task) 125 | - F1 score 126 | - Accuracy 127 | - Precision, Recall, MCC 128 | 129 |  130 | 131 | ## The `LightGBM` Report 132 | 133 | The example for `LightGBM` summary: 134 | 135 |  136 | 137 | 138 | ## Available Modes 139 | 140 | In the [docs](https://supervised.mljar.com/features/modes/) you can find details about AutoML modes that are presented in the table. 141 | 142 | <p align="center"> 143 | <img src="https://raw.githubusercontent.com/mljar/visual-identity/main/media/mljar_modes.png" width="100%" /> 144 | </p> 145 | 146 | ### Explain 147 | 148 | ```py 149 | automl = AutoML(mode="Explain") 150 | ``` 151 | 152 | It is aimed to be used when the user wants to explain and understand the data. 153 | - It is using 75%/25% train/test split. 154 | - It uses: `Baseline`, `Linear`, `Decision Tree`, `Random Forest`, `Xgboost`, `Neural Network` algorithms, and ensemble. 155 | - It has full explanations: learning curves, importance plots, and SHAP plots. 
156 | 157 | ### Perform 158 | 159 | ```py 160 | automl = AutoML(mode="Perform") 161 | ``` 162 | 163 | It should be used when the user wants to train a model that will be used in real-life use cases. 164 | - It uses a 5-fold CV. 165 | - It uses: `Linear`, `Random Forest`, `LightGBM`, `Xgboost`, `CatBoost`, and `Neural Network`. It uses ensembling. 166 | - It has learning curves and importance plots in reports. 167 | 168 | ### Compete 169 | 170 | ```py 171 | automl = AutoML(mode="Compete") 172 | ``` 173 | 174 | It should be used for machine learning competitions. 175 | - It adapts the validation strategy depending on dataset size and `total_time_limit`. It can be: a train/test split (80/20), 5-fold CV or 10-fold CV. 176 | - It is using: `Linear`, `Decision Tree`, `Random Forest`, `Extra Trees`, `LightGBM`, `Xgboost`, `CatBoost`, `Neural Network`, and `Nearest Neighbors`. It uses ensemble and **stacking**. 177 | - It has only learning curves in the reports. 178 | 179 | ### Optuna 180 | 181 | ```py 182 | automl = AutoML(mode="Optuna", optuna_time_budget=3600) 183 | ``` 184 | 185 | It should be used when the performance is the most important and time is not limited. 186 | - It uses a 10-fold CV 187 | - It uses: `Random Forest`, `Extra Trees`, `LightGBM`, `Xgboost`, and `CatBoost`. Those algorithms are tuned by `Optuna` framework for `optuna_time_budget` seconds, each. Algorithms are tuned with original data, without advanced feature engineering. 188 | - It uses advanced feature engineering, stacking and ensembling. The hyperparameters found for original data are reused with those steps. 189 | - It produces learning curves in the reports. 190 | 191 | 192 | 193 | ## How to save and load AutoML? 194 | 195 | All models in the AutoML are saved and loaded automatically. No need to call `save()` or `load()`. 
196 | 197 | ### Example: 198 | 199 | #### Train AutoML 200 | 201 | ```python 202 | automl = AutoML(results_path="AutoML_classifier") 203 | automl.fit(X, y) 204 | ``` 205 | 206 | You will have all models saved in the `AutoML_classifier` directory. Each model will have a separate directory with the `README.md` file with all details from the training. 207 | 208 | #### Compute predictions 209 | ```python 210 | automl = AutoML(results_path="AutoML_classifier") 211 | automl.predict(X) 212 | ``` 213 | 214 | The AutoML automatically loads models from the `results_path` directory. If you call `fit()` on an already trained AutoML, you will get a warning message that AutoML is already fitted. 215 | 216 | 217 | ### Why do you automatically save all models? 218 | 219 | All models are automatically saved to be able to restore the training after interruption. For example, you are training AutoML for 48 hours, and after 47 hours, there is some unexpected interruption. In MLJAR AutoML you just call the same training code after the interruption and AutoML reloads already trained models and finishes the training. 220 | 221 | ## Supported evaluation metrics (`eval_metric` argument in `AutoML()`) 222 | 223 | - for binary classification: `logloss`, `auc`, `f1`, `average_precision`, `accuracy` - default is `logloss` 224 | - for multiclass classification: `logloss`, `f1`, `accuracy` - default is `logloss` 225 | - for regression: `rmse`, `mse`, `mae`, `r2`, `mape`, `spearman`, `pearson` - default is `rmse` 226 | 227 | If you don't find the `eval_metric` that you need, please add a new issue. We will add it. 228 | 229 | 230 | ## Fairness Aware Training 231 | 232 | Starting from version `1.0.0` AutoML can optimize the Machine Learning pipeline with sensitive features. 
There are the following fairness-related arguments in the AutoML constructor: 233 | - `fairness_metric` - metric which will be used to decide if the model is fair, 234 | - `fairness_threshold` - threshold used in decision about model fairness, 235 | - `privileged_groups` - privileged groups used in fairness metrics computation, 236 | - `underprivileged_groups` - underprivileged groups used in fairness metrics computation. 237 | 238 | The `fit()` method accepts `sensitive_features`. When sensitive features are passed to AutoML, the best model will be selected among fair models only. In the AutoML reports, additional information about fairness metrics will be added. The MLJAR AutoML supports two methods for bias mitigation: 239 | - Sample Weighting - assigns weights to samples to treat samples equally, 240 | - Smart Grid Search - similar to Sample Weighting, where different weights are checked to optimize fairness metric. 241 | 242 | The fair ML building can be used with all algorithms, including `Ensemble` and `Stacked Ensemble`. We support three Machine Learning tasks: 243 | - binary classification, 244 | - multiclass classification, 245 | - regression. 
246 | 247 | Example code: 248 | 249 | 250 | ```python 251 | from sklearn.model_selection import train_test_split 252 | from sklearn.datasets import fetch_openml 253 | from supervised.automl import AutoML 254 | 255 | data = fetch_openml(data_id=1590, as_frame=True) 256 | X = data.data 257 | y = (data.target == ">50K") * 1 258 | sensitive_features = X[["sex"]] 259 | 260 | X_train, X_test, y_train, y_test, S_train, S_test = train_test_split( 261 | X, y, sensitive_features, stratify=y, test_size=0.75, random_state=42 262 | ) 263 | 264 | automl = AutoML( 265 | algorithms=[ 266 | "Xgboost" 267 | ], 268 | train_ensemble=False, 269 | fairness_metric="demographic_parity_ratio", 270 | fairness_threshold=0.8, 271 | privileged_groups = [{"sex": "Male"}], 272 | underprivileged_groups = [{"sex": "Female"}], 273 | ) 274 | 275 | automl.fit(X_train, y_train, sensitive_features=S_train) 276 | ``` 277 | 278 | You can read more about fairness aware AutoML training in our article https://mljar.com/blog/fairness-machine-learning/ 279 | 280 |  281 | 282 | 283 | 284 | # Examples 285 | 286 | ## :point_right: Binary Classification Example 287 | 288 | There is a simple interface available with `fit` and `predict` methods. 
289 | 290 | ```python 291 | import pandas as pd 292 | from sklearn.model_selection import train_test_split 293 | from supervised.automl import AutoML 294 | 295 | df = pd.read_csv( 296 | "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv", 297 | skipinitialspace=True, 298 | ) 299 | X_train, X_test, y_train, y_test = train_test_split( 300 | df[df.columns[:-1]], df["income"], test_size=0.25 301 | ) 302 | 303 | automl = AutoML() 304 | automl.fit(X_train, y_train) 305 | 306 | predictions = automl.predict(X_test) 307 | ``` 308 | 309 | AutoML `fit` will print: 310 | ```py 311 | Create directory AutoML_1 312 | AutoML task to be solved: binary_classification 313 | AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network'] 314 | AutoML will optimize for metric: logloss 315 | 1_Baseline final logloss 0.5519845471086654 time 0.08 seconds 316 | 2_DecisionTree final logloss 0.3655910192804364 time 10.28 seconds 317 | 3_Linear final logloss 0.38139916864708445 time 3.19 seconds 318 | 4_Default_RandomForest final logloss 0.2975204390214936 time 79.19 seconds 319 | 5_Default_Xgboost final logloss 0.2731086827200411 time 5.17 seconds 320 | 6_Default_NeuralNetwork final logloss 0.319812276905242 time 21.19 seconds 321 | Ensemble final logloss 0.2731086821194617 time 1.43 seconds 322 | ``` 323 | 324 | - the AutoML results in [Markdown report](https://github.com/mljar/mljar-examples/tree/master/Income_classification/AutoML_1#automl-leaderboard) 325 | - the Xgboost [Markdown report](https://github.com/mljar/mljar-examples/blob/master/Income_classification/AutoML_1/5_Default_Xgboost/README.md), please take a look at amazing dependence plots produced by SHAP package :sparkling_heart: 326 | - the Decision Tree [Markdown report](https://github.com/mljar/mljar-examples/blob/master/Income_classification/AutoML_1/2_DecisionTree/README.md), please take a look at beautiful tree visualization :sparkles: 327 | 
- the Logistic Regression [Markdown report](https://github.com/mljar/mljar-examples/blob/master/Income_classification/AutoML_1/3_Linear/README.md), please take a look at coefficients table, and you can compare the SHAP plots between (Xgboost, Decision Tree and Logistic Regression) :coffee: 328 | 329 | 330 | ## :point_right: Multi-Class Classification Example 331 | 332 | The example code for classification of the optical recognition of handwritten digits dataset. Running this code in less than 30 minutes will result in test accuracy ~98%. 333 | 334 | ```python 335 | import pandas as pd 336 | # scikit learn utilites 337 | from sklearn.datasets import load_digits 338 | from sklearn.metrics import accuracy_score 339 | from sklearn.model_selection import train_test_split 340 | # mljar-supervised package 341 | from supervised.automl import AutoML 342 | 343 | # load the data 344 | digits = load_digits() 345 | X_train, X_test, y_train, y_test = train_test_split( 346 | pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25, 347 | random_state=123 348 | ) 349 | 350 | # train models with AutoML 351 | automl = AutoML(mode="Perform") 352 | automl.fit(X_train, y_train) 353 | 354 | # compute the accuracy on test data 355 | predictions = automl.predict_all(X_test) 356 | print(predictions.head()) 357 | print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int))) 358 | ``` 359 | 360 | ## :point_right: Regression Example 361 | 362 | Regression example on `California Housing` house prices data. 
363 | 364 | ```python 365 | import numpy as np 366 | import pandas as pd 367 | from sklearn.datasets import fetch_california_housing 368 | from sklearn.model_selection import train_test_split 369 | from sklearn.metrics import mean_squared_error 370 | from supervised.automl import AutoML # mljar-supervised 371 | 372 | # Load the data 373 | housing = fetch_california_housing() 374 | X_train, X_test, y_train, y_test = train_test_split( 375 | pd.DataFrame(housing.data, columns=housing.feature_names), 376 | housing.target, 377 | test_size=0.25, 378 | random_state=123, 379 | ) 380 | 381 | # train models with AutoML 382 | automl = AutoML(mode="Explain") 383 | automl.fit(X_train, y_train) 384 | 385 | # compute the MSE on test data 386 | predictions = automl.predict(X_test) 387 | print("Test MSE:", mean_squared_error(y_test, predictions)) 388 | ``` 389 | 390 | ## :point_right: More Examples 391 | 392 | - [**Income classification**](https://github.com/mljar/mljar-examples/tree/master/Income_classification) - it is a binary classification task on census data 393 | - [**Iris classification**](https://github.com/mljar/mljar-examples/tree/master/Iris_classification) - it is a multiclass classification on Iris flowers data 394 | - [**House price regression**](https://github.com/mljar/mljar-examples/tree/master/House_price_regression) - it is a regression task on Boston houses data 395 | 396 | # FAQ 397 | 398 | <details><summary>What method is used for hyperparameters optimization?</summary> 399 | - For modes: `Explain`, `Perform`, and `Compete` a random search method combined with hill climbing is used. In this approach, all checked models are saved and used for building Ensemble. 400 | - For mode: `Optuna` the Optuna framework is used. It uses the TPE sampler for tuning. Models checked during the Optuna hyperparameters search are not saved, only the best model is saved (the final model from tuning). 
You can check the details about checked hyperparameters from optuna by checking study files in the `optuna` directory in your AutoML `results_path`. 401 | </details> 402 | 403 | <details><summary>How to save and load AutoML?</summary> 404 | 405 | The save and load of AutoML models is automatic. All models created during AutoML training are saved in the directory set in `results_path` (argument of `AutoML()` constructor). If there is no `results_path` set, then the directory is created based on the following naming convention: `AutoML_{number}`, where `number` is a number from 1 to 1000 (depending on which directory name is free). 406 | 407 | Example save and load: 408 | 409 | ```python 410 | automl = AutoML(results_path='AutoML_1') 411 | automl.fit(X, y) 412 | ``` 413 | 414 | All models from AutoML are saved in the `AutoML_1` directory. 415 | 416 | To load models: 417 | 418 | ```python 419 | automl = AutoML(results_path='AutoML_1') 420 | automl.predict(X) 421 | ``` 422 | 423 | </details> 424 | 425 | <details><summary>How to set ML task (select between classification or regression)?</summary> 426 | 427 | The MLJAR AutoML can work with: 428 | - binary classification 429 | - multi-class classification 430 | - regression 431 | 432 | The ML task detection is automatic based on target values. If you want to manually force AutoML to use a specific ML task, you need to set the `ml_task` parameter. It can be set to `'binary_classification'`, `'multiclass_classification'`, `'regression'`. 433 | 434 | Example: 435 | ```python 436 | automl = AutoML(ml_task='regression') 437 | automl.fit(X, y) 438 | ``` 439 | In the above example the regression model will be fitted. 440 | 441 | </details> 442 | 443 | <details><summary>How to reuse Optuna hyperparameters?</summary> 444 | 445 | You can reuse Optuna hyperparameters that were found in other AutoML training. You need to pass them in `optuna_init_params` argument. 
All hyperparameters found during Optuna tuning are saved in the `optuna/optuna.json` file (inside the `results_path` directory). 446 | 447 | Example: 448 | 449 | ```python 450 | optuna_init = json.loads(open('previous_AutoML_training/optuna/optuna.json').read()) 451 | 452 | automl = AutoML( 453 | mode='Optuna', 454 | optuna_init_params=optuna_init 455 | ) 456 | automl.fit(X, y) 457 | ``` 458 | 459 | When reusing Optuna hyperparameters, the Optuna tuning is simply skipped. The model will be trained with the hyperparameters set in `optuna_init_params`. Right now there is no option to continue Optuna tuning with seed parameters. 460 | 461 | 462 | </details> 463 | 464 | 465 | <details><summary>How to know the order of classes for binary or multiclass problem when using predict_proba?</summary> 466 | 467 | To get predicted probabilities with information about the class label, please use the `predict_all()` method. It returns the pandas DataFrame with class names in the columns. The order of predicted columns is the same in the `predict_proba()` and `predict_all()` methods. The `predict_all()` method will additionally have the column with the predicted class label. 468 | 469 | </details> 470 | 471 | # Documentation 472 | 473 | For details please check [mljar-supervised docs](https://supervised.mljar.com). 
474 | 475 | # Installation 476 | 477 | From PyPi repository: 478 | 479 | ``` 480 | pip install mljar-supervised 481 | ``` 482 | 483 | To install this package with conda run: 484 | ``` 485 | conda install -c conda-forge mljar-supervised 486 | ``` 487 | 488 | From source code: 489 | 490 | ``` 491 | git clone https://github.com/mljar/mljar-supervised.git 492 | cd mljar-supervised 493 | python setup.py install 494 | ``` 495 | 496 | Installation for development 497 | ``` 498 | git clone https://github.com/mljar/mljar-supervised.git 499 | virtualenv venv --python=python3.6 500 | source venv/bin/activate 501 | pip install -r requirements.txt 502 | pip install -r requirements_dev.txt 503 | ``` 504 | 505 | Running in the docker: 506 | ``` 507 | FROM python:3.7-slim-buster 508 | RUN apt-get update && apt-get -y update 509 | RUN apt-get install -y build-essential python3-pip python3-dev 510 | RUN pip3 -q install pip --upgrade 511 | RUN pip3 install mljar-supervised jupyter 512 | CMD ["jupyter", "notebook", "--port=8888", "--no-browser", "--ip=0.0.0.0", "--allow-root"] 513 | ``` 514 | 515 | Install from GitHub with pip: 516 | ``` 517 | pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master 518 | ``` 519 | # Demo 520 | 521 | In the below demo GIF you will see: 522 | - MLJAR AutoML trained in Jupyter Notebook on the Titanic dataset 523 | - overview of created files 524 | - a showcase of selected plots created during AutoML training 525 | - algorithm comparison report along with their plots 526 | - example of README file and CSV file with results 527 | 528 |  529 | 530 | # Contributing 531 | 532 | To get started take a look at our [Contribution Guide](https://supervised.mljar.com/contributing/) for information about our process and where you can fit in! 
533 | 534 | ### Contributors 535 | <a href="https://github.com/mljar/mljar-supervised/graphs/contributors"> 536 | <img src="https://contributors-img.web.app/image?repo=mljar/mljar-supervised" /> 537 | </a> 538 | 539 | # Cite 540 | 541 | Would you like to cite MLJAR? Great! :) 542 | 543 | You can cite MLJAR as follows: 544 | 545 | ``` 546 | @misc{mljar, 547 | author = {Aleksandra P\l{}o\'{n}ska and Piotr P\l{}o\'{n}ski}, 548 | year = {2021}, 549 | publisher = {MLJAR}, 550 | address = {\L{}apy, Poland}, 551 | title = {MLJAR: State-of-the-art Automated Machine Learning Framework for Tabular Data. Version 0.10.3}, 552 | url = {https://github.com/mljar/mljar-supervised} 553 | } 554 | ``` 555 | 556 | We would love to hear from you about how you have used MLJAR AutoML in your project. 557 | Please feel free to let us know at 558 |  559 | 560 | 561 | # License 562 | 563 | The `mljar-supervised` is provided under the [MIT license](https://github.com/mljar/mljar-supervised/blob/master/LICENSE). 564 | 565 | # Commercial support 566 | 567 | Looking for commercial support? Do you need a new feature implemented? Please contact us by [email](https://mljar.com/contact/) for details. 568 | 569 | # MLJAR 570 | <p align="center"> 571 | <img src="https://github.com/mljar/mljar-examples/blob/master/media/large_logo.png" width="314" /> 572 | </p> 573 | 574 | The `mljar-supervised` is an open-source project created by [MLJAR](https://mljar.com). We care about ease of use in Machine Learning. 575 | The [mljar.com](https://mljar.com) provides a beautiful and simple user interface for building machine learning models. 
576 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /supervised/callbacks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /supervised/fairness/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /supervised/tuner/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /supervised/validation/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/checks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` 
-------------------------------------------------------------------------------- /tests/tests_algorithms/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_callbacks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_ensemble/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_fairness/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_tuner/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_utils/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /tests/tests_validation/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` 
-------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- ``` 1 | [pytest] 2 | addopts = -p no:warnings ``` -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- ``` 1 | pytest 2 | black 3 | pytest-cov 4 | coveralls ``` -------------------------------------------------------------------------------- /supervised/__init__.py: -------------------------------------------------------------------------------- ```python 1 | __version__ = "1.1.18" 2 | 3 | from supervised.automl import AutoML 4 | ``` -------------------------------------------------------------------------------- /tests/checks/run_performance_tests.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | from tests.tests_bin_class.test_performance import * 4 | 5 | if __name__ == "__main__": 6 | unittest.main() 7 | ``` -------------------------------------------------------------------------------- /tests/checks/run_ml_tests.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | from tests.tests_bin_class.run import * 4 | from tests.tests_multi_class.run import * 5 | 6 | if __name__ == "__main__": 7 | unittest.main() 8 | ``` -------------------------------------------------------------------------------- /supervised/utils/constants.py: -------------------------------------------------------------------------------- ```python 1 | # tasks that can be handled by the package 2 | BINARY_CLASSIFICATION = "binary_classification" 3 | MULTICLASS_CLASSIFICATION = "multiclass_classification" 4 | REGRESSION = "regression" 5 | ``` -------------------------------------------------------------------------------- /tests/conftest.py: 
-------------------------------------------------------------------------------- ```python 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def data_folder(request) -> Path: 8 | folder_path = Path(__file__).parent / 'data' 9 | assert folder_path.exists() 10 | request.cls.data_folder = folder_path 11 | return folder_path 12 | ``` -------------------------------------------------------------------------------- /supervised/utils/__init__.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | 3 | from supervised.utils.jsonencoder import MLJSONEncoder 4 | 5 | 6 | def json_loads(data, *args, **kwargs): 7 | return json.loads(data, *args, **kwargs) 8 | 9 | 10 | def json_dumps(data, *args, **kwargs): 11 | return json.dumps(data, cls=MLJSONEncoder, *args, **kwargs) 12 | ``` -------------------------------------------------------------------------------- /supervised/validation/validator_base.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | 6 | class BaseValidator(object): 7 | def __init__(self, params): 8 | self.params = params 9 | 10 | def split(self): 11 | pass 12 | 13 | def get_n_splits(self): 14 | pass 15 | 16 | def get_repeats(self): 17 | return 1 18 | ``` -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- ``` 1 | numpy>=1.19.5,<2 2 | pandas>=2.0.0 3 | scipy>=1.6.1 4 | scikit-learn>=1.5.0 5 | xgboost>=2.0.0 6 | lightgbm>=3.0.0 7 | catboost>=0.24.4 8 | joblib>=1.0.1 9 | tabulate>=0.8.7 10 | matplotlib>=3.2.2 11 | dtreeviz>=2.2.2 12 | shap>=0.42.1 13 | seaborn>=0.11.1 14 | optuna-integration>=3.6.0 15 | mljar-scikit-plot>=0.3.11 16 | markdown 17 | typing-extensions 18 | ipython 19 | ``` 
-------------------------------------------------------------------------------- /examples/scripts/regression.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | df = pd.read_csv("./tests/data/housing_regression_missing_values_missing_target.csv") 6 | x_cols = [c for c in df.columns if c != "MEDV"] 7 | X = df[x_cols] 8 | y = df["MEDV"] 9 | 10 | automl = AutoML() 11 | automl.fit(X, y) 12 | 13 | df["predictions"] = automl.predict(X) 14 | print("Predictions") 15 | print(df[["MEDV", "predictions"]].head()) 16 | ``` -------------------------------------------------------------------------------- /supervised/utils/subsample.py: -------------------------------------------------------------------------------- ```python 1 | from sklearn.model_selection import train_test_split 2 | 3 | from supervised.algorithms.registry import REGRESSION 4 | 5 | 6 | def subsample(X, y, ml_task, train_size): 7 | shuffle = True 8 | stratify = None 9 | 10 | if ml_task != REGRESSION: 11 | stratify = y 12 | 13 | X_train, X_test, y_train, y_test = train_test_split( 14 | X, y, train_size=train_size, shuffle=shuffle, stratify=stratify 15 | ) 16 | 17 | return X_train, X_test, y_train, y_test 18 | ``` -------------------------------------------------------------------------------- /examples/scripts/regression_law_school_fairness.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | df = pd.read_csv("tests/data/LawSchool/bar_pass_prediction.csv") 6 | df["race1"][df["race1"] != "white"] = "non-white" # keep it as binary feature 7 | 8 | X = df[["gender", "lsat", "race1", "pass_bar"]] 9 | y = df["gpa"] 10 | 11 | sensitive_features = df["race1"] 12 | 13 | automl = AutoML( 14 | algorithms=["Xgboost", "LightGBM", "Extra Trees"], 15 | 
train_ensemble=True, 16 | fairness_threshold=0.9, 17 | ) 18 | automl.fit(X, y, sensitive_features=sensitive_features) 19 | ``` -------------------------------------------------------------------------------- /supervised/utils/config.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | LOG_LEVEL = logging.ERROR 4 | 5 | # from guppy import hpy 6 | # from pympler import summary 7 | # from pympler import muppy 8 | import time 9 | 10 | import numpy as np 11 | 12 | 13 | def mem(msg=""): 14 | """Memory usage in MB""" 15 | 16 | time.sleep(5) 17 | 18 | with open("/proc/self/status") as f: 19 | memusage = f.read().split("VmRSS:")[1].split("\n")[0][:-3] 20 | 21 | print(msg, "- memory:", np.round(float(memusage.strip()) / 1024.0), "MB") 22 | 23 | # all_objects = muppy.get_objects() 24 | # sum1 = summary.summarize(all_objects) 25 | # summary.print_(sum1) 26 | ``` -------------------------------------------------------------------------------- /supervised/exceptions.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | from supervised.utils.config import LOG_LEVEL 4 | 5 | logging.basicConfig( 6 | format="%(asctime)s %(name)s %(levelname)s %(message)s", level=logging.ERROR 7 | ) 8 | logger = logging.getLogger(__name__) 9 | logger.setLevel(LOG_LEVEL) 10 | 11 | 12 | class AutoMLException(Exception): 13 | def __init__(self, message): 14 | super(AutoMLException, self).__init__(message) 15 | logger.error(message) 16 | 17 | 18 | class NotTrainedException(Exception): 19 | def __init__(self, message): 20 | super(NotTrainedException, self).__init__(message) 21 | logger.debug(message) 22 | ``` -------------------------------------------------------------------------------- /supervised/tuner/random_parameters.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | 4 | class 
RandomParameters: 5 | 6 | """ 7 | Example params are in JSON format: 8 | { 9 | "booster": ["gbtree", "gblinear"], 10 | "objective": ["binary:logistic"], 11 | "eval_metric": ["auc", "logloss"], 12 | "eta": [0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1] 13 | } 14 | """ 15 | 16 | @staticmethod 17 | def get(params, seed=1): 18 | np.random.seed(seed) 19 | generated_params = {"seed": seed} 20 | for k in params: 21 | generated_params[k] = np.random.permutation(params[k])[0].item() 22 | return generated_params 23 | ``` -------------------------------------------------------------------------------- /supervised/callbacks/max_iters_constraint.py: -------------------------------------------------------------------------------- ```python 1 | from supervised.callbacks.callback import Callback 2 | 3 | 4 | class MaxItersConstraint(Callback): 5 | def __init__(self, params): 6 | super(MaxItersConstraint, self).__init__(params) 7 | self.name = params.get("name", "max_iters_constraint") 8 | self.max_iters = params.get("max_iters", 10) 9 | 10 | def add_and_set_learner(self, learner): 11 | self.learner = learner 12 | 13 | def on_iteration_end(self, logs, predictions): 14 | # iters are computed starting from 0 15 | if logs.get("iter_cnt") + 1 >= self.max_iters: 16 | self.learner.stop_training = True 17 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_registry.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | from supervised.algorithms.registry import AlgorithmsRegistry 4 | 5 | 6 | class AlgorithmsRegistryTest(unittest.TestCase): 7 | def test_add_to_registry(self): 8 | class Model1: 9 | algorithm_short_name = "" 10 | 11 | model1 = { 12 | "task_name": "binary_classification", 13 | "model_class": Model1, 14 | "model_params": {}, 15 | "required_preprocessing": {}, 16 | "additional": {}, 17 | "default_params": {}, 18 | } 19 | 
AlgorithmsRegistry.add(**model1) 20 | 21 | 22 | if __name__ == "__main__": 23 | unittest.main() 24 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_factory.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | from supervised.algorithms.factory import AlgorithmFactory 4 | from supervised.algorithms.xgboost import XgbAlgorithm 5 | 6 | 7 | class AlgorithmFactoryTest(unittest.TestCase): 8 | def test_fit(self): 9 | params = { 10 | "learner_type": "Xgboost", 11 | "objective": "binary:logistic", 12 | "eval_metric": "logloss", 13 | } 14 | learner = AlgorithmFactory.get_algorithm(params) 15 | self.assertEqual( 16 | learner.algorithm_short_name, XgbAlgorithm.algorithm_short_name 17 | ) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | ``` -------------------------------------------------------------------------------- /supervised/utils/utils.py: -------------------------------------------------------------------------------- ```python 1 | import copy 2 | 3 | 4 | class Store: 5 | data = {} 6 | 7 | def set(self, key, value): 8 | Store.data[key] = value 9 | 10 | def get(self, key): 11 | return copy.deepcopy(Store.data[key]) 12 | 13 | 14 | def dump_data(file_path, df): 15 | store = Store() 16 | store.set(file_path, df) 17 | # try: 18 | # df.to_parquet(file_path, index=False) 19 | # except Exception as e: 20 | # df.to_csv(file_path, index=False) 21 | 22 | 23 | def load_data(file_path): 24 | store = Store() 25 | return store.get(file_path) 26 | # try: 27 | # return pd.read_parquet(file_path) 28 | # except Exception as e: 29 | # return pd.read_csv(file_path) 30 | ``` -------------------------------------------------------------------------------- /supervised/callbacks/callback.py: -------------------------------------------------------------------------------- ```python 1 | class Callback(object): 2 | def __init__(self, params): 
3 | self.params = params 4 | self.learners = [] 5 | self.learner = None # current learner 6 | self.name = "callback" 7 | 8 | def add_and_set_learner(self, learner): 9 | self.learners += [learner] 10 | self.learner = learner 11 | 12 | def on_learner_train_start(self, logs): 13 | pass 14 | 15 | def on_learner_train_end(self, logs): 16 | pass 17 | 18 | def on_iteration_start(self, logs): 19 | pass 20 | 21 | def on_iteration_end(self, logs, predictions): 22 | pass 23 | 24 | def on_framework_train_end(self, logs): 25 | pass 26 | ``` -------------------------------------------------------------------------------- /tests/tests_tuner/test_tuner.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | from supervised.tuner.mljar_tuner import MljarTuner 4 | 5 | 6 | class TunerTest(unittest.TestCase): 7 | def test_key_params(self): 8 | params1 = { 9 | "preprocessing": {"p1": 1, "p2": 2}, 10 | "learner": {"p1": 1, "p2": 2}, 11 | "validation_strategy": {}, 12 | } 13 | params2 = { 14 | "preprocessing": {"p1": 1, "p2": 2}, 15 | "learner": {"p2": 2, "p1": 1}, 16 | "validation_strategy": {}, 17 | } 18 | key1 = MljarTuner.get_params_key(params1) 19 | key2 = MljarTuner.get_params_key(params2) 20 | self.assertEqual(key1, key2) 21 | ``` -------------------------------------------------------------------------------- /examples/scripts/multi_class_classifier.py: -------------------------------------------------------------------------------- ```python 1 | import pandas as pd 2 | import numpy as np 3 | from supervised.automl import AutoML 4 | import supervised 5 | 6 | 7 | import warnings 8 | 9 | from sklearn import datasets 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.decomposition import PCA 12 | 13 | from supervised import AutoML 14 | from supervised.exceptions import AutoMLException 15 | 16 | df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv") 17 | X = df[["feature_1", "feature_2", 
"feature_3", "feature_4"]] 18 | y = df["class"] 19 | 20 | automl = AutoML() 21 | 22 | automl.fit(X, y) 23 | 24 | predictions = automl.predict_all(X) 25 | 26 | print(predictions.head()) 27 | print(predictions.tail()) 28 | 29 | print(X.shape) 30 | print(predictions.shape) 31 | ``` -------------------------------------------------------------------------------- /examples/scripts/regression_crime_fairness.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | # data source http://archive.ics.uci.edu/ml/datasets/Communities%20and%20Crime%20Unnormalized 6 | 7 | df = pd.read_csv("tests/data/CrimeData/crimedata.csv", na_values=["?"]) 8 | 9 | X = df[df.columns[5:129]] 10 | y = df["ViolentCrimesPerPop"] 11 | 12 | sensitive_features = (df["racePctWhite"] > 84).astype(str) 13 | 14 | automl = AutoML( 15 | #algorithms=["Decision Tree", "Neural Network", "Xgboost", "Linear", "CatBoost"], 16 | algorithms=["Xgboost", "Linear", "CatBoost"], 17 | train_ensemble=True, 18 | fairness_threshold=0.5, 19 | ) 20 | automl.fit(X, y, sensitive_features=sensitive_features) 21 | ``` -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_Titanic.py: -------------------------------------------------------------------------------- ```python 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.metrics import accuracy_score 4 | from supervised import AutoML 5 | 6 | train = pd.read_csv( 7 | "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv" 8 | ) 9 | print(train.head()) 10 | 11 | X = train[train.columns[2:]] 12 | y = train["Survived"] 13 | 14 | automl = AutoML() # default mode is Explain 15 | 16 | automl.fit(X, y) 17 | 18 | test = pd.read_csv( 19 | "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv" 20 | ) 21 
| predictions = automl.predict(test) 22 | print(predictions) 23 | print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%") 24 | ``` -------------------------------------------------------------------------------- /examples/scripts/regression_housing_fairness.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | df = pd.read_csv("./tests/data/boston_housing.csv") 6 | x_cols = [c for c in df.columns if c != "MEDV"] 7 | 8 | df["large_B"] = (df["B"] > 380) * 1 9 | df["large_B"] = df["large_B"].astype(str) 10 | 11 | 12 | print(df["large_B"].dtype.name) 13 | sensitive_features = df["large_B"] 14 | 15 | X = df[x_cols] 16 | y = df["MEDV"] 17 | 18 | automl = AutoML( 19 | algorithms=["Xgboost", "LightGBM"], 20 | train_ensemble=True, 21 | fairness_threshold=0.9, 22 | ) 23 | automl.fit(X, y, sensitive_features=sensitive_features) 24 | 25 | df["predictions"] = automl.predict(X) 26 | print("Predictions") 27 | print(df[["MEDV", "predictions"]].head()) 28 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_encoding_selector.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import pandas as pd 4 | 5 | from supervised.preprocessing.encoding_selector import EncodingSelector 6 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical 7 | 8 | 9 | class CategoricalIntegersTest(unittest.TestCase): 10 | def test_selector(self): 11 | d = {"col1": [f"{i}" for i in range(31)], "col2": ["a"] * 31} 12 | df = pd.DataFrame(data=d) 13 | 14 | self.assertEqual( 15 | EncodingSelector.get(df, None, "col1"), 16 | PreprocessingCategorical.MANY_CATEGORIES, 17 | ) 18 | self.assertEqual( 19 | EncodingSelector.get(df, None, "col2"), 20 | PreprocessingCategorical.FEW_CATEGORIES, 21 
| ) 22 | ``` -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_marketing.py: -------------------------------------------------------------------------------- ```python 1 | import pandas as pd 2 | from supervised.automl import AutoML 3 | import os 4 | 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | df = pd.read_csv("tests/data/PortugeseBankMarketing/Data_FinalProject.csv") 9 | 10 | X = df[df.columns[:-1]] 11 | y = df["y"] 12 | 13 | 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25) 15 | 16 | 17 | automl = AutoML( 18 | # results_path="AutoML_22", 19 | total_time_limit=30 * 60, 20 | start_random_models=10, 21 | hill_climbing_steps=3, 22 | top_models_to_improve=3, 23 | train_ensemble=True, 24 | ) 25 | 26 | automl.fit(X_train, y_train) 27 | 28 | 29 | pred = automl.predict(X_test) 30 | print("Test accuracy", accuracy_score(y_test, pred)) 31 | ``` -------------------------------------------------------------------------------- /examples/scripts/regression_acs_fairness.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | 5 | # to get data 6 | # from fairlearn.datasets import fetch_acs_income 7 | # df = fetch_acs_income(as_frame=True) 8 | # df["frame"].to_csv("acs_income.csv", index=False) 9 | 10 | df = pd.read_csv("tests/data/acs_income_1k.csv") 11 | 12 | print(df) 13 | 14 | x_cols = [c for c in df.columns if c != "PINCP"] 15 | 16 | sensitive_features = df["SEX"].astype(str) 17 | 18 | X = df[x_cols] 19 | y = df["PINCP"] 20 | 21 | automl = AutoML( 22 | algorithms=["Xgboost", "LightGBM"], 23 | train_ensemble=True, 24 | fairness_threshold=0.91, 25 | # underprivileged_groups=[{"SEX": "1.0"}], 26 | # privileged_groups=[{"SEX": "2.0"}] 27 | ) 28 | automl.fit(X, y, 
sensitive_features=sensitive_features) 29 | ``` -------------------------------------------------------------------------------- /examples/scripts/multi_class_classifier_digits.py: -------------------------------------------------------------------------------- ```python 1 | import pandas as pd 2 | 3 | # scikit learn utilites 4 | from sklearn.datasets import load_digits 5 | from sklearn.metrics import accuracy_score 6 | from sklearn.model_selection import train_test_split 7 | 8 | # mljar-supervised package 9 | from supervised.automl import AutoML 10 | 11 | # Load the data 12 | digits = load_digits() 13 | X_train, X_test, y_train, y_test = train_test_split( 14 | pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25 15 | ) 16 | 17 | # train models 18 | automl = AutoML(mode="Perform") 19 | automl.fit(X_train, y_train) 20 | 21 | # compute the accuracy on test data 22 | predictions = automl.predict(X_test) 23 | print(predictions.head()) 24 | print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int))) 25 | ``` -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_random.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from supervised.automl import AutoML 4 | from sklearn.metrics import accuracy_score 5 | import os 6 | 7 | nrows = 100 8 | ncols = 3 9 | X = np.random.rand(nrows, ncols) 10 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(ncols)]) 11 | y = np.random.randint(0, 2, nrows) 12 | # y = np.random.permutation(["a", "B"] * 50) 13 | 14 | automl = AutoML(model_time_limit=10) # , algorithms=["Decision Tree"]) 15 | automl.fit(X, y) 16 | print("Train accuracy", accuracy_score(y, automl.predict_all(X)["label"])) 17 | 18 | # X = np.random.rand(1000, 10) 19 | # X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)]) 20 | # y = np.random.randint(0, 2, 1000) 21 | 
# print("Test accuracy", accuracy_score(y, automl.predict(X)["label"])) 22 | ``` -------------------------------------------------------------------------------- /supervised/fairness/utils.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | 4 | def accuracy(t, y): 5 | return np.round(np.sum(t == y) / t.shape[0], 4) 6 | 7 | 8 | def selection_rate(y): 9 | return np.round( 10 | np.sum((y == 1)) / y.shape[0], 11 | 4, 12 | ) 13 | 14 | 15 | def true_positive_rate(t, y): 16 | return np.round( 17 | np.sum((y == 1) & (t == 1)) / np.sum((t == 1)), 18 | 4, 19 | ) 20 | 21 | 22 | def false_positive_rate(t, y): 23 | return np.round( 24 | np.sum((y == 1) & (t == 0)) / np.sum((t == 0)), 25 | 4, 26 | ) 27 | 28 | 29 | def true_negative_rate(t, y): 30 | return np.round( 31 | np.sum((y == 0) & (t == 0)) / np.sum((t == 0)), 32 | 4, 33 | ) 34 | 35 | 36 | def false_negative_rate(t, y): 37 | return np.round( 38 | np.sum((y == 0) & (t == 1)) / np.sum((t == 1)), 39 | 4, 40 | ) 41 | ``` -------------------------------------------------------------------------------- /tests/tests_utils/test_learning_curves.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import unittest 3 | 4 | from supervised.utils.learning_curves import LearningCurves 5 | 6 | 7 | class LearningCurvesTest(unittest.TestCase): 8 | def test_plot_close(self): 9 | """ 10 | Test if we close plots. To avoid following warning: 11 | RuntimeWarning: More than 20 figures have been opened. 12 | Figures created through the pyplot interface (`matplotlib.pyplot.figure`) 13 | are retained until explicitly closed and may consume too much memory. 
class AutoMLUpdateErrorsReportTest(unittest.TestCase):
    """Checks that ``AutoML._update_errors_report`` writes to ``errors.md``."""

    automl_dir = "automl_testing"

    def tearDown(self):
        # Remove whatever the test wrote into the results directory.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_custom_init(self):
        """The reported message must appear in ``<results_path>/errors.md``."""
        automl = AutoML(results_path=self.automl_dir)
        automl._update_errors_report("model_1", "bad error")

        errors_filename = os.path.join(self.automl_dir, "errors.md")
        self.assertTrue(os.path.exists(errors_filename))
        with open(errors_filename) as file:
            # assertIn prints the file content on failure, unlike assertTrue.
            self.assertIn("bad error", file.read())
class SubsampleTest(unittest.TestCase):
    """Tests for ``supervised.utils.subsample.subsample``."""

    def test_subsample_regression_10k(self):
        """Subsampling 10k rows with train_size=1000 leaves 9000 rows for test."""
        rows = 10000
        cols = 51
        X = np.random.rand(rows, cols)
        X = pd.DataFrame(X, columns=[f"f{i}" for i in range(cols)])
        y = pd.Series(np.random.rand(rows), name="target")

        X_train, X_test, y_train, y_test = subsample(
            X, y, train_size=1000, ml_task=REGRESSION
        )

        # assertEqual, not assertTrue: assertTrue(a, b) treats b as the failure
        # message and passes for any non-zero a, so sizes were never checked.
        self.assertEqual(X_train.shape[0], 1000)
        self.assertEqual(X_test.shape[0], 9000)
        self.assertEqual(y_train.shape[0], 1000)
        self.assertEqual(y_test.shape[0], 9000)
class MLJSONEncoder(json.JSONEncoder):
    """JSON encoder that also serializes NumPy values and ``datetime.date``.

    Conversions applied by :meth:`default`:

    * NumPy integer scalars -> ``int``
    * NumPy floating scalars -> ``float``
    * ``numpy.ndarray`` -> nested lists (``ndarray.tolist()``)
    * ``datetime.date`` (and subclasses) -> ``"YYYY-MM-DD"`` string
    """

    def default(self, o):
        """Return a JSON-serializable version of ``o``.

        Raises:
            TypeError: from ``json.JSONEncoder.default`` for unsupported types.
        """
        # The abstract scalar base classes cover every sized variant
        # (int8..int64, uint8..uint64, float16..float64) and do not depend on
        # aliases such as ``np.float_`` that were removed in NumPy 2.0.
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        if isinstance(o, date):
            return o.strftime("%Y-%m-%d")

        return super(MLJSONEncoder, self).default(o)
class EncodingSelector:

    """
    EncodingSelector object decides which method should be used for categorical encoding.

    Please keep it fast and simple. Thank you.
    """

    @staticmethod
    def get(X, y, column):
        """Pick the categorical encoding for ``column`` from its cardinality.

        Counts the distinct non-null values of ``X[column]`` and returns
        ``PreprocessingCategorical.FEW_CATEGORIES`` when there are at most 20,
        otherwise ``PreprocessingCategorical.MANY_CATEGORIES``. ``y`` is not
        used.
        """
        try:
            unique_cnt = len(np.unique(X.loc[~pd.isnull(X[column]), column]))
        except Exception:
            # Deliberate best effort: if the values cannot be counted (e.g.
            # they cannot be compared by np.unique) fall back to the encoding
            # for many categories instead of failing preprocessing.
            return PreprocessingCategorical.MANY_CATEGORIES

        if unique_cnt <= 20:
            return PreprocessingCategorical.FEW_CATEGORIES
        return PreprocessingCategorical.MANY_CATEGORIES
test_get_sample_data_larger_1k(self): 11 | """Get sample when data is larger than 1k""" 12 | X = pd.DataFrame(np.random.uniform(size=(5763, 31))) 13 | y = pd.Series(np.random.randint(0, 2, size=(5763,))) 14 | 15 | X_, y_ = PlotSHAP.get_sample(X, y) 16 | 17 | self.assertEqual(X_.shape[0], 1000) 18 | self.assertEqual(y_.shape[0], 1000) 19 | 20 | def test_get_sample_data_smaller_1k(self): 21 | """Get sample when data is smaller than 1k""" 22 | SAMPLES = 100 23 | X = pd.DataFrame(np.random.uniform(size=(SAMPLES, 31))) 24 | y = pd.Series(np.random.randint(0, 2, size=(SAMPLES,))) 25 | 26 | X_, y_ = PlotSHAP.get_sample(X, y) 27 | 28 | self.assertEqual(X_.shape[0], SAMPLES) 29 | self.assertEqual(y_.shape[0], SAMPLES) 30 | ``` -------------------------------------------------------------------------------- /.github/workflows/test-installation-with-conda.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Test installation with conda 2 | 3 | on: 4 | schedule: 5 | - cron: '0 8 * * 1' 6 | # run workflow manually 7 | workflow_dispatch: 8 | 9 | jobs: 10 | build: 11 | name: Run (${{ matrix.python-version }}, ${{ matrix.os }}) 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: [windows-latest] 17 | python-version: ['3.9'] 18 | 19 | steps: 20 | - uses: conda-incubator/setup-miniconda@v2 21 | with: 22 | activate-environment: test 23 | auto-update-conda: false 24 | python-version: ${{ matrix.python-version }} 25 | - name: Activate conda and check versions 26 | run: | 27 | conda activate test 28 | conda --version 29 | python --version 30 | - name: Install MLJAR AutoML 31 | run: conda install -c conda-forge mljar-supervised 32 | - name: Try to import 33 | run: python -c "import supervised;print(supervised.__version__)" 34 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/factory.py: 
class TerminateOnNan(Callback):
    """Callback that stops training when a loss value is NaN or infinite."""

    def __init__(self, learner, params):
        """Create the callback and the metric named by ``params["metric_name"]``."""
        super(TerminateOnNan, self).__init__(learner, params)
        # `Metric` is not imported at module level in this file, so the
        # original constructor raised NameError; import it where it is used.
        from supervised.utils.metric import Metric

        self.metric = Metric(params.get("metric_name"))

    def on_iteration_end(self, iter_cnt, data):
        """Compute train/validation loss and flag the learner to stop if invalid.

        ``data`` is read with the keys ``y_train_true``, ``y_train_predicted``,
        ``y_validation_true`` and ``y_validation_predicted``. The train loss is
        treated as 0 when no train predictions are present.
        """
        loss_train = 0
        if data.get("y_train_predicted") is not None:
            loss_train = self.metric(
                data.get("y_train_true"), data.get("y_train_predicted")
            )
        loss_validation = self.metric(
            data.get("y_validation_true"), data.get("y_validation_predicted")
        )

        for loss in (loss_train, loss_validation):
            # np.isinf is already true for both +inf and -inf.
            if np.isnan(loss) or np.isinf(loss):
                self.learner.stop_training = True
                log.info("Terminating learning, invalid loss value")
                break  # one invalid loss is enough; avoid a duplicate log line
test_custom_init(self): 17 | X = np.random.uniform(size=(60, 2)) 18 | y = np.random.randint(0, 2, size=(60,)) 19 | 20 | automl = AutoML( 21 | results_path=self.automl_dir, 22 | model_time_limit=10, 23 | algorithms=["Xgboost"], 24 | mode="Compete", 25 | explain_level=0, 26 | start_random_models=1, 27 | hill_climbing_steps=0, 28 | top_models_to_improve=0, 29 | kmeans_features=False, 30 | golden_features=False, 31 | features_selection=False, 32 | boost_on_errors=False, 33 | ) 34 | automl.fit(X, y) 35 | 36 | self.assertFalse( 37 | os.path.exists(os.path.join(self.automl_dir, "1_DecisionTree")) 38 | ) 39 | ``` -------------------------------------------------------------------------------- /examples/scripts/multi_class_drug_fairness.py: -------------------------------------------------------------------------------- ```python 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from supervised import AutoML 5 | 6 | 7 | df = pd.read_csv("tests/data/Drug/Drug_Consumption.csv") 8 | 9 | 10 | X = df[df.columns[1:13]] 11 | 12 | # convert to 3 classes 13 | df = df.replace( 14 | { 15 | "Cannabis": { 16 | "CL0": "never_used", 17 | "CL1": "not_in_last_year", 18 | "CL2": "not_in_last_year", 19 | "CL3": "used_in_last_year", 20 | "CL4": "used_in_last_year", 21 | "CL5": "used_in_last_year", 22 | "CL6": "used_in_last_year", 23 | } 24 | } 25 | ) 26 | 27 | y = df["Cannabis"] 28 | 29 | # maybe should be 30 | # The binary sensitive feature is education level (college degree or not). 
class DateTimeTransformerTest(unittest.TestCase):
    """Fit/transform and JSON round-trip check for ``DateTimeTransformer``."""

    def test_transformer(self):
        dates = [
            "2020/06/01",
            "2020/06/02",
            "2020/06/03",
            "2021/06/01",
            "2022/06/01",
        ]
        df = pd.DataFrame(data={"col1": dates})
        df["col1"] = pd.to_datetime(df["col1"])
        df_org = df.copy()

        # Fit on the datetime column and transform the same frame.
        fitted = DateTimeTransformer()
        fitted.fit(df, "col1")
        df = fitted.transform(df)

        self.assertEqual(df.shape[0], 5)
        self.assertNotIn("col1", df.columns)
        self.assertIn("col1_Year", df.columns)

        # A transformer restored from JSON must transform the untouched copy
        # the same way.
        restored = DateTimeTransformer()
        restored.from_json(fitted.to_json())
        df2 = restored.transform(df_org)
        self.assertNotIn("col1", df2.columns)
        self.assertIn("col1_Year", df2.columns)
class PermutationImportanceTest(unittest.TestCase):
    """Smoke test for ``PermutationImportance.compute_and_plot``."""

    def test_compute_and_plot(self):
        n_rows = 20
        feature_names = [f"f{i}" for i in range(3)]
        X = pd.DataFrame(np.random.rand(n_rows, 3), columns=feature_names)
        y = np.random.randint(0, 2, n_rows)

        model = DecisionTreeClassifier(max_depth=1)
        model.fit(X, y)

        with tempfile.TemporaryDirectory() as tmpdir:
            PermutationImportance.compute_and_plot(
                model,
                X_validation=X,
                y_validation=y,
                model_file_path=tmpdir,
                learner_name="learner_test",
                metric_name=None,
                ml_task="binary_classification",
            )
            # The importance table is expected next to the model files.
            importance_csv = os.path.join(tmpdir, "learner_test_importance.csv")
            self.assertTrue(os.path.exists(importance_csv))
class CallbackList(object):
    """Container that forwards every training event to each callback in order."""

    def __init__(self, callbacks=None):
        """Store ``callbacks``; a fresh empty list is used when none are given.

        The default is ``None`` rather than ``[]`` so that instances created
        without arguments do not share (and mutate) one list object.
        A list passed by the caller is stored as-is, not copied.
        """
        self.callbacks = [] if callbacks is None else callbacks

    def add_and_set_learner(self, learner):
        """Forward ``learner`` to every callback."""
        for cb in self.callbacks:
            cb.add_and_set_learner(learner)

    def on_learner_train_start(self, logs=None):
        """Notify every callback that a learner starts training."""
        for cb in self.callbacks:
            cb.on_learner_train_start(logs)

    def on_learner_train_end(self, logs=None):
        """Notify every callback that a learner finished training."""
        for cb in self.callbacks:
            cb.on_learner_train_end(logs)

    def on_iteration_start(self, logs=None):
        """Notify every callback that an iteration starts."""
        for cb in self.callbacks:
            cb.on_iteration_start(logs)

    def on_iteration_end(self, logs=None, predictions=None):
        """Notify every callback that an iteration ended, with its predictions."""
        for cb in self.callbacks:
            cb.on_iteration_end(logs, predictions)

    def on_framework_train_end(self, logs=None):
        """Notify every callback that the whole framework finished training."""
        for cb in self.callbacks:
            cb.on_framework_train_end(logs)

    def get(self, callback_name):
        """Return the first callback whose ``name`` equals ``callback_name``, else None."""
        for cb in self.callbacks:
            if cb.name == callback_name:
                return cb
        return None
def get_learners_names(model_path):
    """Return learner names found in ``model_path``.

    Every directory entry whose name contains ``"_training.log"`` is reported
    with that postfix removed. Order follows ``os.listdir`` and is therefore
    not guaranteed.
    """
    postfix = "_training.log"
    # `str.replace` (the original spelled it `repleace`, which raised
    # AttributeError as soon as one training log existed).
    learner_names = [
        f.replace(postfix, "") for f in os.listdir(model_path) if postfix in f
    ]
    return learner_names
class ValidationStep:
    """Thin facade that builds the validator chosen in ``params`` and delegates to it."""

    def __init__(self, params):
        # kfold is default validation technique
        self.validation_type = params.get("validation_type", "kfold")

        known_validators = (
            ("kfold", KFoldValidator),
            ("split", SplitValidator),
            ("custom", CustomValidator),
        )
        for type_name, validator_cls in known_validators:
            if self.validation_type == type_name:
                self.validator = validator_cls(params)
                break
        else:
            # No entry matched the requested type.
            raise AutoMLException(
                f"The validation type ({self.validation_type}) is not implemented."
            )

    def get_split(self, k, repeat=0):
        """Delegate to the validator's ``get_split``."""
        return self.validator.get_split(k, repeat)

    def split(self):
        """Delegate to the validator's ``split``."""
        return self.validator.split()

    def get_n_splits(self):
        """Delegate to the validator's ``get_n_splits``."""
        return self.validator.get_n_splits()

    def get_repeats(self):
        """Delegate to the validator's ``get_repeats``."""
        return self.validator.get_repeats()
class AutoMLWithRegressionTest(unittest.TestCase):
    """Manual smoke check: fit AutoML on the housing CSV and predict."""

    def test_fit_and_predict(self):
        """Fit on 70% of the housing data and print predictions for the rest.

        The check only verifies that fit/predict run; nothing is asserted
        about the predicted values.
        """
        seed = 1709

        df = pd.read_csv(
            "./tests/data/housing_regression_missing_values_missing_target.csv"
        )
        print(df.columns)
        # "MEDV" is the target column; every other column is a feature.
        x_cols = [c for c in df.columns if c != "MEDV"]
        X = df[x_cols]
        y = df["MEDV"]

        X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
            X, y, test_size=0.3, random_state=seed
        )
        automl = AutoML(
            total_time_limit=10,
            algorithms=["Xgboost"],  # other names tried here: "LightGBM", "RF", "NN", "CatBoost"
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            train_ensemble=True,
            verbose=True,
        )
        automl.fit(X_train, y_train)

        response = automl.predict(X_test)  # predictions for the held-out rows
        print("Response", response)

        # NOTE(review): no metric is computed or asserted on `response`;
        # y_test is currently unused.
name: Tests

on: [ push,pull_request ]

jobs:
  build:

    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ ubuntu-latest ]
        python-version: [ '3.10']
        #os: [ ubuntu-latest, macos-latest, windows-latest ]
        #python-version: [ '3.8', '3.9', '3.10', '3.11' ]

    steps:
      # graphviz is installed with the package manager of each OS
      - name: Install OS Dependencies
        if: matrix.os == 'ubuntu-latest'
        run: |
          sudo apt-get update
          sudo apt-get -y install graphviz

      - name: Install OS Dependencies
        if: matrix.os == 'macos-latest'
        run: |
          brew install graphviz

      - name: Install OS Dependencies
        if: matrix.os == 'windows-latest'
        run: |
          choco install graphviz
      # Same action versions as the installation-test workflows
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Python Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --upgrade setuptools
          # Quoted: an unquoted ">=1.7.0" is a shell redirection to a file
          # named "=1.7.0", so the version constraint was silently dropped.
          pip install -U "importlib-metadata>=1.7.0"
          pip install -U -r requirements.txt
          pip install -U -r requirements_dev.txt
          pip install ipython
          python setup.py install
      - name: Test with pytest
        run: |
          pytest tests --cov=supervised/
        # NOTE: with continue-on-error the job stays green even when tests fail.
        continue-on-error: true
22 | ) 23 | 24 | 25 | def check_integer(value, original_var_name): 26 | if not isinstance(value, int): 27 | raise ValueError( 28 | f"'{original_var_name}' must be an integer, got '{type(value)}'." 29 | ) 30 | 31 | 32 | def check_bool(value, original_var_name): 33 | if not isinstance(value, bool): 34 | raise ValueError( 35 | f"'{original_var_name}' must be a boolean, got '{type(value)}'." 36 | ) 37 | 38 | 39 | def check_greater_than_zero_integer_or_float(value, original_var_name): 40 | if not (isinstance(value, int) or isinstance(value, float)): 41 | raise ValueError( 42 | f"'{original_var_name}' must be an integer or float, got '{type(value)}'." 43 | ) 44 | 45 | if value <= 0: 46 | raise ValueError( 47 | f"'{original_var_name}' must be greater than zero, got '{value}'." 48 | ) 49 | ``` -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- ```python 1 | from setuptools import setup, find_packages 2 | from codecs import open 3 | from os import path 4 | 5 | here = path.abspath(path.dirname(__file__)) 6 | 7 | # Get the long description from the README file 8 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 9 | long_description = f.read() 10 | 11 | setup( 12 | name="mljar-supervised", 13 | version="1.1.18", 14 | description="Automated Machine Learning for Humans", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/mljar/mljar-supervised", 18 | author="MLJAR, Sp. 
class ExcludeRowsMissingTarget(object):
    """Remove the rows whose target value is missing from all aligned inputs."""

    @staticmethod
    def _take_rows(data, positions):
        """Return ``data`` limited to row ``positions`` with a fresh 0..n-1 index."""
        return data.iloc[positions].reset_index(drop=True)

    @staticmethod
    def transform(
        X=None, y=None, sample_weight=None, sensitive_features=None, warn=False
    ):
        """Drop rows with a missing target from ``X``, ``y`` and companions.

        Rows are selected by position, not by index label, so the inputs stay
        aligned with each other even when an index contains duplicated labels
        or when the inputs carry different indexes.

        Args:
            X: Feature table (pandas object) or ``None``.
            y: Target values (pandas object) or ``None``.
            sample_weight: Per-row weights (pandas object) or ``None``.
            sensitive_features: Per-row sensitive features (pandas object)
                or ``None``.
            warn: When true, emit a ``UserWarning`` if any row is excluded.

        Returns:
            Tuple ``(X, y, sample_weight, sensitive_features)``. The inputs are
            returned untouched when ``y`` is ``None`` or has no missing values;
            otherwise every non-``None`` item is a filtered copy whose index is
            reset to ``0..n-1``.
        """
        if y is None:
            return X, y, sample_weight, sensitive_features

        missing = np.asarray(pd.isnull(y))
        if not missing.any():
            return X, y, sample_weight, sensitive_features

        # Same logger object as the module-level one (loggers are cached by name).
        logging.getLogger(__name__).debug("Exclude rows with missing target values")
        if warn:
            warnings.warn(
                "There are samples with missing target values in the data which will be excluded for further analysis",
                UserWarning,
            )

        # Positional row numbers to keep. Label based ``drop(index[mask])``
        # removed *every* row sharing a label with a missing-target row.
        keep = np.flatnonzero(~missing)

        y = ExcludeRowsMissingTarget._take_rows(y, keep)

        if X is not None:
            X = ExcludeRowsMissingTarget._take_rows(X, keep)

        if sample_weight is not None:
            sample_weight = ExcludeRowsMissingTarget._take_rows(sample_weight, keep)

        if sensitive_features is not None:
            sensitive_features = ExcludeRowsMissingTarget._take_rows(
                sensitive_features, keep
            )

        return X, y, sample_weight, sensitive_features
a.predict_all(X_test) 48 | 49 | a2 = AutoML(results_path=self.automl_dir) 50 | p2 = a2.predict_all(X_test) 51 | 52 | assert_almost_equal(p["prediction_0"].iloc[0], p2["prediction_0"].iloc[0]) 53 | assert_almost_equal(p["prediction_7"].iloc[0], p2["prediction_7"].iloc[0]) 54 | ``` -------------------------------------------------------------------------------- /examples/scripts/binary_classifier_ensemble.py: -------------------------------------------------------------------------------- ```python 1 | import pandas as pd 2 | from supervised.automl import AutoML 3 | from supervised.ensemble import Ensemble 4 | import os 5 | 6 | df = pd.read_csv( 7 | "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv", 8 | skipinitialspace=True, 9 | ) 10 | 11 | X = df[df.columns[:-1]] 12 | y = df["income"] 13 | 14 | results_path = "AutoML_2" 15 | automl = AutoML( 16 | results_path=results_path, 17 | total_time_limit=400, 18 | start_random_models=10, 19 | hill_climbing_steps=0, 20 | top_models_to_improve=0, 21 | train_ensemble=False, 22 | ) 23 | 24 | 25 | models_map = {m.get_name(): m for m in automl._models} 26 | 27 | ensemble = Ensemble("logloss", "binary_classification") 28 | ensemble.models_map = models_map 29 | 30 | oofs = {} 31 | target = None 32 | for i in range(1, 30): 33 | oof = pd.read_csv( 34 | os.path.join(results_path, f"model_{i}", "predictions_out_of_folds.csv") 35 | ) 36 | prediction_cols = [c for c in oof.columns if "prediction" in c] 37 | oofs[f"model_{i}"] = oof[prediction_cols] 38 | if target is None: 39 | target_columns = [c for c in oof.columns if "target" in c] 40 | target = oof[target_columns] 41 | 42 | ensemble.target = target 43 | ensemble.target_columns = "target" 44 | ensemble.fit(oofs, target) 45 | ensemble.save(os.path.join(results_path, "ensemble")) 46 | 47 | 48 | predictions = ensemble.predict(X) 49 | print(predictions.head()) 50 | 51 | """ 52 | p_<=50K p_>50K 53 | 0 0.982940 0.017060 54 | 1 0.722781 0.277219 55 | 2 
class LearnerTimeConstraint(Callback):
    """Callback that stops a learner once its training time limit is reached.

    Recognised ``params`` keys (all optional):
        name: Callback name, defaults to ``"learner_time_constraint"``.
        min_steps: Number of iterations that must finish before the time
            limit is checked.
        learner_time_limit: Training time budget of one learner, in seconds.
    """

    def __init__(self, params=None):
        # A fresh dict per call; the previous ``params={}`` default was one
        # dict object shared by every instance created without arguments.
        params = {} if params is None else params
        super(LearnerTimeConstraint, self).__init__(params)
        self.name = params.get("name", "learner_time_constraint")
        self.min_steps = params.get("min_steps")
        self.learner_time_limit = params.get("learner_time_limit")  # in seconds
        self.iterations_count = 0

    def on_learner_train_start(self, logs):
        """Remember when the learner started and reset the iteration counter."""
        self.train_start_time = time.time()
        self.iterations_count = 0

    def on_iteration_start(self, logs):
        """Remember when the current iteration started."""
        self.iter_start_time = time.time()

    def on_iteration_end(self, logs, predictions):
        """Count the iteration and request a stop when the time limit is hit.

        Sets ``self.learner.stop_training = True`` when the elapsed learner
        time (rounded to 2 decimals) reaches ``learner_time_limit``. Nothing is
        checked while fewer than ``min_steps`` iterations have finished.
        """
        self.iterations_count += 1
        iteration_elapsed_time = np.round(time.time() - self.iter_start_time, 2)
        learner_elapsed_time = np.round(time.time() - self.train_start_time, 2)
        log.debug(
            "Iteration {0} took {1} seconds, learner training time {2} seconds".format(
                self.iterations_count, iteration_elapsed_time, learner_elapsed_time
            )
        )

        if self.min_steps is not None and self.iterations_count < self.min_steps:
            # Not enough iterations yet: return before checking other conditions.
            return

        if (
            self.learner_time_limit is not None
            and learner_elapsed_time >= self.learner_time_limit
        ):
            self.learner.stop_training = True
            log.info("Terminating learning, time limit reached")
class LabelEncoder(object):
    """Thin wrapper around scikit-learn's ``LabelEncoder`` with JSON support.

    Args:
        try_to_fit_numeric: When true, ``fit`` tries to order the learned
            classes by their numeric value instead of the order produced by
            the underlying encoder.
    """

    def __init__(self, try_to_fit_numeric=False):
        self.lbl = sk_preproc.LabelEncoder()
        self._try_to_fit_numeric = try_to_fit_numeric

    def fit(self, x):
        """Learn the classes present in ``x``.

        With ``try_to_fit_numeric`` the classes are re-ordered by numeric
        value. The re-ordering is skipped (classes left as fitted) when a
        label cannot be converted to ``Decimal`` or when two different labels
        are numerically equal.
        """
        self.lbl.fit(x)  # list(x.values))
        if not self._try_to_fit_numeric:
            return

        logger.debug("Try to fit numeric in LabelEncoder")
        try:
            by_value = {Decimal(c): c for c in self.lbl.classes_}
            if len(by_value) != len(self.lbl.classes_):
                # Labels such as "1" and "1.0" are equal as Decimal and would
                # collapse into one dict key, silently dropping a class.
                logger.debug(
                    "Numerically equal labels found, keep original classes order"
                )
                return
            self.lbl.classes_ = np.array(
                [by_value[key] for key in sorted(by_value)],
                dtype=self.lbl.classes_.dtype,
            )
        except Exception as e:
            # Best effort only: non numeric labels keep the fitted order.
            logger.debug("Cannot order classes numerically: %s", e)

    def transform(self, x):
        """Encode ``x``; labels unseen during ``fit`` are appended as new classes."""
        try:
            return self.lbl.transform(x)  # list(x.values))
        except ValueError:
            # rescue: extend the known classes with the unseen labels and retry
            classes = np.unique(x)  # list(x.values))
            diff = np.setdiff1d(classes, self.lbl.classes_)
            self.lbl.classes_ = np.concatenate((self.lbl.classes_, diff))
            return self.lbl.transform(x)  # list(x.values))

    def inverse_transform(self, x):
        """Map encoded values back to the original labels."""
        return self.lbl.inverse_transform(x)  # (list(x.values))

    def to_json(self):
        """Return a dict mapping ``str(label)`` to its integer code."""
        return {str(cl): i for i, cl in enumerate(self.lbl.classes_)}

    def from_json(self, data_json):
        """Restore the classes from a dict produced by ``to_json``.

        Keys are taken in the dict's iteration order. A dict holding exactly
        the keys ``"False"`` and ``"True"`` is restored as boolean classes.
        """
        keys = np.array(list(data_json.keys()))
        if len(keys) == 2 and "False" in keys and "True" in keys:
            keys = np.array([False, True])
        self.lbl.classes_ = keys
self.assertEqual(sw[0], 2) 55 | self.assertEqual(sw[1], 4) 56 | ``` -------------------------------------------------------------------------------- /tests/tests_fairness/test_multi_class_classification.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | 9 | 10 | class FairnessInMultiClassClassificationTest(unittest.TestCase): 11 | automl_dir = "automl_fairness_testing" 12 | 13 | def tearDown(self): 14 | shutil.rmtree(self.automl_dir, ignore_errors=True) 15 | 16 | def test_init(self): 17 | X = np.random.uniform(size=(30, 2)) 18 | y = np.array(["A", "B", "C"] * 10) 19 | S = pd.DataFrame({"sensitive": ["D", "E"] * 15}) 20 | 21 | automl = AutoML( 22 | results_path=self.automl_dir, 23 | model_time_limit=10, 24 | algorithms=["Xgboost"], 25 | explain_level=0, 26 | train_ensemble=False, 27 | stack_models=False, 28 | validation_strategy={"validation_type": "split"}, 29 | start_random_models=1, 30 | ) 31 | 32 | automl.fit(X, y, sensitive_features=S) 33 | 34 | self.assertGreater(len(automl._models), 0) 35 | 36 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 37 | self.assertEqual(len(sensitive_features_names), 3) 38 | 39 | self.assertTrue("sensitive__A" in sensitive_features_names) 40 | self.assertTrue("sensitive__B" in sensitive_features_names) 41 | self.assertTrue("sensitive__C" in sensitive_features_names) 42 | 43 | self.assertTrue( 44 | automl._models[0].get_fairness_metric("sensitive__A") is not None 45 | ) 46 | self.assertTrue( 47 | automl._models[0].get_fairness_metric("sensitive__B") is not None 48 | ) 49 | self.assertTrue( 50 | automl._models[0].get_fairness_metric("sensitive__C") is not None 51 | ) 52 | 53 | self.assertTrue(len(automl._models[0].get_fairness_optimization()) > 1) 54 | self.assertTrue(automl._models[0].get_worst_fairness() is not None) 55 | 
self.assertTrue(automl._models[0].get_best_fairness() is not None) 56 | ``` -------------------------------------------------------------------------------- /supervised/callbacks/metric_logger.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | log = logging.getLogger(__name__) 4 | 5 | from supervised.callbacks.callback import Callback 6 | from supervised.utils.metric import Metric 7 | 8 | 9 | class MetricLogger(Callback): 10 | def __init__(self, params): 11 | super(MetricLogger, self).__init__(params) 12 | self.name = params.get("name", "metric_logger") 13 | self.loss_values = {} 14 | self.metrics = [] 15 | for metric_name in params.get("metric_names"): 16 | self.metrics += [Metric({"name": metric_name})] 17 | 18 | def add_and_set_learner(self, learner): 19 | self.loss_values[learner.uid] = {"train": {}, "validation": {}, "iters": []} 20 | for metric in self.metrics: 21 | self.loss_values[learner.uid]["train"][metric.name] = [] 22 | self.loss_values[learner.uid]["validation"][metric.name] = [] 23 | 24 | self.current_learner_uid = learner.uid 25 | 26 | def on_iteration_end(self, logs, predictions): 27 | for metric in self.metrics: 28 | train_loss = 0 29 | if predictions.get("y_train_predicted") is not None: 30 | train_loss = metric( 31 | predictions.get("y_train_true"), 32 | predictions.get("y_train_predicted"), 33 | ) 34 | validation_loss = metric( 35 | predictions.get("y_validation_true"), 36 | predictions.get("y_validation_predicted"), 37 | ) 38 | self.loss_values[self.current_learner_uid]["train"][metric.name] += [ 39 | train_loss 40 | ] 41 | self.loss_values[self.current_learner_uid]["validation"][metric.name] += [ 42 | validation_loss 43 | ] 44 | # keep information about iter number only once :) 45 | if metric == self.metrics[0]: 46 | self.loss_values[self.current_learner_uid]["iters"] += [ 47 | logs.get("iter_cnt") 48 | ] 49 | ``` 
class KNNObjective:
    """Optuna objective that tunes the k-nearest-neighbours algorithm.

    Args:
        ml_task: Task identifier; ``REGRESSION`` selects the regressor,
            anything else the classifier.
        X_train: Training features.
        y_train: Training target.
        sample_weight: Training sample weights passed to ``fit``.
        X_validation: Validation features used for scoring.
        y_validation: Validation target used for scoring.
        sample_weight_validation: Accepted for signature compatibility;
            it is not used by this objective.
        eval_metric: Callable metric with a ``name`` attribute.
        n_jobs: Number of parallel jobs passed to the algorithm.
        random_state: Seed, stored as ``self.seed``.
    """

    # Largest ``n_neighbors`` value ever proposed to a trial.
    MAX_NEIGHBORS = 128

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        self.ml_task = ml_task
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        self.X_validation = X_validation
        self.y_validation = y_validation
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.seed = random_state

    def __call__(self, trial):
        """Train one k-NN model with parameters suggested by ``trial``.

        Returns:
            The validation score (multiplied by -1 when
            ``Metric.optimize_negative`` says so), or ``None`` when training
            or scoring raised an exception.

        Raises:
            optuna.exceptions.TrialPruned: Re-raised so Optuna can prune.
        """
        try:
            # Never propose more neighbours than there are training rows.
            # NOTE(review): presumably ``n_neighbors`` is forwarded to the
            # scikit-learn estimator, which rejects n_neighbors > fitted rows;
            # such trials used to end in the blanket ``except`` below.
            max_neighbors = max(1, min(self.MAX_NEIGHBORS, len(self.X_train)))
            params = {
                "n_neighbors": trial.suggest_int("n_neighbors", 1, max_neighbors),
                "weights": trial.suggest_categorical(
                    "weights", ["uniform", "distance"]
                ),
                "n_jobs": self.n_jobs,
                "rows_limit": 100000,
                "ml_task": self.ml_task,
            }
            Algorithm = (
                KNeighborsRegressorAlgorithm
                if self.ml_task == REGRESSION
                else KNeighborsAlgorithm
            )
            model = Algorithm(params)
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)
            preds = model.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in KNNObjective", str(e))
            return None

        return score
/tests/tests_automl/test_automl_init.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | 6 | from supervised import AutoML 7 | 8 | 9 | class AutoMLInitTest(unittest.TestCase): 10 | automl_dir = "AutoMLInitTest" 11 | 12 | def tearDown(self): 13 | shutil.rmtree(self.automl_dir, ignore_errors=True) 14 | 15 | def test_custom_init(self): 16 | X = np.random.uniform(size=(30, 2)) 17 | y = np.random.randint(0, 2, size=(30,)) 18 | 19 | automl = AutoML( 20 | results_path=self.automl_dir, 21 | model_time_limit=1, 22 | algorithms=["Xgboost"], 23 | explain_level=0, 24 | train_ensemble=False, 25 | stack_models=False, 26 | validation_strategy={"validation_type": "split"}, 27 | start_random_models=3, 28 | hill_climbing_steps=1, 29 | top_models_to_improve=1, 30 | ) 31 | 32 | automl.fit(X, y) 33 | self.assertGreater(len(automl._models), 3) 34 | 35 | def test_get_results_path(self): 36 | automl = AutoML(algorithms=["Baseline"], total_time_limit=1) 37 | first_path = automl._get_results_path() 38 | self.assertEqual(first_path, automl._get_results_path()) 39 | shutil.rmtree(first_path, ignore_errors=True) 40 | 41 | automl = AutoML( 42 | algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir 43 | ) 44 | self.assertEqual(self.automl_dir, automl._get_results_path()) 45 | shutil.rmtree(self.automl_dir, ignore_errors=True) 46 | 47 | # get results path after save 48 | automl = AutoML( 49 | algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir 50 | ) 51 | X = np.random.uniform(size=(30, 2)) 52 | y = np.random.randint(0, 2, size=(30,)) 53 | automl.fit(X, y) 54 | self.assertEqual(self.automl_dir, automl._get_results_path()) 55 | 56 | automl2 = AutoML( 57 | algorithms=["Baseline"], total_time_limit=1, results_path=self.automl_dir 58 | ) 59 | self.assertEqual(self.automl_dir, automl2._get_results_path()) 60 | ``` 
-------------------------------------------------------------------------------- /tests/tests_automl/test_stack_models_constraints.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | 6 | from supervised import AutoML 7 | 8 | 9 | class AutoMLStackModelsConstraintsTest(unittest.TestCase): 10 | automl_dir = "AutoMLStackModelsConstraintsTest" 11 | 12 | def tearDown(self): 13 | shutil.rmtree(self.automl_dir, ignore_errors=True) 14 | 15 | def test_allow_stack_models(self): 16 | X = np.random.uniform(size=(100, 2)) 17 | y = np.random.randint(0, 2, size=(100,)) 18 | X[:, 0] = y 19 | X[:, 1] = -y 20 | 21 | automl = AutoML( 22 | results_path=self.automl_dir, 23 | total_time_limit=5, 24 | mode="Compete", 25 | validation_strategy={"validation_type": "kfold", "k_folds": 5}, 26 | ) 27 | automl.fit(X, y) 28 | self.assertTrue(automl._stack_models) 29 | self.assertTrue(automl.tuner._stack_models) 30 | self.assertTrue(automl._time_ctrl._is_stacking) 31 | 32 | def test_disable_stack_models(self): 33 | X = np.random.uniform(size=(100, 2)) 34 | y = np.random.randint(0, 2, size=(100,)) 35 | X[:, 0] = y 36 | X[:, 1] = -y 37 | 38 | automl = AutoML( 39 | results_path=self.automl_dir, 40 | total_time_limit=5, 41 | mode="Compete", 42 | validation_strategy={"validation_type": "split"}, 43 | ) 44 | automl.fit(X, y) 45 | self.assertFalse(automl._stack_models) 46 | self.assertFalse(automl.tuner._stack_models) 47 | self.assertFalse(automl._time_ctrl._is_stacking) 48 | 49 | def test_disable_stack_models_adjusted_validation(self): 50 | X = np.random.uniform(size=(100, 2)) 51 | y = np.random.randint(0, 2, size=(100,)) 52 | X[:, 0] = y 53 | X[:, 1] = -y 54 | 55 | automl = AutoML( 56 | results_path=self.automl_dir, total_time_limit=5, mode="Compete" 57 | ) 58 | automl.fit(X, y) 59 | # the stacking should be disabled 60 | # because of small time limit 61 | 
self.assertFalse(automl._stack_models) 62 | self.assertFalse(automl.tuner._stack_models) 63 | self.assertFalse(automl._time_ctrl._is_stacking) 64 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_decision_tree.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.decision_tree import ( 9 | DecisionTreeRegressorAlgorithm, 10 | ) 11 | from supervised.utils.metric import Metric 12 | 13 | 14 | class DecisionTreeTest(unittest.TestCase): 15 | @classmethod 16 | def setUpClass(cls): 17 | cls.X, cls.y = datasets.make_regression( 18 | n_samples=100, 19 | n_features=5, 20 | n_informative=4, 21 | n_targets=1, 22 | shuffle=False, 23 | random_state=0, 24 | ) 25 | 26 | def test_reproduce_fit_regression(self): 27 | metric = Metric({"name": "rmse"}) 28 | params = {"max_depth": 1, "seed": 1, "ml_task": "regression"} 29 | prev_loss = None 30 | for _ in range(3): 31 | model = DecisionTreeRegressorAlgorithm(params) 32 | model.fit(self.X, self.y) 33 | y_predicted = model.predict(self.X) 34 | loss = metric(self.y, y_predicted) 35 | if prev_loss is not None: 36 | assert_almost_equal(prev_loss, loss) 37 | prev_loss = loss 38 | 39 | def test_save_and_load(self): 40 | metric = Metric({"name": "rmse"}) 41 | dt = DecisionTreeRegressorAlgorithm({"ml_task": "regression"}) 42 | dt.fit(self.X, self.y) 43 | y_predicted = dt.predict(self.X) 44 | loss = metric(self.y, y_predicted) 45 | 46 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 47 | 48 | dt.save(filename) 49 | dt2 = DecisionTreeRegressorAlgorithm({"ml_task": "regression"}) 50 | dt2.load(filename) 51 | 52 | y_predicted = dt2.predict(self.X) 53 | loss2 = metric(self.y, y_predicted) 54 | assert_almost_equal(loss, loss2) 55 | 56 | # Finished with 
temp file, delete it 57 | os.remove(filename) 58 | 59 | def test_is_fitted(self): 60 | params = {"max_depth": 1, "seed": 1, "ml_task": "regression"} 61 | model = DecisionTreeRegressorAlgorithm(params) 62 | self.assertFalse(model.is_fitted()) 63 | model.fit(self.X, self.y) 64 | self.assertTrue(model.is_fitted()) 65 | ``` -------------------------------------------------------------------------------- /tests/tests_callbacks/test_total_time_constraint.py: -------------------------------------------------------------------------------- ```python 1 | import time 2 | import unittest 3 | 4 | from supervised.callbacks.total_time_constraint import TotalTimeConstraint 5 | from supervised.exceptions import NotTrainedException 6 | 7 | 8 | class TotalTimeConstraintTest(unittest.TestCase): 9 | def test_stop_on_first_learner(self): 10 | params = { 11 | "total_time_limit": 100, 12 | "total_time_start": time.time(), 13 | "expected_learners_cnt": 1001, 14 | } 15 | callback = TotalTimeConstraint(params) 16 | callback.add_and_set_learner(learner={}) 17 | callback.on_learner_train_start(logs=None) 18 | time.sleep(0.1) 19 | with self.assertRaises(NotTrainedException) as context: 20 | callback.on_learner_train_end(logs=None) 21 | self.assertTrue("Stop training after the first fold" in str(context.exception)) 22 | 23 | def test_stop_on_not_first_learner(self): 24 | params = { 25 | "total_time_limit": 100, 26 | "total_time_start": time.time(), 27 | "expected_learners_cnt": 10, 28 | } 29 | callback = TotalTimeConstraint(params) 30 | callback.add_and_set_learner(learner={}) 31 | callback.on_learner_train_start(logs=None) 32 | callback.on_learner_train_end(logs=None) 33 | with self.assertRaises(NotTrainedException) as context: 34 | # 35 | # hardcoded change just for tests! 
class PreprocessingUtilsTest(unittest.TestCase):
    """Tests for type detection and basic statistics in ``PreprocessingUtils``."""

    def test_get_type_numpy_number(self):
        # A numeric numpy array must not be detected as categorical.
        tmp = np.array([1, 2, 3])
        tmp_type = PreprocessingUtils.get_type(tmp)
        self.assertNotEqual(tmp_type, PreprocessingUtils.CATEGORICAL)

    def test_get_type_numpy_categorical(self):
        # A numpy array of strings is categorical.
        tmp = np.array(["a", "b", "c"])
        tmp_type = PreprocessingUtils.get_type(tmp)
        self.assertEqual(tmp_type, PreprocessingUtils.CATEGORICAL)

    def test_get_type_pandas_bug(self):
        # Column selected with ``.loc`` must be typed like a plain column.
        d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
        df = pd.DataFrame(data=d)
        col2_type = PreprocessingUtils.get_type(df.loc[:, "col2"])
        self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)

    def test_get_type_pandas(self):
        d = {"col1": [1, 2, 3], "col2": ["a", "b", "c"]}
        df = pd.DataFrame(data=d)
        col1_type = PreprocessingUtils.get_type(df["col1"])
        self.assertNotEqual(col1_type, PreprocessingUtils.CATEGORICAL)
        col2_type = PreprocessingUtils.get_type(df["col2"])
        # The string column has to be checked here; the assertion used to
        # repeat the col1 check and left ``col2_type`` unverified.
        self.assertEqual(col2_type, PreprocessingUtils.CATEGORICAL)

    def test_get_stats(self):
        # Missing values are ignored by min / mean / median.
        tmp = np.array([1, np.nan, 2, 3, np.nan, np.nan])
        self.assertEqual(1, PreprocessingUtils.get_min(tmp))
        self.assertEqual(2, PreprocessingUtils.get_mean(tmp))
        self.assertEqual(2, PreprocessingUtils.get_median(tmp))
        d = {"col1": [1, 2, 1, 3, 1, np.nan], "col2": ["a", np.nan, "b", "a", "c", "a"]}
        df = pd.DataFrame(data=d)
        self.assertEqual(1, PreprocessingUtils.get_min(df["col1"]))
        self.assertEqual(8.0 / 5.0, PreprocessingUtils.get_mean(df["col1"]))
        self.assertEqual(1, PreprocessingUtils.get_median(df["col1"]))

        self.assertEqual(1, PreprocessingUtils.get_most_frequent(df["col1"]))
        self.assertEqual("a", PreprocessingUtils.get_most_frequent(df["col2"]))
MLPRegressorAlgorithm if self.ml_task == REGRESSION else MLPAlgorithm 37 | ) 38 | params = { 39 | "dense_1_size": trial.suggest_int("dense_1_size", 4, 100), 40 | "dense_2_size": trial.suggest_int("dense_2_size", 2, 100), 41 | "learning_rate": trial.suggest_categorical( 42 | "learning_rate", [0.005, 0.01, 0.05, 0.1, 0.2] 43 | ), 44 | "learning_rate_type": trial.suggest_categorical( 45 | "learning_rate_type", ["constant", "adaptive"] 46 | ), 47 | "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True), 48 | "seed": self.seed, 49 | "ml_task": self.ml_task, 50 | } 51 | model = Algorithm(params) 52 | model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight) 53 | 54 | preds = model.predict(self.X_validation) 55 | 56 | score = self.eval_metric(self.y_validation, preds) 57 | if Metric.optimize_negative(self.eval_metric.name): 58 | score *= -1.0 59 | 60 | except optuna.exceptions.TrialPruned as e: 61 | raise e 62 | except Exception as e: 63 | print("Exception in NeuralNetworkObjective", str(e)) 64 | return None 65 | 66 | return score 67 | ``` -------------------------------------------------------------------------------- /tests/tests_utils/test_compute_additional_metrics.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from supervised.algorithms.registry import BINARY_CLASSIFICATION, REGRESSION 6 | from supervised.utils.additional_metrics import AdditionalMetrics 7 | 8 | 9 | class ComputeAdditionalMetricsTest(unittest.TestCase): 10 | def test_compute(self): 11 | target = np.array([0, 0, 0, 0, 1, 1, 1, 1]) 12 | pred = np.array([0.1, 0.8, 0.1, 0.1, 0.8, 0.1, 0.8, 0.8]) 13 | info = AdditionalMetrics.compute(target, pred, None, BINARY_CLASSIFICATION) 14 | details = info["metric_details"] 15 | max_metrics = info["max_metrics"] 16 | conf = info["confusion_matrix"] 17 | self.assertEqual(conf.iloc[0, 0], 3) 18 | self.assertEqual(conf.iloc[1, 1], 3) 19 | 
class FairnessInRegressionTest(unittest.TestCase):
    """End-to-end checks that AutoML records fairness data for regression."""

    automl_dir = "automl_fairness_testing"

    def tearDown(self):
        # Remove the results directory written by AutoML during the test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def test_init(self):
        # One sensitive feature with two alternating groups.
        features = np.random.uniform(size=(30, 2))
        target = np.random.randint(0, 100, size=(30,))
        sensitive = pd.DataFrame({"sensitive": ["A", "B"] * 15})

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            explain_level=0,
            train_ensemble=False,
            stack_models=False,
            validation_strategy={"validation_type": "split"},
            start_random_models=1,
        )
        automl.fit(features, target, sensitive_features=sensitive)

        self.assertGreater(len(automl._models), 0)
        first_model = automl._models[0]

        names = first_model.get_sensitive_features_names()
        self.assertEqual(len(names), 1)
        self.assertIn("sensitive", names)

        self.assertIsNotNone(first_model.get_fairness_metric("sensitive"))
        self.assertGreater(len(first_model.get_fairness_optimization()), 1)
        self.assertIsNotNone(first_model.get_worst_fairness())
        self.assertIsNotNone(first_model.get_best_fairness())

    def test_two_sensitive_features(self):
        # Two sensitive columns must both be reported by the trained model.
        features = np.random.uniform(size=(30, 2))
        target = np.random.randint(0, 100, size=(30,))
        sensitive = pd.DataFrame(
            {
                "sensitive_1": ["White", "Black"] * 15,
                "sensitive_2": ["Male", "Female"] * 15,
            }
        )

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            explain_level=0,
            train_ensemble=False,
            stack_models=False,
            start_random_models=1,
        )
        automl.fit(features, target, sensitive_features=sensitive)

        self.assertGreater(len(automl._models), 0)

        names = automl._models[0].get_sensitive_features_names()
        self.assertEqual(len(names), 2)
class TimeControllerTest(unittest.TestCase):
    """Tests for TimeController serialization and time budgeting."""

    def test_to_and_from_json(self):
        # Logged time must survive a to_json / from_json round trip.
        controller = TimeController(
            start_time=time.time(),
            total_time_limit=10,
            model_time_limit=None,
            steps=["simple_algorithms"],
            algorithms=["Baseline"],
        )
        controller.log_time("1_Baseline", "Baseline", "simple_algorithms", 123.1)

        restored = TimeController.from_json(controller.to_json())

        assert_almost_equal(restored.step_spend("simple_algorithms"), 123.1)
        assert_almost_equal(restored.model_spend("Baseline"), 123.1)

    def test_enough_time_for_stacking(self):
        all_steps = [
            "default_algorithms",
            "not_so_random",
            "golden_features",
            "insert_random_feature",
            "features_selection",
            "hill_climbing_1",
            "hill_climbing_3",
            "hill_climbing_5",
            "ensemble",
            "stack",
            "ensemble_stacked",
        ]
        # Steps for which training time is logged, in logging order.
        logged_steps = [
            "default_algorithms",
            "not_so_random",
            "insert_random_feature",
            "features_selection",
            "hill_climbing_1",
            "hill_climbing_2",
            "hill_climbing_3",
        ]
        for t in [5, 10, 20]:
            tc = TimeController(
                start_time=time.time(),
                total_time_limit=100,
                model_time_limit=None,
                steps=all_steps,
                algorithms=["Xgboost"],
            )
            for position, step in enumerate(logged_steps, start=1):
                tc.log_time(f"{position}_Xgboost", "Xgboost", step, t)

            spent = len(logged_steps) * t
            # Pretend the run started `spent` seconds ago.
            tc._start_time = time.time() - spent
            assert_almost_equal(tc.already_spend(), spent)

            expect = self.assertTrue if t < 20 else self.assertFalse
            expect(tc.enough_time("Xgboost", "stack"))
            self.assertTrue(tc.enough_time("Ensemble_Stacked", "ensemble_stacked"))
# tasks that can be handled by the package
BINARY_CLASSIFICATION = "binary_classification"
MULTICLASS_CLASSIFICATION = "multiclass_classification"
REGRESSION = "regression"


class AlgorithmsRegistry:
    """Process-wide catalogue of the algorithms available for each ML task.

    ``registry`` maps a task name to a dict keyed by the algorithm's
    ``algorithm_short_name``. Each value holds the entries stored by
    :meth:`add`: "class", "params", "required_preprocessing", "additional"
    and "default_params".
    """

    registry = {
        BINARY_CLASSIFICATION: {},
        MULTICLASS_CLASSIFICATION: {},
        REGRESSION: {},
    }

    @staticmethod
    def add(
        task_name,
        model_class,
        model_params,
        required_preprocessing,
        additional,
        default_params,
    ):
        """Register ``model_class`` for ``task_name``.

        An existing entry with the same ``algorithm_short_name`` is replaced.
        Raises ``KeyError`` when ``task_name`` is not a supported task.
        """
        model_information = {
            "class": model_class,
            "params": model_params,
            "required_preprocessing": required_preprocessing,
            "additional": additional,
            "default_params": default_params,
        }
        AlgorithmsRegistry.registry[task_name][
            model_class.algorithm_short_name
        ] = model_information

    @staticmethod
    def get_supported_ml_tasks():
        """Return the supported task names (a dict keys view)."""
        return AlgorithmsRegistry.registry.keys()

    @staticmethod
    def get_algorithm_class(ml_task, algorithm_name):
        """Return the class registered as ``algorithm_name`` for ``ml_task``."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name]["class"]

    @staticmethod
    def get_long_name(ml_task, algorithm_name):
        """Return the registered class's ``algorithm_name`` attribute."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name][
            "class"
        ].algorithm_name

    @staticmethod
    def get_max_rows_limit(ml_task, algorithm_name):
        """Return the "max_rows_limit" entry of the algorithm's extra settings."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
            "max_rows_limit"
        ]

    @staticmethod
    def get_max_cols_limit(ml_task, algorithm_name):
        """Return the "max_cols_limit" entry of the algorithm's extra settings."""
        return AlgorithmsRegistry.registry[ml_task][algorithm_name]["additional"][
            "max_cols_limit"
        ]

    @staticmethod
    def get_eval_metric(algorithm_name, ml_task, automl_eval_metric):
        """Translate the AutoML metric name into the algorithm's own name.

        Only "Xgboost" is translated; every other algorithm gets
        ``automl_eval_metric`` back unchanged.
        """
        if algorithm_name == "Xgboost":
            # The helper was referenced without ever being imported, which
            # raised NameError on this path. It is imported lazily because
            # algorithm modules import this registry to register themselves,
            # so a module-level import would presumably be circular.
            # NOTE(review): assumes `xgboost_eval_metric` is defined in
            # supervised.algorithms.xgboost - confirm.
            from supervised.algorithms.xgboost import xgboost_eval_metric

            return xgboost_eval_metric(ml_task, automl_eval_metric)

        return automl_eval_metric
class DataInfo:
    """Inspect a dataset and list the preprocessing its columns and target need."""

    @staticmethod
    def compute(X, y, machinelearning_task):
        """Describe ``X`` column by column and ``y`` for the given task.

        Returns a dict with three keys: "columns_info" (column name -> list
        of tags), "target_info" (list of tags for the target) and
        "num_class" (set only for multiclass classification, else ``None``).
        """
        n_rows = X.shape[0]
        columns_info = {}
        for col in X.columns:
            series = X[col]
            missing = pd.isnull(series)
            tags = columns_info[col] = []

            # A column with no values at all gets a single tag.
            if np.sum(missing) == n_rows:
                tags.append("empty_column")
                continue

            # Likewise a column whose non-missing values are all identical.
            if len(np.unique(X.loc[~missing, col])) == 1:
                tags.append("constant_column")
                continue

            if PreprocessingUtils.is_na(series):
                tags.append("missing_values")

            if PreprocessingUtils.is_categorical(series):
                tags.append("categorical")
                tags.append(EncodingSelector.get(X, y, col))
            elif PreprocessingUtils.is_datetime(series):
                tags.append("datetime_transform")
            elif PreprocessingUtils.is_text(series):
                # Text replaces every tag collected so far for this column.
                columns_info[col] = ["text_transform"]
            elif PreprocessingUtils.is_scale_needed(series):
                # Remaining columns are treated as numeric.
                tags.append("scale")

        target_info = []
        if (
            machinelearning_task == BINARY_CLASSIFICATION
            and not PreprocessingUtils.is_0_1(y)
        ):
            target_info.append("convert_0_1")

        if machinelearning_task == REGRESSION:
            # Log scaling takes precedence over plain scaling.
            if PreprocessingUtils.is_log_scale_needed(y):
                target_info.append("scale_log")
            elif PreprocessingUtils.is_scale_needed(y):
                target_info.append("scale")

        num_class = (
            PreprocessingUtils.num_class(y)
            if machinelearning_task == MULTICLASS_CLASSIFICATION
            else None
        )

        return {
            "columns_info": columns_info,
            "target_info": target_info,
            "num_class": num_class,
        }
24 | os.mkdir(dir_path) 25 | except Exception as e: 26 | pass 27 | 28 | def test_create_report_after_dir_change(self): 29 | # 30 | # test for https://github.com/mljar/mljar-supervised/issues/384 31 | # 32 | self.create_dir(self.automl_dir_a) 33 | self.create_dir(self.automl_dir_b) 34 | 35 | path_a = os.path.join(self.automl_dir_a, self.automl_dir) 36 | path_b = os.path.join(self.automl_dir_b, self.automl_dir) 37 | 38 | X = np.random.uniform(size=(30, 2)) 39 | y = np.random.randint(0, 2, size=(30,)) 40 | 41 | automl = AutoML(results_path=path_a, algorithms=["Baseline"], explain_level=0) 42 | automl.fit(X, y) 43 | 44 | shutil.move(path_a, path_b) 45 | 46 | automl2 = AutoML( 47 | results_path=path_b, 48 | ) 49 | automl2.report() 50 | 51 | def test_compute_predictions_after_dir_change(self): 52 | # 53 | # test for https://github.com/mljar/mljar-supervised/issues/384 54 | # 55 | self.create_dir(self.automl_dir_a) 56 | self.create_dir(self.automl_dir_b) 57 | 58 | path_a = os.path.join(self.automl_dir_a, self.automl_dir) 59 | path_b = os.path.join(self.automl_dir_b, self.automl_dir) 60 | 61 | X, y = datasets.make_regression( 62 | n_samples=100, 63 | n_features=5, 64 | n_informative=4, 65 | n_targets=1, 66 | shuffle=False, 67 | random_state=0, 68 | ) 69 | 70 | automl = AutoML( 71 | results_path=path_a, 72 | explain_level=0, 73 | ml_task="regression", 74 | total_time_limit=10, 75 | ) 76 | automl.fit(X, y) 77 | p = automl.predict(X[:3]) 78 | 79 | shutil.move(path_a, path_b) 80 | 81 | automl2 = AutoML( 82 | results_path=path_b, 83 | ) 84 | p2 = automl2.predict(X[:3]) 85 | 86 | for i in range(3): 87 | assert_almost_equal(p[i], p2[i]) 88 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_scale.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from numpy.testing import assert_almost_equal 6 | 7 
class ScaleTest(unittest.TestCase):
    """Tests for fitting, applying, inverting and (de)serializing Scale."""

    def test_fit_log_and_normal(self):
        # col1 and col3 mix small and very large values.
        frame = pd.DataFrame(
            data={
                "col1": [12, 13, 3, 4, 5, 6, 7, 8000, 9000, 10000.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
                "col3": [12, 2, 3, 4, 5, 6, 7, 8000, 9000, 10000.0],
            }
        )

        scaler = Scale(["col1", "col3"], scale_method=Scale.SCALE_LOG_AND_NORMAL)
        scaler.fit(frame)
        frame = scaler.transform(frame)
        first_scaled = float(frame["col1"][0])

        assert_almost_equal(np.mean(frame["col1"]), 0)
        # in case of wrong scaling the small values will be squeezed
        self.assertLess(frame["col1"][0] + 0.01, frame["col1"][1])

        frame = scaler.inverse_transform(frame)

        # A scaler rebuilt from JSON must reproduce the same scaled value.
        reloaded = Scale()
        reloaded.from_json(scaler.to_json())
        frame = reloaded.transform(frame)
        assert_almost_equal(frame["col1"][0], first_scaled)

    def test_fit(self):
        frame = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10.0],
                "col2": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30.0],
            }
        )

        scaler = Scale(["col1"])
        scaler.fit(frame)
        frame = scaler.transform(frame)

        # Only the selected column is scaled.
        assert_almost_equal(np.mean(frame["col1"]), 0)
        assert_almost_equal(np.mean(frame["col2"]), 25.5)

        frame = scaler.inverse_transform(frame)
        assert_almost_equal(frame["col1"][0], 1)
        assert_almost_equal(frame["col1"][1], 2)

    def test_to_and_from_json(self):
        frame = pd.DataFrame(
            data={
                "col1": [1, 2, 3, 4, 5, 6, 7, 8.0, 9, 10],
                "col2": [21, 22.0, 23, 24, 25, 26, 27, 28, 29, 30],
            }
        )

        scaler = Scale(["col1"])
        scaler.fit(frame)
        # Fitting alone must leave the data untouched.
        assert_almost_equal(np.mean(frame["col1"]), 5.5)
        assert_almost_equal(np.mean(frame["col2"]), 25.5)

        # Serialize the fitted scaler and load it into a fresh instance.
        serialized = scaler.to_json()
        reloaded = Scale()
        reloaded.from_json(serialized)

        # Transform with the loaded scaler.
        frame = reloaded.transform(frame)
        assert_almost_equal(np.mean(frame["col1"]), 0)
        assert_almost_equal(np.mean(frame["col2"]), 25.5)
class TestJoblibVersion(unittest.TestCase):
    """Checks that the joblib version is stored with a model and validated on load."""

    automl_dir = "TestJoblibVersion"

    def tearDown(self):
        # Remove the results directory written by AutoML during the test.
        shutil.rmtree(self.automl_dir, ignore_errors=True)

    def _fit_single_xgboost(self):
        # Train one default Xgboost model on random data; return the features.
        features = np.random.uniform(size=(60, 2))
        target = np.random.randint(0, 2, size=(60,))

        automl = AutoML(
            results_path=self.automl_dir,
            model_time_limit=10,
            algorithms=["Xgboost"],
            mode="Explain",
            explain_level=0,
            start_random_models=1,
            hill_climbing_steps=0,
            top_models_to_improve=0,
            kmeans_features=False,
            golden_features=False,
            features_selection=False,
            boost_on_errors=False,
        )
        automl.fit(features, target)
        return features

    def _framework_path(self):
        # Location of the saved description of the default Xgboost model.
        return os.path.join(self.automl_dir, "1_Default_Xgboost", "framework.json")

    def test_joblib_good_version(self):
        self._fit_single_xgboost()

        # Test if joblib is in json
        with open(self._framework_path()) as file:
            framework = json.load(file)

        self.assertEqual(joblib.__version__, framework["joblib_version"])

    def test_joblib_wrong_version(self):
        features = self._fit_single_xgboost()
        framework_path = self._framework_path()

        with open(framework_path) as file:
            framework = json.load(file)

        # Injection of wrong joblib version
        framework["joblib_version"] = "0.2.0"

        with open(framework_path, "w") as file:
            json.dump(framework, file)

        with self.assertRaises(AutoMLException):
            automl_2 = AutoML(results_path=self.automl_dir)
            automl_2.predict(features)
class BaselineRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
    """Baseline regressor backed by ``DummyRegressor(strategy="mean")``."""

    algorithm_name = "Baseline Regressor"
    algorithm_short_name = "Baseline"

    def __init__(self, params):
        super().__init__(params)
        logger.debug("BaselineRegressorAlgorithm.__init__")

        self.library_version = sklearn.__version__
        # `additional` is the module-level settings dict of this file.
        self.max_iters = additional.get("max_steps", 1)
        self.model = DummyRegressor(strategy="mean")

    def file_extension(self):
        """Return the extension used for files of this model."""
        return "baseline"

    def is_fitted(self):
        """Report whether the model has a positive ``n_outputs_`` attribute."""
        n_outputs = getattr(self.model, "n_outputs_", None)
        return n_outputs is not None and n_outputs > 0
BaselineRegressorAlgorithm, {}, {}, additional, {}) 91 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/extra_trees.py: -------------------------------------------------------------------------------- ```python 1 | import optuna 2 | 3 | from supervised.algorithms.extra_trees import ( 4 | ExtraTreesAlgorithm, 5 | ExtraTreesRegressorAlgorithm, 6 | ) 7 | from supervised.algorithms.registry import ( 8 | REGRESSION, 9 | ) 10 | from supervised.utils.metric import Metric 11 | 12 | EPS = 1e-8 13 | 14 | 15 | class ExtraTreesObjective: 16 | def __init__( 17 | self, 18 | ml_task, 19 | X_train, 20 | y_train, 21 | sample_weight, 22 | X_validation, 23 | y_validation, 24 | sample_weight_validation, 25 | eval_metric, 26 | n_jobs, 27 | random_state, 28 | ): 29 | self.ml_task = ml_task 30 | self.X_train = X_train 31 | self.y_train = y_train 32 | self.sample_weight = sample_weight 33 | self.X_validation = X_validation 34 | self.y_validation = y_validation 35 | self.eval_metric = eval_metric 36 | self.n_jobs = n_jobs 37 | self.objective = "squared_error" if ml_task == REGRESSION else "gini" 38 | self.max_steps = 10 # ET is trained in steps 100 trees each 39 | self.seed = random_state 40 | 41 | def __call__(self, trial): 42 | try: 43 | Algorithm = ( 44 | ExtraTreesRegressorAlgorithm 45 | if self.ml_task == REGRESSION 46 | else ExtraTreesAlgorithm 47 | ) 48 | self.objective = ( 49 | "squared_error" 50 | if self.ml_task == REGRESSION 51 | else trial.suggest_categorical("criterion", ["gini", "entropy"]) 52 | ) 53 | params = { 54 | "max_steps": self.max_steps, 55 | "criterion": self.objective, 56 | "max_depth": trial.suggest_int("max_depth", 2, 32), 57 | "min_samples_split": trial.suggest_int("min_samples_split", 2, 100), 58 | "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100), 59 | "max_features": trial.suggest_float("max_features", 0.01, 1), 60 | "n_jobs": self.n_jobs, 61 | "seed": self.seed, 62 | "ml_task": 
class RandomForestObjective:
    """Optuna objective that trains a random forest and scores it on validation data.

    An instance is called once per Optuna trial. Each call samples a set of
    hyperparameters, fits ``RandomForestRegressorAlgorithm`` (regression) or
    ``RandomForestAlgorithm`` (any other task) and returns the validation
    score. The score is multiplied by -1 when ``Metric.optimize_negative``
    reports that for the metric name.
    """

    def __init__(
        self,
        ml_task,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
        eval_metric,
        n_jobs,
        random_state,
    ):
        """Store the data and settings shared by every trial.

        ``eval_metric`` must be callable as
        ``eval_metric(y_true, y_pred[, sample_weight])`` and expose ``name``.
        """
        self.ml_task = ml_task
        self.X_train = X_train
        self.y_train = y_train
        self.sample_weight = sample_weight
        self.X_validation = X_validation
        self.y_validation = y_validation
        # Previously accepted but dropped, so validation scores were always
        # computed unweighted even when weights were supplied.
        self.sample_weight_validation = sample_weight_validation
        self.eval_metric = eval_metric
        self.n_jobs = n_jobs
        self.objective = "squared_error" if ml_task == REGRESSION else "gini"
        self.max_steps = 10  # RF is trained in steps 100 trees each
        self.seed = random_state

    def __call__(self, trial):
        """Run a single trial.

        Returns the score to optimize, or ``None`` when training or scoring
        raised an unexpected exception. ``TrialPruned`` is propagated
        unchanged.
        """
        try:
            Algorithm = (
                RandomForestRegressorAlgorithm
                if self.ml_task == REGRESSION
                else RandomForestAlgorithm
            )
            # The split criterion is tuned only for classification.
            self.objective = (
                "squared_error"
                if self.ml_task == REGRESSION
                else trial.suggest_categorical("criterion", ["gini", "entropy"])
            )
            params = {
                "max_steps": self.max_steps,
                "criterion": self.objective,
                "max_depth": trial.suggest_int("max_depth", 2, 32),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
                "max_features": trial.suggest_float("max_features", 0.01, 1),
                "n_jobs": self.n_jobs,
                "seed": self.seed,
                "ml_task": self.ml_task,
            }
            model = Algorithm(params)
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

            preds = model.predict(self.X_validation)

            if self.sample_weight_validation is None:
                # Same call as before when no validation weights are given.
                score = self.eval_metric(self.y_validation, preds)
            else:
                score = self.eval_metric(
                    self.y_validation, preds, self.sample_weight_validation
                )
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned:
            # Bare raise keeps the original traceback intact.
            raise
        except Exception as e:
            # Best effort: a failed trial must not abort the whole study.
            print("Exception in RandomForestObjective", str(e))
            return None

        return score
class TextTransformer(object):
    """Replace a single text column with TF-IDF feature columns.

    ``fit`` learns a vocabulary (at most ``_max_features`` terms) from the
    non-missing values of one column. ``transform`` adds one numeric column
    per learned term, named ``<column>_<term>``, and drops the text column.
    ``to_json`` / ``from_json`` persist and restore the fitted state.
    """

    def __init__(self):
        self._new_columns = []
        self._old_column = None
        self._max_features = 100
        self._vectorizer = None

    def _make_vectorizer(self):
        """Build an unfitted vectorizer with the settings shared by fit and from_json."""
        return TfidfVectorizer(
            analyzer="word",
            stop_words="english",
            lowercase=True,
            max_features=self._max_features,
        )

    def fit(self, X, column):
        """Learn the TF-IDF vocabulary from the non-missing values of ``X[column]``.

        Args:
            X: data frame holding the text column.
            column: label of the text column to vectorize.
        """
        self._old_column = column
        self._vectorizer = self._make_vectorizer()

        texts = X[column][~pd.isnull(X[column])]
        self._vectorizer.fit(texts)
        # Rebuild the list instead of appending to it, so that calling fit()
        # again does not leave duplicated or stale feature columns behind.
        self._new_columns = [
            f"{column}_{term}" for term in self._vectorizer.get_feature_names_out()
        ]

    def transform(self, X):
        """Add the TF-IDF columns to ``X`` in place and drop the text column.

        Rows where the text is missing get 0.0 in every new column.

        Args:
            X: data frame containing the fitted text column; it is modified
                in place.

        Returns:
            The same data frame object ``X``.
        """
        with warnings.catch_warnings():
            # Adding many columns one at a time makes pandas emit
            # PerformanceWarning; it is silenced for this block only.
            warnings.simplefilter(
                action="ignore", category=pd.errors.PerformanceWarning
            )
            present = ~pd.isnull(X[self._old_column])

            dense = None
            if present.any():
                # Vectorize only when at least one row has text: there is
                # nothing to compute otherwise, and scikit-learn validators
                # typically reject zero-sample input.
                texts = X[self._old_column][present]
                dense = self._vectorizer.transform(texts).toarray()

            for col in self._new_columns:
                X[col] = 0.0

            if dense is not None:
                X.loc[present, self._new_columns] = dense
            X.drop(self._old_column, axis=1, inplace=True)
        return X

    def to_json(self):
        """Return the fitted state as a dict of plain Python values.

        The vocabulary indices and idf weights are converted to built-in
        ``int`` / ``float`` so the result can be serialized; the fitted
        vectorizer itself is left untouched.
        """
        vocabulary = {
            term: int(index)
            for term, index in self._vectorizer.vocabulary_.items()
        }
        return {
            "new_columns": list(self._new_columns),
            "old_column": self._old_column,
            "vocabulary": vocabulary,
            "fixed_vocabulary": self._vectorizer.fixed_vocabulary_,
            "idf": [float(weight) for weight in self._vectorizer.idf_],
        }

    def from_json(self, data_json):
        """Restore the state produced by ``to_json``.

        The vectorizer is rebuilt only when the vocabulary, the
        fixed-vocabulary flag and the idf weights are all present.

        Args:
            data_json: dict with the keys written by ``to_json``.
        """
        # Fall back to an empty list so a later transform() can still iterate
        # over the column names when the key is missing or null.
        self._new_columns = data_json.get("new_columns") or []
        self._old_column = data_json.get("old_column", None)
        vocabulary = data_json.get("vocabulary")
        fixed_vocabulary = data_json.get("fixed_vocabulary")
        idf = data_json.get("idf")
        if vocabulary is None or fixed_vocabulary is None or idf is None:
            return

        self._vectorizer = self._make_vectorizer()
        self._vectorizer.vocabulary_ = vocabulary
        self._vectorizer.fixed_vocabulary_ = fixed_vocabulary
        self._vectorizer.idf_ = np.array(idf)
class BaselineTest(unittest.TestCase):
    """Tests for the baseline regressor and classifier algorithms."""

    @classmethod
    def setUpClass(cls):
        """Create the regression dataset shared by the regression tests."""
        cls.X, cls.y = datasets.make_regression(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_targets=1,
            shuffle=False,
            random_state=0,
        )

    def test_reproduce_fit_regression(self):
        """Fitting the regressor three times must give the same RMSE each time."""
        metric = Metric({"name": "rmse"})
        prev_loss = None
        for _ in range(3):
            model = BaselineRegressorAlgorithm({"ml_task": "regression"})
            model.fit(self.X, self.y)
            y_predicted = model.predict(self.X)
            loss = metric(self.y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_reproduce_fit_bin_class(self):
        """Fitting the classifier three times must give the same logloss each time."""
        X, y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        metric = Metric({"name": "logloss"})
        prev_loss = None
        for _ in range(3):
            model = BaselineClassifierAlgorithm({"ml_task": "binary_classification"})
            model.fit(X, y)
            y_predicted = model.predict(X)
            loss = metric(y, y_predicted)
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_save_and_load(self):
        """A model restored from disk must score the same as the one saved."""
        metric = Metric({"name": "rmse"})
        model = BaselineRegressorAlgorithm({"ml_task": "regression"})
        model.fit(self.X, self.y)
        y_predicted = model.predict(self.X)
        loss = metric(self.y, y_predicted)

        filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())

        restored = BaselineRegressorAlgorithm({"ml_task": "regression"})
        try:
            model.save(filename)
            restored.load(filename)
        finally:
            # Delete the temporary file even when save/load raises, so a
            # failing run does not leave files behind in the temp directory.
            if os.path.exists(filename):
                os.remove(filename)

        y_predicted = restored.predict(self.X)
        loss2 = metric(self.y, y_predicted)
        assert_almost_equal(loss, loss2)

    def test_is_fitted(self):
        """is_fitted() is False before fit() and True after it."""
        model = BaselineRegressorAlgorithm({"ml_task": "regression"})
        self.assertFalse(model.is_fitted())
        model.fit(self.X, self.y)
        self.assertTrue(model.is_fitted())