This is page 3 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ 
├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── 
PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── 
@staticmethod
def plot_single_iter(learner_names, metric_name, model_path, colors):
    """Plot train/test scores as bars for learners that logged one iteration.

    For each learner, ``<model_path>/<learner>_training.log`` is read as a
    header-less CSV with columns ``iteration``, ``train`` and ``test``. The
    first row's train score is drawn as a white bar outlined in the fold
    color, and its test score as a bar filled with the fold color. The figure
    is saved to ``<model_path>/<LearningCurves.output_file_name>`` and all
    matplotlib figures are closed afterwards.

    Args:
        learner_names: Iterable of learner names; each must have a training
            log file in ``model_path``.
        metric_name: Text used as the y-axis label.
        model_path: Directory holding the training logs; the plot is written
            to the same directory.
        colors: Sequence of matplotlib colors, indexed by fold number.
    """
    plt.figure(figsize=(10, 7))
    for ln in learner_names:
        df = pd.read_csv(
            os.path.join(model_path, f"{ln}_training.log"),
            names=["iteration", "train", "test"],
        )

        # `repeat` is None when the learner name carries no repeat index.
        fold, repeat = learner_name_to_fold_repeat(ln)
        # Label typo fixed: the bar label used to read "Reapeat".
        repeat_str = f" Repeat {repeat+1}," if repeat is not None else ""
        fold_color = colors[fold]

        plt.bar(
            f"Fold {fold+1},{repeat_str} train",
            df.train[0],
            color="white",
            edgecolor=fold_color,
        )
        plt.bar(f"Fold {fold+1},{repeat_str} test", df.test[0], color=fold_color)

    plt.ylabel(metric_name)
    # Bar labels are long ("Fold 1, Repeat 1, train"), so draw them vertically.
    plt.xticks(rotation=90)
    plt.tight_layout(pad=2.0)
    plot_path = os.path.join(model_path, LearningCurves.output_file_name)
    plt.savefig(plot_path)
    plt.close("all")
class FairnessReport:
    """Writes the fairness sections of a model report as Markdown text."""

    @staticmethod
    def _write_figures(v, fout, model_path):
        """Write a heading for each figure in ``v["figures"]`` and save it.

        Each figure entry is a mapping with ``title``, ``figure`` (an object
        exposing ``savefig``) and ``fname``; the file is saved under
        ``model_path``.
        """
        for figure in v["figures"]:
            fout.write(f"\n\n### {figure['title']}\n\n")
            target = os.path.join(model_path, figure["fname"])
            figure["figure"].savefig(target)
            fout.write("\n\n\n")

    @staticmethod
    def save_classification(fairness_metrics, fout, model_path, is_multi=False):
        """Write fairness sections for a classification model.

        Args:
            fairness_metrics: Mapping of key -> fairness details. The
                ``"fairness_optimization"`` entry is not reported itself; its
                optional ``"fairness_threshold"`` is quoted in each section.
                When ``is_multi`` is true every other key must have the form
                ``"<feature>__<class_name>"``.
            fout: Writable text stream receiving the Markdown.
            model_path: Directory where figure files are saved.
            is_multi: Whether keys combine a feature and a class name.
        """
        for k, v in fairness_metrics.items():
            if k == "fairness_optimization":
                continue

            # Pick the heading texts once; the key is split before anything
            # is written so a malformed key fails up front.
            if is_multi:
                feature, class_name = k.split("__", maxsplit=1)
                metrics_heading = f"\n\n## Fairness metrics for {feature} feature and {class_name} class\n\n"
                question_heading = f"\n\n## Is model fair for {feature} feature and {class_name} class?\n"
            else:
                metrics_heading = f"\n\n## Fairness metrics for {k} feature\n\n"
                question_heading = f"\n\n## Is model fair for {k} feature?\n"

            fout.write(metrics_heading)
            for table_key in ("metrics", "stats"):
                fout.write(v[table_key].to_markdown())
                fout.write("\n\n")
            fout.write(question_heading)

            fair_str = "fair" if v["is_fair"] else "unfair"

            fairness_threshold = fairness_metrics.get("fairness_optimization", {}).get(
                "fairness_threshold"
            )
            # Ratio metrics are expected above the threshold, others below.
            if fairness_threshold is None:
                fairness_threshold_str = ""
            elif "ratio" in v["fairness_metric_name"].lower():
                fairness_threshold_str = (
                    f"It should be higher than {fairness_threshold}."
                )
            else:
                fairness_threshold_str = (
                    f"It should be lower than {fairness_threshold}."
                )

            if is_multi:
                verdict = f"Model is {fair_str} for {feature} feature and {class_name} class.\n"
            else:
                verdict = f"Model is {fair_str} for {k} feature.\n"
            fout.write(verdict)
            fout.write(
                f'The {v["fairness_metric_name"]} is {v["fairness_metric_value"]}. {fairness_threshold_str}\n'
            )

            # Group details are only reported for unfair models.
            if not v["is_fair"]:
                if v.get("underprivileged_value") is not None:
                    fout.write(
                        f'Underprivileged value is {v["underprivileged_value"]}.\n'
                    )
                if v.get("privileged_value") is not None:
                    fout.write(f'Privileged value is {v["privileged_value"]}.\n')

            FairnessReport._write_figures(v, fout, model_path)

    @staticmethod
    def regression(fairness_metrics, fout, model_path):
        """Write fairness sections for a regression model.

        Args:
            fairness_metrics: Mapping of feature name -> fairness details;
                the ``"fairness_optimization"`` entry is skipped.
            fout: Writable text stream receiving the Markdown.
            model_path: Directory where figure files are saved.
        """
        for k, v in fairness_metrics.items():
            if k == "fairness_optimization":
                continue

            fout.write(f"\n\n## Fairness metrics for {k} feature\n\n")
            fout.write(v["metrics"].to_markdown())
            fout.write("\n\n")

            fout.write(f'Privileged value: {v["privileged_value"]}\n\n')
            fout.write(f'Underprivileged value: {v["underprivileged_value"]}\n\n\n')
            fout.write(f'Fairness metric: {v["fairness_metric_name"]}\n\n')
            fout.write(f'{v["metric_name"]} Difference: {v["diff"]}\n\n')
            fout.write(f'{v["metric_name"]} Ratio: {v["ratio"]}\n\n')

            # A fair ratio metric sits above the threshold and a fair
            # non-ratio metric below it; for unfair models it is reversed.
            if v["is_fair"]:
                fout.write(f"Model is fair for {k} feature.\n")
                reported_above = "ratio" in v["fairness_metric_name"].lower()
            else:
                fout.write(f"Model is unfair for {k} feature.\n")
                reported_above = "ratio" not in v["fairness_metric_name"].lower()

            if reported_above:
                fout.write(
                    f"The {v['fairness_metric_name']} value is above threshold {v['fairness_threshold']}.\n\n"
                )
            else:
                fout.write(
                    f"The {v['fairness_metric_name']} value is below threshold {v['fairness_threshold']}.\n\n"
                )

            FairnessReport._write_figures(v, fout, model_path)
class CatBoostAlgorithmTest(unittest.TestCase):
    """Binary-classification tests for ``CatBoostAlgorithm`` on synthetic data."""

    @classmethod
    def setUpClass(cls):
        """Create a fixed 100-row, 5-feature dataset and shared model params."""
        cls.X, cls.y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        cls.X = pd.DataFrame(cls.X, columns=[f"f_{i}" for i in range(cls.X.shape[1])])
        cls.params = {
            "learning_rate": 0.1,
            "depth": 4,
            "rsm": 0.5,
            "l2_leaf_reg": 1,
            "seed": 1,
            "ml_task": "binary_classification",
            "loss_function": "Logloss",
            "eval_metric": "Logloss",
        }

    def _fit_and_score(self, metric):
        """Fit a fresh model on the class data; return ``(model, training loss)``."""
        model = CatBoostAlgorithm(self.params)
        model.fit(self.X, self.y)
        return model, metric(self.y, model.predict(self.X))

    def test_reproduce_fit(self):
        """Two fits with identical params must give (almost) the same loss."""
        metric = Metric({"name": "logloss"})
        _, prev_loss = self._fit_and_score(metric)
        _, loss = self._fit_and_score(metric)
        assert_almost_equal(prev_loss, loss, decimal=3)

    def test_fit_predict(self):
        """Fitting and predicting twice yields matching losses."""
        metric = Metric({"name": "logloss"})
        _, loss_prev = self._fit_and_score(metric)
        _, loss = self._fit_and_score(metric)
        assert_almost_equal(loss, loss_prev, decimal=3)

    def test_copy(self):
        """A copied model predicts exactly like the model it was copied from."""
        # train model #1
        metric = Metric({"name": "logloss"})
        cat, loss = self._fit_and_score(metric)
        # create model #2; its underlying model is initialized in the constructor
        cat2 = CatBoostAlgorithm(self.params)
        self.assertIsNotNone(cat2.model)
        # do a copy and use it for predictions
        cat2 = cat.copy()
        self.assertEqual(type(cat), type(cat2))
        loss2 = metric(self.y, cat2.predict(self.X))
        self.assertEqual(loss, loss2)

    def test_save_and_load(self):
        """A model saved to disk and loaded into a new instance predicts the same."""
        metric = Metric({"name": "logloss"})
        cat, loss = self._fit_and_score(metric)

        filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
        try:
            cat.save(filename)
            cat2 = CatBoostAlgorithm(self.params)
            self.assertNotEqual(cat.uid, cat2.uid)
            self.assertIsNotNone(cat2.model)
            cat2.load(filename)
        finally:
            # Remove the temp file even when saving, loading or an assertion
            # fails, so failed runs do not leave files in the temp directory.
            if os.path.exists(filename):
                os.remove(filename)

        loss2 = metric(self.y, cat2.predict(self.X))
        assert_almost_equal(loss, loss2, decimal=3)

    def test_get_metric_name(self):
        """Both Logloss and MultiClass eval metrics are reported as "logloss"."""
        model = CatBoostAlgorithm(self.params)
        self.assertEqual(model.get_metric_name(), "logloss")
        # Copy the shared params so the class-level dict is not modified.
        params = dict(self.params)
        params["loss_function"] = "MultiClass"
        params["eval_metric"] = "MultiClass"
        model = CatBoostAlgorithm(params)
        self.assertEqual(model.get_metric_name(), "logloss")

    def test_is_fitted(self):
        """``is_fitted`` turns true only after ``fit`` has been called."""
        cat = CatBoostAlgorithm(self.params)
        self.assertFalse(cat.is_fitted())
        cat.fit(self.X, self.y)
        self.assertTrue(cat.is_fitted())
class ExtraTreesRegressorAlgorithm(
    RegressorMixin, SklearnTreesEnsembleRegressorAlgorithm
):
    """Extra Trees regressor built on scikit-learn's ``ExtraTreesRegressor``."""

    algorithm_name = "Extra Trees Regressor"
    algorithm_short_name = "Extra Trees"

    def __init__(self, params):
        """Create the warm-start regressor from ``params``.

        Step settings (trees per step, maximum steps, early-stopping rounds)
        come from the module-level ``regression_additional`` table; a
        ``max_steps`` entry in ``self.params`` overrides the table value.
        """
        super().__init__(params)
        logger.debug("ExtraTreesRegressorAlgorithm.__init__")

        self.library_version = sklearn.__version__

        step_settings = regression_additional
        self.trees_in_step = step_settings.get("trees_in_step", 100)
        self.max_steps = step_settings.get("max_steps", 50)
        self.early_stopping_rounds = step_settings.get("early_stopping_rounds", 50)

        # warm_start=True lets later fits add trees to the existing ensemble.
        model_kwargs = {
            "n_estimators": self.trees_in_step,
            "criterion": params.get("criterion", "squared_error"),
            "max_features": params.get("max_features", 0.6),
            "max_depth": params.get("max_depth", 6),
            "min_samples_split": params.get("min_samples_split", 30),
            "min_samples_leaf": params.get("min_samples_leaf", 1),
            "warm_start": True,
            "n_jobs": params.get("n_jobs", -1),
            "random_state": params.get("seed", 1),
        }
        self.model = ExtraTreesRegressor(**model_kwargs)

        # A per-model "max_steps" takes precedence over the table value.
        self.max_steps = self.params.get("max_steps", self.max_steps)

    def file_extension(self):
        """Return the file extension used for this algorithm's saved models."""
        return "extra_trees"
# Candidate hyperparameter values for the Extra Trees classifier; registered
# below for both binary and multiclass classification.
et_params = {
    "criterion": ["gini", "entropy"],
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": [10, 20, 30, 40, 50],
    "max_depth": [3, 4, 5, 6, 7],
}

# One default value per hyperparameter for the classification tasks.
classification_default_params = {
    "criterion": "gini",
    "max_features": 0.9,
    "min_samples_split": 30,
    "max_depth": 4,
}

# Step settings read by ExtraTreesAlgorithm.__init__ (trees_in_step,
# max_steps, early_stopping_rounds).
# NOTE(review): max_rows_limit / max_cols_limit are not read in this module;
# presumably consumed elsewhere - confirm.
additional = {
    "trees_in_step": 100,
    "max_steps": 50,
    "early_stopping_rounds": 50,
    "max_rows_limit": None,
    "max_cols_limit": None,
}
# Names of the preprocessing steps declared as required for classification.
required_preprocessing = [
    "missing_values_inputation",
    "convert_categorical",
    "datetime_transform",
    "text_transform",
    "target_as_integer",
]

# Register the classifier for binary classification ...
AlgorithmsRegistry.add(
    BINARY_CLASSIFICATION,
    ExtraTreesAlgorithm,
    et_params,
    required_preprocessing,
    additional,
    classification_default_params,
)

# ... and, with the same settings, for multiclass classification.
AlgorithmsRegistry.add(
    MULTICLASS_CLASSIFICATION,
    ExtraTreesAlgorithm,
    et_params,
    required_preprocessing,
    additional,
    classification_default_params,
)


#
# REGRESSION
#

# Candidate hyperparameter values for the Extra Trees regressor.
regression_et_params = {
    "criterion": [
        "squared_error"
    ],  # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626
    "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_samples_split": [10, 20, 30, 40, 50],
    "max_depth": [3, 4, 5, 6, 7],
}

# One default value per hyperparameter for regression.
regression_default_params = {
    "criterion": "squared_error",
    "max_features": 0.9,
    "min_samples_split": 30,
    "max_depth": 4,
}

# Step settings read by ExtraTreesRegressorAlgorithm.__init__.
regression_additional = {
    "trees_in_step": 100,
    "max_steps": 50,
    "early_stopping_rounds": 50,
    "max_rows_limit": None,
    "max_cols_limit": None,
}
# Same steps as for classification, except that the target entry is
# "target_scale" instead of "target_as_integer".
regression_required_preprocessing = [
    "missing_values_inputation",
    "convert_categorical",
    "datetime_transform",
    "text_transform",
    "target_scale",
]

# Register the regressor for the regression task.
AlgorithmsRegistry.add(
    REGRESSION,
    ExtraTreesRegressorAlgorithm,
    regression_et_params,
    regression_required_preprocessing,
    regression_additional,
    regression_default_params,
)
class RandomForestRegressorAlgorithm(
    RegressorMixin, SklearnTreesEnsembleRegressorAlgorithm
):
    """Random Forest regressor built on scikit-learn's ``RandomForestRegressor``."""

    algorithm_name = "Random Forest"
    algorithm_short_name = "Random Forest"

    def __init__(self, params):
        """Create the warm-start regressor from ``params``.

        Step settings are read from the module-level ``regression_additional``
        table; a ``max_steps`` entry in ``self.params`` overrides the table
        value.
        """
        super().__init__(params)
        logger.debug("RandomForestRegressorAlgorithm.__init__")

        self.library_version = sklearn.__version__

        step_settings = regression_additional
        # NOTE(review): the fallbacks 5 and 3 differ from the values in the
        # registered table (100 and 50); they only apply if the keys are
        # missing from that table - confirm whether this is intended.
        self.trees_in_step = step_settings.get("trees_in_step", 5)
        self.max_steps = step_settings.get("max_steps", 3)
        self.early_stopping_rounds = step_settings.get("early_stopping_rounds", 50)

        # warm_start=True lets later fits add trees to the existing ensemble.
        model_kwargs = {
            "n_estimators": self.trees_in_step,
            "criterion": params.get("criterion", "squared_error"),
            "max_features": params.get("max_features", 0.8),
            "max_depth": params.get("max_depth", 6),
            "min_samples_split": params.get("min_samples_split", 4),
            "min_samples_leaf": params.get("min_samples_leaf", 1),
            "warm_start": True,
            "n_jobs": params.get("n_jobs", -1),
            "random_state": params.get("seed", 1),
        }
        self.model = RandomForestRegressor(**model_kwargs)

        # A per-model "max_steps" takes precedence over the table value.
        self.max_steps = self.params.get("max_steps", self.max_steps)

    def file_extension(self):
        """Return the file extension used for this algorithm's saved models."""
        return "random_forest"
86 | rf_params = { 87 | "criterion": ["gini", "entropy"], 88 | "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 89 | "min_samples_split": [10, 20, 30, 40, 50], 90 | "max_depth": [3, 4, 5, 6, 7], 91 | } 92 | 93 | classification_default_params = { 94 | "criterion": "gini", 95 | "max_features": 0.9, 96 | "min_samples_split": 30, 97 | "max_depth": 4, 98 | } 99 | 100 | 101 | additional = { 102 | "trees_in_step": 100, 103 | "train_cant_improve_limit": 1, 104 | "min_steps": 1, 105 | "max_steps": 50, 106 | "early_stopping_rounds": 50, 107 | "max_rows_limit": None, 108 | "max_cols_limit": None, 109 | } 110 | required_preprocessing = [ 111 | "missing_values_inputation", 112 | "convert_categorical", 113 | "datetime_transform", 114 | "text_transform", 115 | "target_as_integer", 116 | ] 117 | 118 | AlgorithmsRegistry.add( 119 | BINARY_CLASSIFICATION, 120 | RandomForestAlgorithm, 121 | rf_params, 122 | required_preprocessing, 123 | additional, 124 | classification_default_params, 125 | ) 126 | 127 | AlgorithmsRegistry.add( 128 | MULTICLASS_CLASSIFICATION, 129 | RandomForestAlgorithm, 130 | rf_params, 131 | required_preprocessing, 132 | additional, 133 | classification_default_params, 134 | ) 135 | 136 | 137 | # 138 | # REGRESSION 139 | # 140 | 141 | regression_rf_params = { 142 | "criterion": [ 143 | "squared_error" 144 | ], # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626 145 | "max_features": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 146 | "min_samples_split": [10, 20, 30, 40, 50], 147 | "max_depth": [3, 4, 5, 6, 7], 148 | } 149 | 150 | regression_default_params = { 151 | "criterion": "squared_error", 152 | "max_features": 0.9, 153 | "min_samples_split": 30, 154 | "max_depth": 4, 155 | } 156 | 157 | regression_additional = { 158 | "trees_in_step": 100, 159 | "train_cant_improve_limit": 1, 160 | "min_steps": 1, 161 | "max_steps": 50, 162 | "early_stopping_rounds": 50, 163 | "max_rows_limit": None, 164 | "max_cols_limit": None, 165 | } 166 
class XgboostAlgorithmTest(unittest.TestCase):
    """Binary-classification tests for ``XgbAlgorithm`` on synthetic data."""

    @classmethod
    def setUpClass(cls):
        """Create a fixed 100-row, 5-feature classification dataset."""
        cls.X, cls.y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )

    @staticmethod
    def _binary_params(**overrides):
        """Return a fresh binary-logloss params dict, updated with ``overrides``."""
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "ml_task": BINARY_CLASSIFICATION,
        }
        params.update(overrides)
        return params

    def _save_and_reload(self, fitted, params):
        """Save ``fitted`` to a temp file and load it into a new ``XgbAlgorithm``.

        The temp file is removed even when saving, loading or the assertion
        fails, so failed runs do not leave files in the temp directory.
        """
        filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex())
        try:
            fitted.save(filename)
            restored = XgbAlgorithm(params)
            # The underlying model stays None until it is fitted or loaded.
            self.assertIsNone(restored.model)
            restored.load(filename)
        finally:
            if os.path.exists(filename):
                os.remove(filename)
        return restored

    def test_reproduce_fit(self):
        """Repeated fits with the same seed give (almost) identical losses."""
        metric = Metric({"name": "logloss"})
        params = self._binary_params(seed=1)
        prev_loss = None
        for _ in range(3):
            xgb = XgbAlgorithm(params)
            xgb.fit(self.X, self.y)
            loss = metric(self.y, xgb.predict(self.X))
            if prev_loss is not None:
                assert_almost_equal(prev_loss, loss)
            prev_loss = loss

    def test_copy(self):
        """A copy is a distinct object that predicts exactly like the original."""
        metric = Metric({"name": "logloss"})
        params = self._binary_params()
        xgb = XgbAlgorithm(params)
        xgb.fit(self.X, self.y)
        loss = metric(self.y, xgb.predict(self.X))

        xgb2 = XgbAlgorithm(params)
        self.assertIsNone(xgb2.model)  # model is set to None, while initialized
        xgb2 = xgb.copy()
        self.assertEqual(type(xgb), type(xgb2))
        loss2 = metric(self.y, xgb2.predict(self.X))
        self.assertEqual(loss, loss2)
        self.assertNotEqual(id(xgb), id(xgb2))

    def test_save_and_load(self):
        """A saved and reloaded model reproduces the original loss."""
        metric = Metric({"name": "logloss"})
        params = self._binary_params()
        xgb = XgbAlgorithm(params)
        xgb.fit(self.X, self.y)
        loss = metric(self.y, xgb.predict(self.X))

        xgb2 = self._save_and_reload(xgb, params)

        loss2 = metric(self.y, xgb2.predict(self.X))
        assert_almost_equal(loss, loss2)

    def test_save_and_load_with_early_stopping(self):
        """``best_iteration`` survives a save/load round trip."""
        metric = Metric({"name": "logloss"})
        params = self._binary_params()
        xgb = XgbAlgorithm(params)
        xgb.fit(self.X, self.y, X_validation=self.X, y_validation=self.y)
        loss = metric(self.y, xgb.predict(self.X))
        prev_best_iteration = xgb.model.best_iteration

        xgb2 = self._save_and_reload(xgb, params)

        loss2 = metric(self.y, xgb2.predict(self.X))
        assert_almost_equal(loss, loss2)
        self.assertEqual(prev_best_iteration, xgb2.model.best_iteration)

    def test_restricted_characters_in_feature_name(self):
        """Fit and predict must not fail on column names containing ``[``, ``]`` or ``<``."""
        # Seeded generator keeps the test data identical between runs.
        rng = np.random.RandomState(0)
        df = pd.DataFrame(
            {
                "y": rng.randint(0, 2, size=100),
                "[test1]": rng.uniform(0, 1, size=100),
                "test2 < 1": rng.uniform(0, 1, size=100),
            }
        )

        y = df.iloc[:, 0]
        X = df.iloc[:, 1:]

        xgb = XgbAlgorithm(self._binary_params())
        xgb.fit(X, y)
        xgb.predict(X)

    def test_get_metric_name(self):
        """The metric name follows the ``eval_metric`` parameter."""
        model = XgbAlgorithm(self._binary_params())
        self.assertEqual(model.get_metric_name(), "logloss")

        params = {"eval_metric": "rmse"}
        model = XgbAlgorithm(params)
        self.assertEqual(model.get_metric_name(), "rmse")

    def test_is_fitted(self):
        """``is_fitted`` turns true only after ``fit`` has been called."""
        model = XgbAlgorithm(self._binary_params())
        self.assertFalse(model.is_fitted())
        model.fit(self.X, self.y)
        self.assertTrue(model.is_fitted())
from sklearn import datasets 8 | 9 | from supervised.algorithms.registry import ( 10 | BINARY_CLASSIFICATION, 11 | MULTICLASS_CLASSIFICATION, 12 | REGRESSION, 13 | ) 14 | from supervised.preprocessing.goldenfeatures_transformer import ( 15 | GoldenFeaturesTransformer, 16 | ) 17 | 18 | 19 | class GoldenFeaturesTransformerTest(unittest.TestCase): 20 | automl_dir = "automl_testing" 21 | 22 | def tearDown(self): 23 | shutil.rmtree(self.automl_dir, ignore_errors=True) 24 | 25 | def test_transformer(self): 26 | X, y = datasets.make_classification( 27 | n_samples=100, 28 | n_features=10, 29 | n_informative=6, 30 | n_redundant=1, 31 | n_classes=2, 32 | n_clusters_per_class=3, 33 | n_repeated=0, 34 | shuffle=False, 35 | random_state=0, 36 | ) 37 | 38 | df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 39 | 40 | with tempfile.TemporaryDirectory() as tmpdir: 41 | gft = GoldenFeaturesTransformer(tmpdir, "binary_classification") 42 | gft.fit(df, y) 43 | 44 | df = gft.transform(df) 45 | 46 | gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification") 47 | gft3.from_json(gft.to_json(), tmpdir) 48 | 49 | def test_subsample_regression_10k(self): 50 | rows = 10000 51 | X = np.random.rand(rows, 3) 52 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 53 | y = pd.Series(np.random.rand(rows), name="target") 54 | 55 | gft3 = GoldenFeaturesTransformer(self.automl_dir, REGRESSION) 56 | X_train, X_test, y_train, y_test = gft3._subsample(X, y) 57 | 58 | self.assertTrue(X_train.shape[0], 2500) 59 | self.assertTrue(X_test.shape[0], 2500) 60 | self.assertTrue(y_train.shape[0], 2500) 61 | self.assertTrue(y_test.shape[0], 2500) 62 | 63 | def test_subsample_regression_4k(self): 64 | rows = 4000 65 | X = np.random.rand(rows, 3) 66 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 67 | y = pd.Series(np.random.rand(rows), name="target") 68 | 69 | gft3 = GoldenFeaturesTransformer(self.automl_dir, REGRESSION) 70 | X_train, X_test, y_train, y_test = 
gft3._subsample(X, y) 71 | 72 | self.assertTrue(X_train.shape[0], 2000) 73 | self.assertTrue(X_test.shape[0], 2000) 74 | self.assertTrue(y_train.shape[0], 2000) 75 | self.assertTrue(y_test.shape[0], 2000) 76 | 77 | def test_subsample_multiclass_10k(self): 78 | rows = 10000 79 | X = np.random.rand(rows, 3) 80 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 81 | y = pd.Series(np.random.randint(0, 4, rows), name="target") 82 | 83 | gft3 = GoldenFeaturesTransformer(self.automl_dir, MULTICLASS_CLASSIFICATION) 84 | X_train, X_test, y_train, y_test = gft3._subsample(X, y) 85 | 86 | self.assertTrue(X_train.shape[0], 2500) 87 | self.assertTrue(X_test.shape[0], 2500) 88 | self.assertTrue(y_train.shape[0], 2500) 89 | self.assertTrue(y_test.shape[0], 2500) 90 | 91 | for uni in [np.unique(y_train), np.unique(y_test)]: 92 | for i in range(4): 93 | self.assertTrue(i in uni) 94 | 95 | def test_subsample_multiclass_4k(self): 96 | rows = 4000 97 | X = np.random.rand(rows, 3) 98 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 99 | y = pd.Series(np.random.randint(0, 4, rows), name="target") 100 | 101 | gft3 = GoldenFeaturesTransformer(self.automl_dir, MULTICLASS_CLASSIFICATION) 102 | X_train, X_test, y_train, y_test = gft3._subsample(X, y) 103 | 104 | self.assertTrue(X_train.shape[0], 2000) 105 | self.assertTrue(X_test.shape[0], 2000) 106 | self.assertTrue(y_train.shape[0], 2000) 107 | self.assertTrue(y_test.shape[0], 2000) 108 | 109 | for uni in [np.unique(y_train), np.unique(y_test)]: 110 | for i in range(4): 111 | self.assertTrue(i in uni) 112 | 113 | def test_subsample_binclass_4k(self): 114 | rows = 4000 115 | X = np.random.rand(rows, 3) 116 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) 117 | y = pd.Series(np.random.randint(0, 2, rows), name="target") 118 | 119 | gft3 = GoldenFeaturesTransformer(self.automl_dir, BINARY_CLASSIFICATION) 120 | X_train, X_test, y_train, y_test = gft3._subsample(X, y) 121 | 122 | self.assertTrue(X_train.shape[0], 
2000) 123 | self.assertTrue(X_test.shape[0], 2000) 124 | self.assertTrue(y_train.shape[0], 2000) 125 | self.assertTrue(y_test.shape[0], 2000) 126 | 127 | for uni in [np.unique(y_train), np.unique(y_test)]: 128 | for i in range(2): 129 | self.assertTrue(i in uni) 130 | 131 | def test_features_count(self): 132 | N_COLS = 10 133 | X, y = datasets.make_classification( 134 | n_samples=100, 135 | n_features=N_COLS, 136 | n_informative=6, 137 | n_redundant=1, 138 | n_classes=2, 139 | n_clusters_per_class=3, 140 | n_repeated=0, 141 | shuffle=False, 142 | random_state=0, 143 | ) 144 | 145 | df = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 146 | 147 | with tempfile.TemporaryDirectory() as tmpdir: 148 | FEATURES_COUNT = 42 149 | gft = GoldenFeaturesTransformer( 150 | tmpdir, "binary_classification", features_count=FEATURES_COUNT 151 | ) 152 | gft.fit(df, y) 153 | 154 | self.assertEqual(len(gft._new_features), FEATURES_COUNT) 155 | 156 | gft3 = GoldenFeaturesTransformer(tmpdir, "binary_classification") 157 | gft3.from_json(gft.to_json(), tmpdir) 158 | 159 | df = gft3.transform(df) 160 | self.assertEqual(df.shape[1], N_COLS + FEATURES_COUNT) 161 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/catboost.py: -------------------------------------------------------------------------------- ```python 1 | import optuna 2 | from catboost import CatBoostClassifier, CatBoostRegressor, Pool 3 | 4 | from supervised.algorithms.catboost import catboost_eval_metric, catboost_objective 5 | from supervised.algorithms.registry import ( 6 | BINARY_CLASSIFICATION, 7 | MULTICLASS_CLASSIFICATION, 8 | REGRESSION, 9 | ) 10 | from supervised.utils.metric import ( 11 | CatBoostEvalMetricAveragePrecision, 12 | CatBoostEvalMetricMSE, 13 | CatBoostEvalMetricPearson, 14 | CatBoostEvalMetricSpearman, 15 | CatBoostEvalMetricUserDefined, 16 | Metric, 17 | ) 18 | 19 | EPS = 1e-8 20 | 21 | 22 | class CatBoostObjective: 23 | def 
__init__( 24 | self, 25 | ml_task, 26 | X_train, 27 | y_train, 28 | sample_weight, 29 | X_validation, 30 | y_validation, 31 | sample_weight_validation, 32 | eval_metric, 33 | cat_features_indices, 34 | n_jobs, 35 | random_state, 36 | ): 37 | self.ml_task = ml_task 38 | self.X_train = X_train 39 | self.y_train = y_train 40 | self.sample_weight = sample_weight 41 | self.X_validation = X_validation 42 | self.y_validation = y_validation 43 | self.eval_metric = eval_metric 44 | self.cat_features = cat_features_indices 45 | self.eval_set = Pool( 46 | data=X_validation, 47 | label=y_validation, 48 | cat_features=self.cat_features, 49 | weight=sample_weight_validation, 50 | ) 51 | self.n_jobs = n_jobs 52 | self.rounds = 1000 53 | self.learning_rate = 0.0125 54 | self.early_stopping_rounds = 50 55 | self.seed = random_state 56 | 57 | self.objective = catboost_objective(ml_task, self.eval_metric.name) 58 | self.eval_metric_name = catboost_eval_metric(ml_task, self.eval_metric.name) 59 | self.custom_eval_metric = None 60 | if self.eval_metric_name == "spearman": 61 | self.custom_eval_metric = CatBoostEvalMetricSpearman() 62 | elif self.eval_metric_name == "pearson": 63 | self.custom_eval_metric = CatBoostEvalMetricPearson() 64 | elif self.eval_metric_name == "average_precision": 65 | self.custom_eval_metric = CatBoostEvalMetricAveragePrecision() 66 | elif self.eval_metric_name == "mse": 67 | self.custom_eval_metric = CatBoostEvalMetricMSE() 68 | elif self.eval_metric_name == "user_defined_metric": 69 | self.custom_eval_metric = CatBoostEvalMetricUserDefined() 70 | 71 | def __call__(self, trial): 72 | try: 73 | params = { 74 | "iterations": self.rounds, 75 | "learning_rate": trial.suggest_categorical( 76 | "learning_rate", [0.05, 0.1, 0.2] 77 | ), 78 | "depth": trial.suggest_int("depth", 2, 9), 79 | "l2_leaf_reg": trial.suggest_float( 80 | "l2_leaf_reg", 0.0001, 10.0, log=False 81 | ), 82 | "random_strength": trial.suggest_float( 83 | "random_strength", EPS, 10.0, log=False 84 
| ), 85 | "rsm": trial.suggest_float("rsm", 0.1, 1), # colsample_bylevel=rsm 86 | "loss_function": self.objective, 87 | "eval_metric": self.eval_metric_name, 88 | "verbose": False, 89 | "allow_writing_files": False, 90 | "thread_count": self.n_jobs, 91 | "random_seed": self.seed, 92 | # "border_count": trial.suggest_int("border_count", 16, 2048), 93 | "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100), 94 | # "bootstrap_type": "Bernoulli" 95 | # trial.suggest_categorical( 96 | # "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"] 97 | # ), 98 | } 99 | # if params["bootstrap_type"] == "Bayesian": 100 | # params["bagging_temperature"] = trial.suggest_float( 101 | # "bagging_temperature", 0, 10 102 | # ) 103 | # elif params["bootstrap_type"] in ["Bernoulli", "MVS"]: 104 | # params["subsample"] = trial.suggest_float("subsample", 0.1, 1) 105 | 106 | Algorithm = ( 107 | CatBoostRegressor if self.ml_task == REGRESSION else CatBoostClassifier 108 | ) 109 | if self.custom_eval_metric is not None: 110 | params["eval_metric"] = self.custom_eval_metric 111 | model = Algorithm(**params) 112 | 113 | model.fit( 114 | self.X_train, 115 | self.y_train, 116 | sample_weight=self.sample_weight, 117 | early_stopping_rounds=self.early_stopping_rounds, 118 | eval_set=self.eval_set, 119 | verbose_eval=False, 120 | cat_features=self.cat_features, 121 | ) 122 | 123 | if self.ml_task == BINARY_CLASSIFICATION: 124 | preds = model.predict_proba( 125 | self.X_validation, ntree_end=model.best_iteration_ + 1 126 | )[:, 1] 127 | elif self.ml_task == MULTICLASS_CLASSIFICATION: 128 | preds = model.predict_proba( 129 | self.X_validation, ntree_end=model.best_iteration_ + 1 130 | ) 131 | else: # REGRESSION 132 | preds = model.predict( 133 | self.X_validation, ntree_end=model.best_iteration_ + 1 134 | ) 135 | 136 | score = self.eval_metric(self.y_validation, preds) 137 | if Metric.optimize_negative(self.eval_metric.name): 138 | score *= -1.0 139 | 140 | except 
optuna.exceptions.TrialPruned as e: 141 | raise e 142 | except Exception as e: 143 | print("Exception in CatBoostObjective", str(e)) 144 | # import traceback 145 | # print(traceback.format_exc()) 146 | return None 147 | 148 | return score 149 | ``` -------------------------------------------------------------------------------- /supervised/validation/validator_kfold.py: -------------------------------------------------------------------------------- ```python 1 | import gc 2 | import logging 3 | import os 4 | import warnings 5 | 6 | import numpy as np 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | from sklearn.model_selection import KFold, StratifiedKFold 11 | 12 | from supervised.exceptions import AutoMLException 13 | from supervised.utils.utils import load_data 14 | from supervised.validation.validator_base import BaseValidator 15 | 16 | 17 | class KFoldValidator(BaseValidator): 18 | def __init__(self, params): 19 | BaseValidator.__init__(self, params) 20 | 21 | self.k_folds = self.params.get("k_folds", 5) 22 | self.shuffle = self.params.get("shuffle", True) 23 | self.stratify = self.params.get("stratify", False) 24 | self.random_seed = self.params.get("random_seed", 1906) 25 | self.repeats = self.params.get("repeats", 1) 26 | 27 | if not self.shuffle and self.repeats > 1: 28 | warnings.warn( 29 | "Disable repeats in validation because shuffle is disabled", UserWarning 30 | ) 31 | self.repeats = 1 32 | 33 | self.skf = [] 34 | 35 | for r in range(self.repeats): 36 | random_seed = self.random_seed + r if self.shuffle else None 37 | if self.stratify: 38 | if self.shuffle: 39 | self.skf += [ 40 | StratifiedKFold( 41 | n_splits=self.k_folds, 42 | shuffle=self.shuffle, 43 | random_state=random_seed, 44 | ) 45 | ] 46 | else: 47 | self.skf += [ 48 | StratifiedKFold( 49 | n_splits=self.k_folds, 50 | shuffle=self.shuffle, 51 | random_state=random_seed, 52 | ) 53 | ] 54 | else: 55 | self.skf += [ 56 | KFold( 57 | n_splits=self.k_folds, 58 | shuffle=self.shuffle, 59 | 
random_state=random_seed, 60 | ) 61 | ] 62 | 63 | self._results_path = self.params.get("results_path") 64 | self._X_path = self.params.get("X_path") 65 | self._y_path = self.params.get("y_path") 66 | self._sample_weight_path = self.params.get("sample_weight_path") 67 | self._sensitive_features_path = self.params.get("sensitive_features_path") 68 | 69 | if self._X_path is None or self._y_path is None: 70 | raise AutoMLException("No data path set in KFoldValidator params") 71 | 72 | folds_path = os.path.join(self._results_path, "folds") 73 | 74 | if not os.path.exists(folds_path): 75 | os.mkdir(folds_path) 76 | X = load_data(self._X_path) 77 | y = load_data(self._y_path) 78 | y = y["target"] 79 | 80 | if isinstance(y[0], bytes): 81 | # see https://github.com/scikit-learn/scikit-learn/issues/16980 82 | y = y.astype(str) 83 | 84 | for repeat_cnt, skf in enumerate(self.skf): 85 | for fold_cnt, (train_index, validation_index) in enumerate( 86 | skf.split(X, y) 87 | ): 88 | repeat_str = f"_repeat_{repeat_cnt}" if len(self.skf) > 1 else "" 89 | train_index_file = os.path.join( 90 | self._results_path, 91 | "folds", 92 | f"fold_{fold_cnt}{repeat_str}_train_indices.npy", 93 | ) 94 | validation_index_file = os.path.join( 95 | self._results_path, 96 | "folds", 97 | f"fold_{fold_cnt}{repeat_str}_validation_indices.npy", 98 | ) 99 | 100 | np.save(train_index_file, train_index) 101 | np.save(validation_index_file, validation_index) 102 | del X 103 | del y 104 | gc.collect() 105 | 106 | else: 107 | log.debug("Folds split already done, reuse it") 108 | 109 | def get_split(self, k, repeat=0): 110 | repeat_str = f"_repeat_{repeat}" if self.repeats > 1 else "" 111 | 112 | train_index_file = os.path.join( 113 | self._results_path, "folds", f"fold_{k}{repeat_str}_train_indices.npy" 114 | ) 115 | validation_index_file = os.path.join( 116 | self._results_path, "folds", f"fold_{k}{repeat_str}_validation_indices.npy" 117 | ) 118 | 119 | train_index = np.load(train_index_file) 120 | 
validation_index = np.load(validation_index_file) 121 | 122 | X = load_data(self._X_path) 123 | y = load_data(self._y_path) 124 | y = y["target"] 125 | 126 | sample_weight = None 127 | if self._sample_weight_path is not None: 128 | sample_weight = load_data(self._sample_weight_path) 129 | sample_weight = sample_weight["sample_weight"] 130 | 131 | sensitive_features = None 132 | if self._sensitive_features_path is not None: 133 | sensitive_features = load_data(self._sensitive_features_path) 134 | 135 | train_data = {"X": X.loc[train_index], "y": y.loc[train_index]} 136 | validation_data = {"X": X.loc[validation_index], "y": y.loc[validation_index]} 137 | if sample_weight is not None: 138 | train_data["sample_weight"] = sample_weight.loc[train_index] 139 | validation_data["sample_weight"] = sample_weight.loc[validation_index] 140 | 141 | if sensitive_features is not None: 142 | train_data["sensitive_features"] = sensitive_features.loc[train_index] 143 | validation_data["sensitive_features"] = sensitive_features.loc[ 144 | validation_index 145 | ] 146 | 147 | return (train_data, validation_data) 148 | 149 | def get_n_splits(self): 150 | return self.k_folds 151 | 152 | def get_repeats(self): 153 | return self.repeats 154 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/disable_eda.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn import datasets 8 | 9 | from supervised import AutoML 10 | from supervised.preprocessing.eda import EDA 11 | 12 | 13 | class EDATest(unittest.TestCase): 14 | automl_dir = "automl_tests" 15 | 16 | def tearDown(self): 17 | shutil.rmtree(self.automl_dir, ignore_errors=True) 18 | 19 | def test_explain_default(self): 20 | a = AutoML( 21 | results_path=self.automl_dir, 22 | total_time_limit=5, 23 | algorithms=["Baseline"], 
24 | train_ensemble=False, 25 | explain_level=2, 26 | ) 27 | 28 | X, y = datasets.make_classification(n_samples=100, n_features=5) 29 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 30 | y = pd.Series(y, name="class") 31 | 32 | a.fit(X, y) 33 | 34 | result_files = os.listdir(os.path.join(a._results_path, "EDA")) 35 | 36 | for col in X.columns: 37 | self.assertTrue(f"{col}.png" in result_files) 38 | self.assertTrue("target.png" in result_files) 39 | self.assertTrue("README.md" in result_files) 40 | 41 | def test_column_name_to_filename(self): 42 | """Valid feature name should be untouched""" 43 | col = "feature_1" 44 | self.assertEqual(EDA.prepare(col), col) 45 | 46 | self.tearDown() 47 | 48 | def test_extensive_eda(self): 49 | """ 50 | Test for extensive_eda feature 51 | """ 52 | 53 | X, y = datasets.make_regression(n_samples=100, n_features=5) 54 | 55 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 56 | y = pd.Series(y, name="class") 57 | 58 | results_path = self.automl_dir 59 | EDA.extensive_eda(X, y, results_path) 60 | result_files = os.listdir(results_path) 61 | 62 | for col in X.columns: 63 | self.assertTrue(f"{col}_target.png" in result_files) 64 | self.assertTrue("heatmap.png" in result_files) 65 | self.assertTrue("Extensive_EDA.md" in result_files) 66 | 67 | X, y = datasets.make_classification(n_samples=100, n_features=5) 68 | 69 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 70 | y = pd.Series(y, name="class") 71 | 72 | results_path = self.automl_dir 73 | EDA.extensive_eda(X, y, results_path) 74 | result_files = os.listdir(results_path) 75 | 76 | for col in X.columns: 77 | self.assertTrue(f"{col}_target.png" in result_files) 78 | self.assertTrue("heatmap.png" in result_files) 79 | self.assertTrue("Extensive_EDA.md" in result_files) 80 | 81 | self.tearDown() 82 | 83 | def test_extensive_eda_missing(self): 84 | """ 85 | Test for dataframe with missing values 86 | """ 87 | 88 | X, y = 
datasets.make_regression(n_samples=100, n_features=5) 89 | 90 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 91 | y = pd.Series(y, name="class") 92 | 93 | ##add some nan values 94 | X.loc[np.random.randint(0, 100, 20), "f_0"] = np.nan 95 | 96 | results_path = self.automl_dir 97 | EDA.extensive_eda(X, y, results_path) 98 | result_files = os.listdir(results_path) 99 | 100 | for col in X.columns: 101 | self.assertTrue(f"{col}_target.png" in result_files) 102 | self.assertTrue("heatmap.png" in result_files) 103 | self.assertTrue("Extensive_EDA.md" in result_files) 104 | 105 | X, y = datasets.make_regression(n_samples=100, n_features=5) 106 | 107 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 108 | y = pd.Series(y, name="class") 109 | 110 | ##add some nan values 111 | X.loc[np.random.randint(0, 100, 20), "f_0"] = np.nan 112 | 113 | results_path = self.automl_dir 114 | EDA.extensive_eda(X, y, results_path) 115 | result_files = os.listdir(results_path) 116 | 117 | for col in X.columns: 118 | self.assertTrue(f"{col}_target.png" in result_files) 119 | self.assertTrue("heatmap.png" in result_files) 120 | self.assertTrue("Extensive_EDA.md" in result_files) 121 | 122 | self.tearDown() 123 | 124 | def test_symbol_feature(self): 125 | """ 126 | Test for columns with forbidden filenames 127 | """ 128 | 129 | X, y = datasets.make_regression(n_samples=100, n_features=5) 130 | 131 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 132 | X.rename({"f_0": "ff*", "f_1": "fg/"}, axis=1, inplace=True) 133 | y = pd.Series(y, name="class") 134 | 135 | results_path = self.automl_dir 136 | EDA.extensive_eda(X, y, results_path) 137 | result_files = os.listdir(results_path) 138 | 139 | for col in X.columns: 140 | self.assertTrue(EDA.plot_fname(f"{col}_target") in result_files) 141 | self.assertTrue("heatmap.png" in result_files) 142 | self.assertTrue("Extensive_EDA.md" in result_files) 143 | 144 | self.tearDown() 145 | 146 | def 
test_naughty_column_name_to_filename(self): 147 | """Test with naughty strings. 148 | String from https://github.com/minimaxir/big-list-of-naughty-strings""" 149 | os.mkdir(self.automl_dir) 150 | naughty_columns = [ 151 | "feature_1", 152 | "*", 153 | "😍", 154 | "¯\_(ツ)_/¯", 155 | "表", 156 | "𠜎𠜱𠝹𠱓", 157 | "عاملة بولندا", 158 | "Ṱ̺̺̕o͞ ̷" "🇸🇦🇫🇦🇲", 159 | "⁰⁴⁵", 160 | "∆˚¬…æ", 161 | "!@#$%^&*()`~", 162 | "onfocus=JaVaSCript:alert(123) autofocus", 163 | "`\"'><img src=xxx:x \x20onerror=javascript:alert(1)>", 164 | 'System("ls -al /")', 165 | 'Kernel.exec("ls -al /")', 166 | "لُلُصّبُلُل" "{% print 'x' * 64 * 1024**3 %}", 167 | '{{ "".__class__.__mro__[2].__subclasses__()[40]("/etc/passwd").read() }}', 168 | "ÜBER Über German Umlaut", 169 | "影師嗎", 170 | "C'est déjà l'été." "Nín hǎo. Wǒ shì zhōng guó rén", 171 | "Компьютер", 172 | "jaja---lol-méméméoo--a", 173 | ] 174 | for col in naughty_columns: 175 | fname = EDA.plot_path(self.automl_dir, col) 176 | with open(fname, "w") as fout: 177 | fout.write("ok") 178 | 179 | self.tearDown() 180 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/linear.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | from sklearn.linear_model import LinearRegression, LogisticRegression 9 | 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.algorithms.sklearn import SklearnAlgorithm 17 | from supervised.utils.config import LOG_LEVEL 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(LOG_LEVEL) 21 | 22 | 23 | class LinearAlgorithm(ClassifierMixin, SklearnAlgorithm): 24 | algorithm_name = "Logistic Regression" 25 | algorithm_short_name = 
"Linear" 26 | 27 | def __init__(self, params): 28 | super(LinearAlgorithm, self).__init__(params) 29 | logger.debug("LinearAlgorithm.__init__") 30 | self.max_iters = 1 31 | self.library_version = sklearn.__version__ 32 | self.model = LogisticRegression( 33 | max_iter=500, tol=5e-4, n_jobs=self.params.get("n_jobs", -1) 34 | ) 35 | 36 | def is_fitted(self): 37 | return ( 38 | hasattr(self.model, "coef_") 39 | and self.model.coef_ is not None 40 | and self.model.coef_.shape[0] > 0 41 | ) 42 | 43 | def file_extension(self): 44 | return "linear" 45 | 46 | def interpret( 47 | self, 48 | X_train, 49 | y_train, 50 | X_validation, 51 | y_validation, 52 | model_file_path, 53 | learner_name, 54 | target_name=None, 55 | class_names=None, 56 | metric_name=None, 57 | ml_task=None, 58 | explain_level=2, 59 | ): 60 | super(LinearAlgorithm, self).interpret( 61 | X_train, 62 | y_train, 63 | X_validation, 64 | y_validation, 65 | model_file_path, 66 | learner_name, 67 | target_name, 68 | class_names, 69 | metric_name, 70 | ml_task, 71 | explain_level, 72 | ) 73 | if explain_level == 0: 74 | return 75 | if X_train.shape[1] > 100: 76 | # if too many columns, skip this step 77 | return 78 | coefs = self.model.coef_ 79 | intercept = self.model.intercept_ 80 | if self.params["ml_task"] == BINARY_CLASSIFICATION: 81 | df = pd.DataFrame( 82 | { 83 | "feature": ["intercept"] + X_train.columns.tolist(), 84 | "weight": [intercept[0]] + list(coefs[0, :]), 85 | } 86 | ) 87 | df.to_csv( 88 | os.path.join(model_file_path, f"{learner_name}_coefs.csv"), index=False 89 | ) 90 | elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: 91 | classes = list(class_names) 92 | if isinstance(class_names, dict): 93 | classes = class_names.values() 94 | if len(classes) > 20: 95 | # if there are too many classes, skip this step 96 | return 97 | df = pd.DataFrame( 98 | np.transpose(np.column_stack((intercept, coefs))), 99 | index=["intercept"] + X_train.columns.tolist(), 100 | columns=classes, 101 | ) 102 | 
df.to_csv( 103 | os.path.join(model_file_path, f"{learner_name}_coefs.csv"), index=True 104 | ) 105 | 106 | 107 | class LinearRegressorAlgorithm(RegressorMixin, SklearnAlgorithm): 108 | algorithm_name = "Linear Regression" 109 | algorithm_short_name = "Linear" 110 | 111 | def __init__(self, params): 112 | super(LinearRegressorAlgorithm, self).__init__(params) 113 | logger.debug("LinearRegressorAlgorithm.__init__") 114 | self.max_iters = 1 115 | self.library_version = sklearn.__version__ 116 | self.model = LinearRegression(n_jobs=self.params.get("n_jobs", -1)) 117 | 118 | def is_fitted(self): 119 | return ( 120 | hasattr(self.model, "coef_") 121 | and self.model.coef_ is not None 122 | and self.model.coef_.shape[0] > 0 123 | ) 124 | 125 | def file_extension(self): 126 | return "linear" 127 | 128 | def interpret( 129 | self, 130 | X_train, 131 | y_train, 132 | X_validation, 133 | y_validation, 134 | model_file_path, 135 | learner_name, 136 | target_name=None, 137 | class_names=None, 138 | metric_name=None, 139 | ml_task=None, 140 | explain_level=2, 141 | ): 142 | super(LinearRegressorAlgorithm, self).interpret( 143 | X_train, 144 | y_train, 145 | X_validation, 146 | y_validation, 147 | model_file_path, 148 | learner_name, 149 | target_name, 150 | class_names, 151 | metric_name, 152 | ml_task, 153 | explain_level, 154 | ) 155 | if explain_level == 0: 156 | return 157 | if X_train.shape[1] > 100: 158 | # if too many columns, skip this step 159 | return 160 | coefs = self.model.coef_ 161 | intercept = self.model.intercept_ 162 | df = pd.DataFrame( 163 | { 164 | "feature": ["intercept"] + X_train.columns.tolist(), 165 | "weight": [intercept] + list(coefs), 166 | } 167 | ) 168 | df.to_csv( 169 | os.path.join(model_file_path, f"{learner_name}_coefs.csv"), index=False 170 | ) 171 | 172 | 173 | additional = {"max_steps": 1, "max_rows_limit": None, "max_cols_limit": None} 174 | required_preprocessing = [ 175 | "missing_values_inputation", 176 | "convert_categorical", 177 | 
"datetime_transform", 178 | "text_transform", 179 | "scale", 180 | "target_as_integer", 181 | ] 182 | 183 | AlgorithmsRegistry.add( 184 | BINARY_CLASSIFICATION, LinearAlgorithm, {}, required_preprocessing, additional, {} 185 | ) 186 | AlgorithmsRegistry.add( 187 | MULTICLASS_CLASSIFICATION, 188 | LinearAlgorithm, 189 | {}, 190 | required_preprocessing, 191 | additional, 192 | {}, 193 | ) 194 | 195 | regression_required_preprocessing = [ 196 | "missing_values_inputation", 197 | "convert_categorical", 198 | "datetime_transform", 199 | "text_transform", 200 | "scale", 201 | "target_scale", 202 | ] 203 | 204 | AlgorithmsRegistry.add( 205 | REGRESSION, 206 | LinearRegressorAlgorithm, 207 | {}, 208 | regression_required_preprocessing, 209 | additional, 210 | {}, 211 | ) 212 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/lightgbm.py: -------------------------------------------------------------------------------- ```python 1 | import lightgbm as lgb 2 | import numpy as np 3 | import optuna 4 | import optuna_integration 5 | import pandas as pd 6 | 7 | from supervised.algorithms.lightgbm import lightgbm_eval_metric, lightgbm_objective 8 | from supervised.algorithms.registry import ( 9 | MULTICLASS_CLASSIFICATION, 10 | ) 11 | from supervised.utils.metric import ( 12 | Metric, 13 | lightgbm_eval_metric_accuracy, 14 | lightgbm_eval_metric_average_precision, 15 | lightgbm_eval_metric_f1, 16 | lightgbm_eval_metric_pearson, 17 | lightgbm_eval_metric_r2, 18 | lightgbm_eval_metric_spearman, 19 | lightgbm_eval_metric_user_defined, 20 | ) 21 | 22 | EPS = 1e-8 23 | 24 | 25 | class LightgbmObjective: 26 | def __init__( 27 | self, 28 | ml_task, 29 | X_train, 30 | y_train, 31 | sample_weight, 32 | X_validation, 33 | y_validation, 34 | sample_weight_validation, 35 | eval_metric, 36 | cat_features_indices, 37 | n_jobs, 38 | random_state, 39 | ): 40 | self.X_train = X_train 41 | self.y_train = y_train 42 | self.sample_weight = 
sample_weight 43 | self.X_validation = X_validation 44 | self.y_validation = y_validation 45 | self.sample_weight_validation = sample_weight_validation 46 | self.dtrain = lgb.Dataset( 47 | self.X_train.to_numpy() 48 | if isinstance(self.X_train, pd.DataFrame) 49 | else self.X_train, 50 | label=self.y_train, 51 | weight=self.sample_weight, 52 | ) 53 | self.dvalid = lgb.Dataset( 54 | self.X_validation.to_numpy() 55 | if isinstance(self.X_validation, pd.DataFrame) 56 | else self.X_validation, 57 | label=self.y_validation, 58 | weight=self.sample_weight_validation, 59 | ) 60 | 61 | self.cat_features_indices = cat_features_indices 62 | self.eval_metric = eval_metric 63 | self.learning_rate = 0.025 64 | self.rounds = 1000 65 | self.early_stopping_rounds = 50 66 | self.seed = random_state 67 | 68 | self.n_jobs = n_jobs 69 | if n_jobs == -1: 70 | self.n_jobs = 0 71 | 72 | self.objective = "" 73 | self.eval_metric_name = "" 74 | 75 | self.eval_metric_name, self.custom_eval_metric_name = lightgbm_eval_metric( 76 | ml_task, eval_metric.name 77 | ) 78 | 79 | self.custom_eval_metric = None 80 | if self.eval_metric.name == "r2": 81 | self.custom_eval_metric = lightgbm_eval_metric_r2 82 | elif self.eval_metric.name == "spearman": 83 | self.custom_eval_metric = lightgbm_eval_metric_spearman 84 | elif self.eval_metric.name == "pearson": 85 | self.custom_eval_metric = lightgbm_eval_metric_pearson 86 | elif self.eval_metric.name == "f1": 87 | self.custom_eval_metric = lightgbm_eval_metric_f1 88 | elif self.eval_metric.name == "average_precision": 89 | self.custom_eval_metric = lightgbm_eval_metric_average_precision 90 | elif self.eval_metric.name == "accuracy": 91 | self.custom_eval_metric = lightgbm_eval_metric_accuracy 92 | elif self.eval_metric.name == "user_defined_metric": 93 | self.custom_eval_metric = lightgbm_eval_metric_user_defined 94 | 95 | self.num_class = ( 96 | len(np.unique(y_train)) if ml_task == MULTICLASS_CLASSIFICATION else None 97 | ) 98 | self.objective = 
lightgbm_objective(ml_task, eval_metric.name) 99 | 100 | def __call__(self, trial): 101 | param = { 102 | "objective": self.objective, 103 | "metric": self.eval_metric_name, 104 | "verbosity": -1, 105 | "boosting_type": "gbdt", 106 | "learning_rate": trial.suggest_categorical( 107 | "learning_rate", [0.0125, 0.025, 0.05, 0.1] 108 | ), 109 | "num_leaves": trial.suggest_int("num_leaves", 2, 2048), 110 | "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True), 111 | "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True), 112 | "feature_fraction": min( 113 | trial.suggest_float("feature_fraction", 0.3, 1.0 + EPS), 1.0 114 | ), 115 | "bagging_fraction": min( 116 | trial.suggest_float("bagging_fraction", 0.3, 1.0 + EPS), 1.0 117 | ), 118 | "bagging_freq": trial.suggest_int("bagging_freq", 1, 7), 119 | "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100), 120 | "feature_pre_filter": False, 121 | "seed": self.seed, 122 | "num_threads": self.n_jobs, 123 | "extra_trees": trial.suggest_categorical("extra_trees", [True, False]), 124 | } 125 | 126 | if self.cat_features_indices: 127 | param["cat_feature"] = self.cat_features_indices 128 | param["cat_l2"] = trial.suggest_float("cat_l2", EPS, 100.0) 129 | param["cat_smooth"] = trial.suggest_float("cat_smooth", EPS, 100.0) 130 | 131 | if self.num_class is not None: 132 | param["num_class"] = self.num_class 133 | 134 | try: 135 | metric_name = self.eval_metric_name 136 | if metric_name == "custom": 137 | metric_name = self.custom_eval_metric_name 138 | pruning_callback = optuna_integration.LightGBMPruningCallback( 139 | trial, metric_name, "validation" 140 | ) 141 | early_stopping_callback = lgb.early_stopping( 142 | self.early_stopping_rounds, verbose=False 143 | ) 144 | 145 | gbm = lgb.train( 146 | param, 147 | self.dtrain, 148 | valid_sets=[self.dvalid], 149 | valid_names=["validation"], 150 | callbacks=[pruning_callback, early_stopping_callback], 151 | num_boost_round=self.rounds, 152 | 
class PreprocessingTuner:

    """
    Builds the preprocessing configuration (per input column and for the
    target) that a given algorithm requires for a given dataset description.
    """

    CATEGORICALS_MIX = "categorical_mix"  # mix int and one-hot
    CATEGORICALS_ALL_INT = "categoricals_all_integers"

    @staticmethod
    def _steps_for_column(required_preprocessing, needed, categorical_strategy):
        """
        Return the ordered list of preprocessing steps for a single column.

        `needed` is the collection of flags describing the column and
        `required_preprocessing` the steps the algorithm asks for.
        """
        # empty columns and columns with a single value are only removed
        if "empty_column" in needed or "constant_column" in needed:
            return ["remove_column"]

        steps = []

        # missing values are always handled first
        if (
            "missing_values_inputation" in required_preprocessing
            and "missing_values" in needed
        ):
            steps.append(PreprocessingMissingValues.FILL_NA_MEDIAN)

        # categorical encoding: only when the algorithm needs it
        # and the feature is categorical
        encoded_as_integer = False
        if "convert_categorical" in required_preprocessing and "categorical" in needed:
            use_one_hot = (
                categorical_strategy == PreprocessingTuner.CATEGORICALS_MIX
                and PreprocessingCategorical.MANY_CATEGORIES not in needed
            )
            if use_one_hot:
                steps.append(PreprocessingCategorical.CONVERT_ONE_HOT)
            else:
                steps.append(PreprocessingCategorical.CONVERT_INTEGER)
                encoded_as_integer = True  # scaling may be needed afterwards

        for transform in ("datetime_transform", "text_transform"):
            if transform in required_preprocessing and transform in needed:
                steps.append(transform)

        if "scale" in required_preprocessing and (
            encoded_as_integer or "scale" in needed
        ):
            steps.append(Scale.SCALE_NORMAL)

        return steps

    @staticmethod
    def get(
        required_preprocessing,
        data_info,
        machinelearning_task,
        categorical_strategy=CATEGORICALS_ALL_INT,
    ):
        """
        Prepare the preprocessing configuration.

        Reads `data_info["columns_info"]` (column -> flags) and
        `data_info["target_info"]`, and returns a dict with the keys
        "columns_preprocessing", "target_preprocessing" and "ml_task".
        Columns that need no preprocessing are omitted from the result.
        """
        columns_preprocessing = {}
        for col, preprocessing_needed in data_info["columns_info"].items():
            steps = PreprocessingTuner._steps_for_column(
                required_preprocessing, preprocessing_needed, categorical_strategy
            )
            # remember only the columns that need some preprocessing
            if steps:
                columns_preprocessing[col] = steps

        target_info = data_info["target_info"]
        # always remove missing values from the target; they can appear
        # both in the train and in the validation datasets
        target_preprocessing = [PreprocessingMissingValues.NA_EXCLUDE]

        if "target_as_integer" in required_preprocessing:
            binary_needs_encoding = (
                machinelearning_task == BINARY_CLASSIFICATION
                and "convert_0_1" in target_info
            )
            if binary_needs_encoding:
                target_preprocessing.append(PreprocessingCategorical.CONVERT_INTEGER)

            if machinelearning_task == MULTICLASS_CLASSIFICATION:
                # always convert to integers: class labels may start from 1
                # or skip values (for example 0, 2, 3, 4)
                target_preprocessing.append(PreprocessingCategorical.CONVERT_INTEGER)
        elif "target_as_one_hot" in required_preprocessing:
            target_preprocessing.append(PreprocessingCategorical.CONVERT_ONE_HOT)

        scale_target = (
            machinelearning_task == REGRESSION
            and "target_scale" in required_preprocessing
        )
        if scale_target:
            if "scale_log" in target_info:
                target_preprocessing.append(Scale.SCALE_LOG_AND_NORMAL)
            elif "scale" in target_info:
                target_preprocessing.append(Scale.SCALE_NORMAL)

        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
            "ml_task": machinelearning_task,
        }
def predict_proba_function_binary(estimator, X):
    """
    Return column 1 of `estimator.predict_proba(X)`.

    NOTE(review): presumably this is the positive-class probability of a
    binary classifier - confirm against the estimator's class ordering.
    """
    probabilities = estimator.predict_proba(X)
    return probabilities[:, 1]
sample_weight_validation=None, 105 | log_to_file=None, 106 | max_time=None, 107 | ): 108 | max_steps = self.max_steps 109 | n_estimators = 0 110 | 111 | min_val = 10e12 112 | min_e = 0 113 | 114 | p_tr, p_vd = None, None 115 | result = {"iteration": [], "train": [], "validation": []} 116 | 117 | start_time = time.time() 118 | with warnings.catch_warnings(): 119 | warnings.simplefilter(action="ignore") 120 | 121 | for i in range(max_steps): 122 | self.model.fit(X, np.ravel(y), sample_weight=sample_weight) 123 | self.model.n_estimators += self.trees_in_step 124 | 125 | if X_validation is None or y_validation is None: 126 | continue 127 | estimators = self.model.estimators_ 128 | 129 | stop = False 130 | for e in range(n_estimators, len(estimators)): 131 | p = self.predict_function(estimators[e], X) 132 | if p_tr is None: 133 | p_tr = p 134 | else: 135 | p_tr += p 136 | 137 | p = self.predict_function(estimators[e], X_validation) 138 | if p_vd is None: 139 | p_vd = p 140 | else: 141 | p_vd += p 142 | 143 | tr = self.log_metric( 144 | y, p_tr / float(e + 1), sample_weight=sample_weight 145 | ) 146 | vd = self.log_metric( 147 | y_validation, 148 | p_vd / float(e + 1), 149 | sample_weight=sample_weight_validation, 150 | ) 151 | 152 | if vd < min_val: # optimize direction 153 | min_val = vd 154 | min_e = e 155 | 156 | if e - min_e >= self.early_stopping_rounds: 157 | stop = True 158 | break 159 | 160 | result["iteration"] += [e] 161 | result["train"] += [tr] 162 | result["validation"] += [vd] 163 | 164 | # disable for now ... 
165 | # if max_time is not None and time.time()-start_time > max_time: 166 | # stop = True 167 | 168 | if stop: 169 | self.model.estimators_ = estimators[: (min_e + 1)] 170 | break 171 | n_estimators = len(estimators) 172 | 173 | if log_to_file is not None: 174 | df_result = pd.DataFrame(result) 175 | if self.log_metric.is_negative(): 176 | df_result["train"] *= -1.0 177 | df_result["validation"] *= -1.0 178 | df_result.to_csv(log_to_file, index=False, header=False) 179 | 180 | self.classes_ = np.unique(y) 181 | 182 | def get_metric_name(self): 183 | return self.params.get("eval_metric_name", "logloss") 184 | 185 | 186 | def predict_function(estimator, X): 187 | return estimator.predict(X) 188 | 189 | 190 | class SklearnTreesEnsembleRegressorAlgorithm(SklearnTreesEnsembleClassifierAlgorithm): 191 | def __init__(self, params): 192 | super(SklearnTreesEnsembleRegressorAlgorithm, self).__init__(params) 193 | self.log_metric = Metric({"name": self.params.get("eval_metric_name", "rmse")}) 194 | self.predict_function = predict_function 195 | 196 | def get_metric_name(self): 197 | return self.params.get("eval_metric_name", "rmse") 198 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_preprocessing_missing.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from supervised.preprocessing.preprocessing_missing import PreprocessingMissingValues 7 | 8 | 9 | class PreprocessingMissingValuesTest(unittest.TestCase): 10 | def test_preprocessing_constructor(self): 11 | """ 12 | Check if PreprocessingMissingValues object is properly initialized 13 | """ 14 | preprocess_missing = PreprocessingMissingValues( 15 | PreprocessingMissingValues.FILL_NA_MEDIAN 16 | ) 17 | self.assertEqual( 18 | preprocess_missing._na_fill_method, 19 | PreprocessingMissingValues.FILL_NA_MEDIAN, 20 | ) 21 | 
self.assertEqual(preprocess_missing._na_fill_params, {}) 22 | 23 | def test_get_fill_value(self): 24 | """ 25 | Check if correct value is returned for filling in case of different 26 | column type and fill method 27 | """ 28 | d = {"col1": [1, 2, 3, np.nan, np.nan], "col2": ["a", "a", np.nan, "b", "c"]} 29 | df = pd.DataFrame(data=d) 30 | # fill with median 31 | preprocess_missing = PreprocessingMissingValues( 32 | df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 33 | ) 34 | self.assertEqual(preprocess_missing._get_fill_value(df["col1"]), 2) 35 | self.assertEqual(preprocess_missing._get_fill_value(df["col2"]), "a") 36 | # fill with mean 37 | preprocess_missing = PreprocessingMissingValues( 38 | df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 39 | ) 40 | self.assertEqual(preprocess_missing._get_fill_value(df["col1"]), 2) 41 | self.assertEqual(preprocess_missing._get_fill_value(df["col2"]), "a") 42 | # fill with min 43 | preprocess_missing = PreprocessingMissingValues( 44 | df.columns, PreprocessingMissingValues.FILL_NA_MIN 45 | ) 46 | self.assertEqual(preprocess_missing._get_fill_value(df["col1"]), 0) 47 | self.assertEqual( 48 | preprocess_missing._get_fill_value(df["col2"]), "_missing_value_" 49 | ) # added new value 50 | 51 | def test_fit_na_fill(self): 52 | """ 53 | Check fit private method 54 | """ 55 | d = { 56 | "col1": [1, 2, 3, np.nan, np.nan], 57 | "col2": ["a", "a", np.nan, "b", "c"], 58 | "col3": ["a", "a", "d", "b", "c"], 59 | } 60 | df = pd.DataFrame(data=d) 61 | # fill with median 62 | preprocess_missing = PreprocessingMissingValues( 63 | df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 64 | ) 65 | preprocess_missing._fit_na_fill(df) 66 | self.assertTrue("col1" in preprocess_missing._na_fill_params) 67 | self.assertTrue("col2" in preprocess_missing._na_fill_params) 68 | self.assertTrue("col3" not in preprocess_missing._na_fill_params) 69 | self.assertEqual(2, preprocess_missing._na_fill_params["col1"]) 70 | self.assertEqual("a", 
preprocess_missing._na_fill_params["col2"]) 71 | # fill with mean 72 | preprocess_missing = PreprocessingMissingValues( 73 | df.columns, PreprocessingMissingValues.FILL_NA_MEAN 74 | ) 75 | preprocess_missing._fit_na_fill(df) 76 | self.assertTrue("col1" in preprocess_missing._na_fill_params) 77 | self.assertTrue("col2" in preprocess_missing._na_fill_params) 78 | self.assertTrue("col3" not in preprocess_missing._na_fill_params) 79 | self.assertEqual(2, preprocess_missing._na_fill_params["col1"]) 80 | self.assertEqual("a", preprocess_missing._na_fill_params["col2"]) 81 | # fill with min 82 | preprocess_missing = PreprocessingMissingValues( 83 | df.columns, PreprocessingMissingValues.FILL_NA_MIN 84 | ) 85 | preprocess_missing._fit_na_fill(df) 86 | self.assertTrue("col1" in preprocess_missing._na_fill_params) 87 | self.assertTrue("col2" in preprocess_missing._na_fill_params) 88 | self.assertTrue("col3" not in preprocess_missing._na_fill_params) 89 | self.assertEqual(0, preprocess_missing._na_fill_params["col1"]) 90 | self.assertEqual("_missing_value_", preprocess_missing._na_fill_params["col2"]) 91 | 92 | def test_transform(self): 93 | """ 94 | Check transform 95 | """ 96 | # training data 97 | d = { 98 | "col1": [1, 2, 3, np.nan, np.nan], 99 | "col2": ["a", "a", np.nan, "a", "c"], 100 | "col3": [1, 1, 3, 1, 1], 101 | "col4": ["a", "a", "a", "c", "a"], 102 | } 103 | df = pd.DataFrame(data=d) 104 | # test data 105 | d_test = { 106 | "col1": [1, 2, 3, np.nan, np.nan], 107 | "col2": ["b", "b", np.nan, "b", "c"], 108 | "col3": [1, 2, 2, np.nan, 2], 109 | "col4": ["b", "b", np.nan, "b", "c"], 110 | } 111 | df_test = pd.DataFrame(data=d_test) 112 | # fill with median 113 | preprocess_missing = PreprocessingMissingValues( 114 | df.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 115 | ) 116 | preprocess_missing.fit(df) 117 | self.assertEqual( 118 | 2, len(preprocess_missing._na_fill_params) 119 | ) # there should be only two columns 120 | df_transformed = 
preprocess_missing.transform(df_test) 121 | self.assertTrue( 122 | np.isnan(df.loc[3, "col1"]) 123 | ) # training data frame is not filled 124 | self.assertEqual( 125 | 2, df_test.loc[3, "col1"] 126 | ) # data frame is filled after transform 127 | self.assertEqual("a", df_test.loc[2, "col2"]) 128 | 129 | # it is disabled, should be treated separately at the end of preprocessing 130 | # columns without missing values in training set are also filled 131 | # but they are filled based on their own values 132 | # self.assertEqual(2, df_test.loc[3, "col3"]) 133 | # self.assertEqual("b", df_test.loc[3, "col4"]) 134 | 135 | def test_transform_on_new_data(self): 136 | # training data 137 | d = { 138 | "col1": [1, 1, np.nan, 3], 139 | "col2": ["a", "a", np.nan, "a"], 140 | "col3": [1, 1, 1, 3], 141 | "col4": ["a", "a", "b", "c"], 142 | "y": [0, 1, 1, 1], 143 | } 144 | df = pd.DataFrame(data=d) 145 | X_train = df.loc[:, ["col1", "col2", "col3", "col4"]] 146 | y_train = df.loc[:, "y"] 147 | 148 | d_test = { 149 | "col1": [1, 1, np.nan, 3], 150 | "col2": ["a", "a", np.nan, "a"], 151 | "col3": [1, 1, 1, 3], 152 | "col4": ["a", "a", "b", "c"], 153 | "y": [np.nan, 1, np.nan, 1], 154 | } 155 | df_test = pd.DataFrame(data=d_test) 156 | X_test = df_test.loc[:, ["col1", "col2", "col3", "col4"]] 157 | y_test = df_test.loc[:, "y"] 158 | 159 | pm = PreprocessingMissingValues( 160 | X_train.columns, PreprocessingMissingValues.FILL_NA_MEDIAN 161 | ) 162 | pm.fit(X_train) 163 | X_train = pm.transform(X_train) 164 | X_test = pm.transform(X_test) 165 | 166 | self.assertEqual(1, X_test.loc[2, "col1"]) 167 | self.assertEqual("a", X_test.loc[2, "col2"]) 168 | 169 | 170 | if __name__ == "__main__": 171 | unittest.main() 172 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_categorical_integers.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import pandas 
as pd 4 | 5 | from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical 6 | 7 | import warnings 8 | 9 | 10 | class CategoricalIntegersTest(unittest.TestCase): 11 | def test_constructor_preprocessing_categorical(self): 12 | """ 13 | Check if PreprocessingCategorical object is properly initialized 14 | """ 15 | categorical = PreprocessingCategorical( 16 | [], PreprocessingCategorical.CONVERT_INTEGER 17 | ) 18 | self.assertEqual( 19 | categorical._convert_method, PreprocessingCategorical.CONVERT_INTEGER 20 | ) 21 | self.assertEqual(categorical._convert_params, {}) 22 | 23 | def test_fit_integers(self): 24 | # training data 25 | d = { 26 | "col1": [1, 2, 3], 27 | "col2": ["a", "a", "c"], 28 | "col3": [1, 1, 3], 29 | "col4": ["a", "b", "c"], 30 | } 31 | df = pd.DataFrame(data=d) 32 | categorical = PreprocessingCategorical( 33 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 34 | ) 35 | categorical.fit(df) 36 | 37 | self.assertTrue("col2" in categorical._convert_params) 38 | self.assertTrue("col4" in categorical._convert_params) 39 | self.assertTrue("a" in categorical._convert_params["col2"]) 40 | self.assertTrue("c" in categorical._convert_params["col2"]) 41 | self.assertTrue("b" not in categorical._convert_params["col2"]) 42 | self.assertTrue("a" in categorical._convert_params["col4"]) 43 | self.assertTrue("b" in categorical._convert_params["col4"]) 44 | self.assertTrue("c" in categorical._convert_params["col4"]) 45 | 46 | def test_fit_transform_integers(self): 47 | # training data 48 | d = { 49 | "col1": [1, 2, 3], 50 | "col2": ["a", "a", "c"], 51 | "col3": [1, 1, 3], 52 | "col4": ["a", "b", "c"], 53 | } 54 | df = pd.DataFrame(data=d) 55 | categorical = PreprocessingCategorical( 56 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 57 | ) 58 | categorical.fit(df) 59 | df = categorical.transform(df) 60 | for col in ["col1", "col2", "col3", "col4"]: 61 | self.assertTrue(col in df.columns) 62 | self.assertEqual(df["col2"][0], 0) 63 
| self.assertEqual(df["col2"][1], 0) 64 | self.assertEqual(df["col2"][2], 1) 65 | self.assertEqual(df["col4"][0], 0) 66 | self.assertEqual(df["col4"][1], 1) 67 | self.assertEqual(df["col4"][2], 2) 68 | 69 | def test_future_warning_pandas_transform(self): 70 | with warnings.catch_warnings(): 71 | warnings.simplefilter("error") 72 | 73 | # training data 74 | d = { 75 | "col1": [False, True, True], 76 | "col2": [False, False, True], 77 | "col3": [True, False, True], 78 | } 79 | df = pd.DataFrame(data=d) 80 | categorical = PreprocessingCategorical( 81 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 82 | ) 83 | categorical.fit(df) 84 | 85 | df = categorical.transform(df).astype(int) 86 | 87 | def test_future_warning_pandas_inverse_transform(self): 88 | with warnings.catch_warnings(): 89 | warnings.simplefilter("error") 90 | 91 | # training data 92 | d = { 93 | "col1": [False, True, True], 94 | "col2": [False, False, True], 95 | "col3": [True, False, True], 96 | } 97 | df = pd.DataFrame(data=d) 98 | categorical = PreprocessingCategorical( 99 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 100 | ) 101 | categorical.fit(df) 102 | 103 | df = categorical.transform(df).astype(int) 104 | df = categorical.inverse_transform(df) 105 | 106 | def test_fit_transform_inverse_transform_integers(self): 107 | # training data 108 | d = { 109 | "col1": [1, 2, 3], 110 | "col2": ["a", "a", "c"], 111 | "col3": [1, 1, 3], 112 | "col4": ["a", "b", "c"], 113 | } 114 | df = pd.DataFrame(data=d) 115 | categorical = PreprocessingCategorical( 116 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 117 | ) 118 | categorical.fit(df) 119 | df_transform = categorical.transform(df).astype(int) 120 | df_inverse = categorical.inverse_transform(df_transform) 121 | for col in ["col1", "col2", "col3", "col4"]: 122 | self.assertTrue(col in df_inverse.columns) 123 | self.assertEqual(d["col2"][0], df_inverse["col2"][0]) 124 | self.assertEqual(d["col2"][1], df_inverse["col2"][1]) 125 | 
self.assertEqual(d["col2"][2], df_inverse["col2"][2]) 126 | self.assertEqual(d["col4"][0], df_inverse["col4"][0]) 127 | self.assertEqual(d["col4"][1], df_inverse["col4"][1]) 128 | self.assertEqual(d["col4"][2], df_inverse["col4"][2]) 129 | 130 | def test_fit_transform_integers_with_new_values(self): 131 | # training data 132 | d_train = { 133 | "col1": [1, 2, 3], 134 | "col2": ["a", "a", "c"], 135 | "col3": [1, 1, 3], 136 | "col4": ["a", "b", "c"], 137 | } 138 | df_train = pd.DataFrame(data=d_train) 139 | categorical = PreprocessingCategorical( 140 | df_train.columns, PreprocessingCategorical.CONVERT_INTEGER 141 | ) 142 | categorical.fit(df_train) 143 | # testing data 144 | d = { 145 | "col1": [1, 2, 3], 146 | "col2": ["a", "d", "f"], 147 | "col3": [1, 1, 3], 148 | "col4": ["e", "b", "z"], 149 | } 150 | df = pd.DataFrame(data=d) 151 | df = categorical.transform(df) 152 | for col in ["col1", "col2", "col3", "col4"]: 153 | self.assertTrue(col in df.columns) 154 | self.assertEqual(df["col2"][0], 0) 155 | self.assertEqual(df["col2"][1], 2) # new values get higher indexes 156 | self.assertEqual(df["col2"][2], 3) # new values get higher indexes 157 | self.assertEqual(df["col4"][0], 3) # new values get higher indexes 158 | self.assertEqual(df["col4"][1], 1) 159 | self.assertEqual(df["col4"][2], 4) # new values get higher indexes 160 | 161 | def test_to_and_from_json_convert_integers(self): 162 | # training data 163 | d = { 164 | "col1": [1, 2, 3], 165 | "col2": ["a", "a", "c"], 166 | "col3": [1, 1, 3], 167 | "col4": ["a", "b", "c"], 168 | } 169 | df = pd.DataFrame(data=d) 170 | cat1 = PreprocessingCategorical( 171 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 172 | ) 173 | cat1.fit(df) 174 | 175 | cat2 = PreprocessingCategorical( 176 | df.columns, PreprocessingCategorical.CONVERT_INTEGER 177 | ) 178 | cat2.from_json(cat1.to_json()) 179 | df = cat2.transform(df) 180 | for col in ["col1", "col2", "col3", "col4"]: 181 | self.assertTrue(col in df.columns) 182 | 
self.assertEqual(df["col2"][0], 0) 183 | self.assertEqual(df["col2"][1], 0) 184 | self.assertEqual(df["col2"][2], 1) 185 | self.assertEqual(df["col4"][0], 0) 186 | self.assertEqual(df["col4"][1], 1) 187 | self.assertEqual(df["col4"][2], 2) 188 | 189 | 190 | if __name__ == "__main__": 191 | unittest.main() 192 | ``` -------------------------------------------------------------------------------- /tests/tests_validation/test_validator_kfold.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | import pytest 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from supervised.utils.utils import dump_data 10 | from supervised.validation.validator_kfold import KFoldValidator 11 | 12 | 13 | class KFoldValidatorTest(unittest.TestCase): 14 | def test_create(self): 15 | with tempfile.TemporaryDirectory() as results_path: 16 | data = { 17 | "X": pd.DataFrame( 18 | np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"] 19 | ), 20 | "y": pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"]), 21 | } 22 | 23 | X_path = os.path.join(results_path, "X.data") 24 | y_path = os.path.join(results_path, "y.data") 25 | 26 | dump_data(X_path, data["X"]) 27 | dump_data(y_path, data["y"]) 28 | 29 | params = { 30 | "shuffle": False, 31 | "stratify": True, 32 | "k_folds": 2, 33 | "results_path": results_path, 34 | "X_path": X_path, 35 | "y_path": y_path, 36 | } 37 | vl = KFoldValidator(params) 38 | 39 | self.assertEqual(params["k_folds"], vl.get_n_splits()) 40 | # for train, validation in vl.split(): 41 | for k_fold in range(vl.get_n_splits()): 42 | train, validation = vl.get_split(k_fold) 43 | 44 | X_train, y_train = train.get("X"), train.get("y") 45 | X_validation, y_validation = validation.get("X"), validation.get("y") 46 | 47 | self.assertEqual(X_train.shape[0], 2) 48 | self.assertEqual(y_train.shape[0], 2) 49 | self.assertEqual(X_validation.shape[0], 2) 50 | 
self.assertEqual(y_validation.shape[0], 2) 51 | 52 | def test_missing_target_values(self): 53 | with tempfile.TemporaryDirectory() as results_path: 54 | data = { 55 | "X": pd.DataFrame( 56 | np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]), 57 | columns=["a", "b"], 58 | ), 59 | "y": pd.DataFrame( 60 | np.array(["a", "b", "a", "b", np.nan, np.nan]), columns=["target"] 61 | ), 62 | } 63 | 64 | X_path = os.path.join(results_path, "X.data") 65 | y_path = os.path.join(results_path, "y.data") 66 | 67 | dump_data(X_path, data["X"]) 68 | dump_data(y_path, data["y"]) 69 | 70 | params = { 71 | "shuffle": False, 72 | "stratify": True, 73 | "k_folds": 2, 74 | "results_path": results_path, 75 | "X_path": X_path, 76 | "y_path": y_path, 77 | } 78 | vl = KFoldValidator(params) 79 | 80 | self.assertEqual(params["k_folds"], vl.get_n_splits()) 81 | 82 | for k_fold in range(vl.get_n_splits()): 83 | train, validation = vl.get_split(k_fold) 84 | X_train, y_train = train.get("X"), train.get("y") 85 | X_validation, y_validation = validation.get("X"), validation.get("y") 86 | 87 | self.assertEqual(X_train.shape[0], 3) 88 | self.assertEqual(y_train.shape[0], 3) 89 | self.assertEqual(X_validation.shape[0], 3) 90 | self.assertEqual(y_validation.shape[0], 3) 91 | 92 | def test_create_with_target_as_labels(self): 93 | with tempfile.TemporaryDirectory() as results_path: 94 | data = { 95 | "X": pd.DataFrame( 96 | np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"] 97 | ), 98 | "y": pd.DataFrame(np.array(["a", "b", "a", "b"]), columns=["target"]), 99 | } 100 | 101 | X_path = os.path.join(results_path, "X.data") 102 | y_path = os.path.join(results_path, "y.data") 103 | 104 | dump_data(X_path, data["X"]) 105 | dump_data(y_path, data["y"]) 106 | 107 | params = { 108 | "shuffle": True, 109 | "stratify": True, 110 | "k_folds": 2, 111 | "results_path": results_path, 112 | "X_path": X_path, 113 | "y_path": y_path, 114 | } 115 | vl = KFoldValidator(params) 116 | 117 | 
self.assertEqual(params["k_folds"], vl.get_n_splits()) 118 | 119 | for k_fold in range(vl.get_n_splits()): 120 | train, validation = vl.get_split(k_fold) 121 | X_train, y_train = train.get("X"), train.get("y") 122 | X_validation, y_validation = validation.get("X"), validation.get("y") 123 | 124 | self.assertEqual(X_train.shape[0], 2) 125 | self.assertEqual(y_train.shape[0], 2) 126 | self.assertEqual(X_validation.shape[0], 2) 127 | self.assertEqual(y_validation.shape[0], 2) 128 | 129 | def test_repeats(self): 130 | with tempfile.TemporaryDirectory() as results_path: 131 | data = { 132 | "X": pd.DataFrame( 133 | np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"] 134 | ), 135 | "y": pd.DataFrame(np.array([0, 0, 1, 1]), columns=["target"]), 136 | } 137 | 138 | X_path = os.path.join(results_path, "X.data") 139 | y_path = os.path.join(results_path, "y.data") 140 | 141 | dump_data(X_path, data["X"]) 142 | dump_data(y_path, data["y"]) 143 | 144 | params = { 145 | "shuffle": True, 146 | "stratify": False, 147 | "k_folds": 2, 148 | "repeats": 10, 149 | "results_path": results_path, 150 | "X_path": X_path, 151 | "y_path": y_path, 152 | "random_seed": 1, 153 | } 154 | vl = KFoldValidator(params) 155 | 156 | self.assertEqual(params["k_folds"], vl.get_n_splits()) 157 | self.assertEqual(params["repeats"], vl.get_repeats()) 158 | 159 | for repeat in range(vl.get_repeats()): 160 | for k_fold in range(vl.get_n_splits()): 161 | train, validation = vl.get_split(k_fold, repeat) 162 | 163 | X_train, y_train = train.get("X"), train.get("y") 164 | X_validation, y_validation = validation.get("X"), validation.get( 165 | "y" 166 | ) 167 | 168 | self.assertEqual(X_train.shape[0], 2) 169 | self.assertEqual(y_train.shape[0], 2) 170 | self.assertEqual(X_validation.shape[0], 2) 171 | self.assertEqual(y_validation.shape[0], 2) 172 | 173 | def test_disable_repeats_when_disabled_shuffle(self): 174 | with tempfile.TemporaryDirectory() as results_path: 175 | data = { 176 | "X": 
class SplitValidatorTest(unittest.TestCase):
    """Row-count, repeat and warning checks for ``SplitValidator``."""

    @staticmethod
    def _eight_rows():
        """Return the 8-row feature and target frames shared by several tests."""
        features = pd.DataFrame(
            np.array(
                [[0, 0], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0], [1, 1]]
            ),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array([0, 0, 1, 0, 1, 0, 1, 1]), columns=["target"]
        )
        return features, target

    @staticmethod
    def _four_rows_with_labels():
        """Return the 4-row frames whose target holds string labels."""
        features = pd.DataFrame(
            np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), columns=["a", "b"]
        )
        target = pd.DataFrame(np.array(["a", "b", "a", "b"]), columns=["target"])
        return features, target

    @staticmethod
    def _params(results_path, features, target, shuffle, stratify, repeats=None):
        """Dump both frames under ``results_path`` and build validator params.

        ``repeats`` is added as the last key only when it is given.
        """
        X_path = os.path.join(results_path, "X.data")
        y_path = os.path.join(results_path, "y.data")
        dump_data(X_path, features)
        dump_data(y_path, target)

        params = {
            "shuffle": shuffle,
            "stratify": stratify,
            "train_ratio": 0.5,
            "results_path": results_path,
            "X_path": X_path,
            "y_path": y_path,
        }
        if repeats is not None:
            params["repeats"] = repeats
        return params

    def _assert_rows(self, train, validation, expected):
        """Assert that X and y of both subsets have ``expected`` rows."""
        for subset in (train, validation):
            for key in ("X", "y"):
                self.assertEqual(subset.get(key).shape[0], expected)

    def test_create(self):
        features, target = self._eight_rows()
        with tempfile.TemporaryDirectory() as results_path:
            params = self._params(
                results_path, features, target, shuffle=False, stratify=False
            )
            validator = SplitValidator(params)

            self.assertEqual(1, validator.get_n_splits())
            for fold in range(validator.get_n_splits()):
                train, validation = validator.get_split(fold)
                self._assert_rows(train, validation, 4)

    def test_missing_target_values(self):
        # NOTE(review): np.array over mixed str / np.nan input stores the
        # "missing" entries as the string "nan" -- confirm this is intended.
        features = pd.DataFrame(
            np.array([[1, 0], [2, 1], [3, 0], [4, 1], [5, 1], [6, 1]]),
            columns=["a", "b"],
        )
        target = pd.DataFrame(
            np.array(["a", "b", np.nan, "a", "b", np.nan]), columns=["target"]
        )
        with tempfile.TemporaryDirectory() as results_path:
            params = self._params(
                results_path, features, target, shuffle=False, stratify=False
            )
            validator = SplitValidator(params)

            self.assertEqual(1, validator.get_n_splits())
            for fold in range(validator.get_n_splits()):
                train, validation = validator.get_split(fold)
                self._assert_rows(train, validation, 3)

    def test_create_with_target_as_labels(self):
        features, target = self._four_rows_with_labels()
        with tempfile.TemporaryDirectory() as results_path:
            params = self._params(
                results_path, features, target, shuffle=True, stratify=True
            )
            validator = SplitValidator(params)

            self.assertEqual(1, validator.get_n_splits())
            for fold in range(validator.get_n_splits()):
                train, validation = validator.get_split(fold)
                self._assert_rows(train, validation, 2)

    def test_repeats(self):
        features, target = self._eight_rows()
        with tempfile.TemporaryDirectory() as results_path:
            params = self._params(
                results_path,
                features,
                target,
                shuffle=True,
                stratify=False,
                repeats=3,
            )
            validator = SplitValidator(params)

            self.assertEqual(1, validator.get_n_splits())
            self.assertEqual(3, validator.get_repeats())

            visited = 0
            for repeat in range(validator.get_repeats()):
                for fold in range(validator.get_n_splits()):
                    train, validation = validator.get_split(fold, repeat)
                    self._assert_rows(train, validation, 4)
                    visited += 1

            # one split per repeat
            self.assertEqual(visited, 3)

    def test_disable_repeats_when_disabled_shuffle(self):
        features, target = self._eight_rows()
        with tempfile.TemporaryDirectory() as results_path:
            params = self._params(
                results_path,
                features,
                target,
                shuffle=False,
                stratify=False,
                repeats=3,
            )

            with pytest.warns(
                expected_warning=UserWarning,
                match="Disable repeats in validation because shuffle is disabled",
            ) as record:
                validator = SplitValidator(params)

            # exactly one warning is expected
            self.assertEqual(len(record), 1)

            self.assertEqual(1, validator.get_n_splits())
            self.assertEqual(1, validator.get_repeats())
class AdditionalPlots:
    """Build diagnostic figures for model predictions and add them to a report.

    Every ``plots_*`` builder returns a list of dicts with the keys
    ``"title"``, ``"fname"`` and ``"figure"``; ``append`` consumes that list.
    The builders are best-effort: an exception is printed and the figures
    created so far are returned.
    """

    @staticmethod
    def _add_plot(figures, title, fname, draw):
        """Create a 10x7 single-axes figure, let ``draw(ax)`` fill it, record it.

        The entry is appended only after ``draw`` returned without raising.
        """
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(1, 1, 1)
        draw(ax)
        figures.append({"title": title, "fname": fname, "figure": fig})

    @staticmethod
    def plots_binary(target, predicted_labels, predicted_probas):
        """Return confusion-matrix, ROC, KS, PR, calibration, gain and lift plots.

        Parameters
        ----------
        target
            True labels; must expose ``.values`` (pandas Series or one-column
            DataFrame).
        predicted_labels
            Predicted class labels, passed to the confusion-matrix plots.
        predicted_probas
            Predicted probabilities, passed to the curve plots.

        Returns
        -------
        list of dict
            One ``{"title", "fname", "figure"}`` entry per plot that was built.
        """
        figures = []
        add = AdditionalPlots._add_plot

        def draw_calibration(ax):
            # the calibration plot needs a {0, 1} target; remap other label
            # pairs in sorted order (first unique value -> 0, second -> 1)
            uniq = np.unique(target)
            y_true = target.values.ravel()
            if not (0 in uniq and 1 in uniq):
                # works on the ravelled array, so Series and single-column
                # DataFrame targets both end up as a 1-D ndarray
                y_true = np.where(y_true == uniq[1], 1, 0)
            skplt.metrics.plot_calibration_curve(
                y_true, [predicted_probas], ["Classifier"], ax=ax
            )

        try:
            add(
                figures,
                "Confusion Matrix",
                "confusion_matrix.png",
                lambda ax: skplt.metrics.plot_confusion_matrix(
                    target, predicted_labels, normalize=False, ax=ax
                ),
            )
            add(
                figures,
                "Normalized Confusion Matrix",
                "confusion_matrix_normalized.png",
                lambda ax: skplt.metrics.plot_confusion_matrix(
                    target, predicted_labels, normalize=True, ax=ax
                ),
            )
            add(
                figures,
                "ROC Curve",
                "roc_curve.png",
                lambda ax: skplt.metrics.plot_roc(target, predicted_probas, ax=ax),
            )
            add(
                figures,
                "Kolmogorov-Smirnov Statistic",
                "ks_statistic.png",
                lambda ax: skplt.metrics.plot_ks_statistic(
                    target, predicted_probas, ax=ax
                ),
            )
            add(
                figures,
                "Precision-Recall Curve",
                "precision_recall_curve.png",
                lambda ax: skplt.metrics.plot_precision_recall(
                    target, predicted_probas, ax=ax
                ),
            )
            # file name kept as-is; other code may look it up by this name
            add(
                figures,
                "Calibration Curve",
                "calibration_curve_curve.png",
                draw_calibration,
            )
            add(
                figures,
                "Cumulative Gains Curve",
                "cumulative_gains_curve.png",
                lambda ax: skplt.metrics.plot_cumulative_gain(
                    target, predicted_probas, ax=ax
                ),
            )
            add(
                figures,
                "Lift Curve",
                "lift_curve.png",
                lambda ax: skplt.metrics.plot_lift_curve(
                    target, predicted_probas, ax=ax
                ),
            )
        except Exception as e:
            print(str(e))
        finally:
            # release the figures from pyplot's registry, as the sibling
            # builders do; the Figure objects in `figures` stay usable
            plt.close("all")

        return figures

    @staticmethod
    def plots_multiclass(target, predicted_labels, predicted_probas):
        """Return confusion-matrix, ROC and precision-recall plots.

        Parameters and return value follow ``plots_binary``.
        """
        figures = []
        add = AdditionalPlots._add_plot
        try:
            add(
                figures,
                "Confusion Matrix",
                "confusion_matrix.png",
                lambda ax: skplt.metrics.plot_confusion_matrix(
                    target, predicted_labels, normalize=False, ax=ax
                ),
            )
            add(
                figures,
                "Normalized Confusion Matrix",
                "confusion_matrix_normalized.png",
                lambda ax: skplt.metrics.plot_confusion_matrix(
                    target, predicted_labels, normalize=True, ax=ax
                ),
            )
            add(
                figures,
                "ROC Curve",
                "roc_curve.png",
                lambda ax: skplt.metrics.plot_roc(target, predicted_probas, ax=ax),
            )
            add(
                figures,
                "Precision Recall Curve",
                "precision_recall_curve.png",
                lambda ax: skplt.metrics.plot_precision_recall(
                    target, predicted_probas, ax=ax
                ),
            )
        except Exception as e:
            print(str(e))
        finally:
            plt.close("all")

        return figures

    @staticmethod
    def plots_regression(target, predictions):
        """Return a true-vs-predicted scatter and a residual plot.

        At most the first ``MAX_SAMPLES`` rows are drawn. ``target`` and
        ``predictions`` must support slicing and expose ``.values``.
        """
        MAX_SAMPLES = 5000
        figures = []
        try:
            samples = min(target.shape[0], MAX_SAMPLES)

            fig = plt.figure(figsize=(10, 7))
            ax1 = fig.add_subplot(1, 1, 1)
            ax1.scatter(
                target[:samples], predictions[:samples], c="tab:blue", alpha=0.2
            )
            ax1.set_xlabel("True values")
            ax1.set_ylabel("Predicted values")
            ax1.set_title(f"Target values vs Predicted values (samples={samples})")
            fig.tight_layout(pad=5.0)
            figures.append(
                {
                    "title": "True vs Predicted",
                    "fname": "true_vs_predicted.png",
                    "figure": fig,
                }
            )

            # residual plot
            fig = plt.figure(figsize=(10, 7))
            ax1 = fig.add_subplot(1, 1, 1)
            residuals = target[:samples].values - predictions[:samples].values
            ax1.scatter(predictions[:samples], residuals, c="tab:blue", alpha=0.2)
            ax1.set_xlabel("Predicted values")
            ax1.set_ylabel("Residuals")
            ax1.set_title(f"Predicted values vs Residuals (samples={samples})")
            fig.tight_layout(pad=5.0)

            # narrow axes glued to the right edge of the scatter: a horizontal
            # histogram of the residuals sharing the scatter's vertical extent
            bb = ax1.get_position()
            ax2 = fig.add_axes((bb.x0 + bb.size[0], bb.y0, 0.05, bb.size[1]))
            ax2.set_xticklabels([])
            ax2.set_yticklabels([])
            ax2.hist(residuals, 50, orientation="horizontal", alpha=0.5)
            ax2.axis("off")

            figures.append(
                {
                    "title": "Predicted vs Residuals",
                    "fname": "predicted_vs_residuals.png",
                    "figure": fig,
                }
            )
        except Exception as e:
            print(str(e))
        finally:
            plt.close("all")

        return figures

    @staticmethod
    def append(fout, model_path, plots):
        """Save every figure into ``model_path`` and reference it in ``fout``.

        For each entry the figure is written to ``model_path/fname`` and a
        Markdown heading plus an image link (relative ``fname``) is written to
        ``fout``. Errors are printed, not raised.

        Parameters
        ----------
        fout
            Object with a ``write(str)`` method receiving the Markdown.
        model_path
            Directory in which the image files are stored.
        plots
            Entries as produced by the ``plots_*`` builders.
        """
        try:
            for plot in plots:
                fname = plot.get("fname")
                fig = plot.get("figure")
                title = plot.get("title", "")
                fig.savefig(os.path.join(model_path, fname))
                fout.write(f"\n## {title}\n\n")
                # embed the image that was just saved next to the report
                fout.write(f"\n![{title}]({fname})\n\n")
        except Exception as e:
            print(str(e))
def test_transform(self):
    """Fitted binarizers replace each categorical column with indicator columns."""
    train = pd.DataFrame(data={"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]})

    # one binarizer per categorical column, fitted on the training frame
    binarizer_col1 = LabelBinarizer()
    binarizer_col1.fit(train, "col1")
    binarizer_col2 = LabelBinarizer()
    binarizer_col2.fit(train, "col2")

    result = pd.DataFrame(
        data={"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]}
    )
    result = binarizer_col1.transform(result, "col1")
    result = binarizer_col2.transform(result, "col2")

    # two-valued column: a single indicator remains, the source column is gone
    self.assertIn("col1_c", result.columns)
    self.assertNotIn("col1", result.columns)
    self.assertEqual(2, np.sum(result["col1_c"]))

    # three-valued column: one indicator per value, the source column is gone
    for indicator in ("col2_w", "col2_e", "col2_d"):
        self.assertIn(indicator, result.columns)
    self.assertNotIn("col2", result.columns)
    for indicator in ("col2_w", "col2_e", "col2_d"):
        self.assertEqual(1, np.sum(result[indicator]))

    # the numeric column is left in place
    self.assertIn("col3", result.columns)
pd.DataFrame(data=d_test) 82 | # transform 83 | df_test = lb1.transform(df_test, "col1") 84 | df_test = lb2.transform(df_test, "col2") 85 | self.assertTrue("col1_c" in df_test.columns) 86 | self.assertTrue("col1_d" not in df_test.columns) 87 | self.assertTrue("col2_w" in df_test.columns) 88 | self.assertTrue("col2_e" in df_test.columns) 89 | self.assertTrue("col2_d" in df_test.columns) 90 | self.assertTrue("col2_g" not in df_test.columns) 91 | self.assertTrue("col2_f" not in df_test.columns) 92 | self.assertEqual(df_test["col1_c"][0], 1) 93 | self.assertEqual(df_test["col1_c"][1], 0) 94 | self.assertEqual(df_test["col1_c"][2], 0) 95 | self.assertEqual(np.sum(df_test["col2_w"]), 0) 96 | self.assertEqual(np.sum(df_test["col2_d"]), 0) 97 | self.assertEqual(df_test["col2_e"][0], 0) 98 | self.assertEqual(df_test["col2_e"][1], 1) 99 | self.assertEqual(df_test["col2_e"][2], 0) 100 | 101 | def test_to_and_from_json(self): 102 | # training data 103 | d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} 104 | df = pd.DataFrame(data=d) 105 | # fit binarizer 106 | lb1 = LabelBinarizer() 107 | lb1.fit(df, "col1") 108 | lb2 = LabelBinarizer() 109 | lb2.fit(df, "col2") 110 | # test data 111 | d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]} 112 | df_test = pd.DataFrame(data=d_test) 113 | # to json and from json 114 | new_lb1 = LabelBinarizer() 115 | new_lb2 = LabelBinarizer() 116 | new_lb1.from_json(lb1.to_json()) 117 | new_lb2.from_json(lb2.to_json()) 118 | # transform 119 | df_test = new_lb1.transform(df_test, "col1") 120 | df_test = new_lb2.transform(df_test, "col2") 121 | # for binary column, only one value is left, old column should be deleted 122 | self.assertTrue("col1_c" in df_test.columns) 123 | self.assertTrue("col1" not in df_test.columns) 124 | self.assertEqual(2, np.sum(df_test["col1_c"])) 125 | # for multiple value colum, all columns should be added 126 | self.assertTrue("col2_w" in df_test.columns) 127 | self.assertTrue("col2_e" in 
def test_to_and_from_json_booleans(self):
    """Binarizers restored from JSON transform string and boolean columns.

    The boolean binarizer state goes through a real ``json.dumps`` /
    ``json.loads`` round trip before it is loaded.
    """
    train = pd.DataFrame(
        data={"col1": ["a", "a", "c"], "col2": [True, True, False]}
    )
    fitted_col1 = LabelBinarizer()
    fitted_col1.fit(train, "col1")
    fitted_col2 = LabelBinarizer()
    fitted_col2.fit(train, "col2")

    result = pd.DataFrame(
        data={
            "col1": ["c", "c", "a"],
            "col2": [False, False, True],
            "col3": [2, 3, 4],
        }
    )

    # rebuild both binarizers from their serialized state
    restored_col1 = LabelBinarizer()
    restored_col2 = LabelBinarizer()
    restored_col1.from_json(fitted_col1.to_json())
    serialized = json.dumps(fitted_col2.to_json(), indent=4)
    restored_col2.from_json(json.loads(serialized))

    result = restored_col1.transform(result, "col1")
    result = restored_col2.transform(result, "col2")

    # two-valued string column: single indicator, source column removed
    self.assertIn("col1_c", result.columns)
    self.assertNotIn("col1", result.columns)
    self.assertEqual(2, np.sum(result["col1_c"]))

    # boolean column: single indicator, source column removed
    self.assertIn("col2_True", result.columns)
    self.assertNotIn("col2", result.columns)
    self.assertEqual(1, np.sum(result["col2_True"]))

    # the numeric column is left in place
    self.assertIn("col3", result.columns)
class TimeController:
    """Split the AutoML time budget between training steps and models.

    The controller stores the training time of every model reported through
    ``log_time`` and uses it, together with the wall-clock time elapsed since
    ``start_time``, to decide whether another step or model may be trained.
    """

    # Share of the total budget assigned to each step. Only the steps that
    # are actually planned take part; their shares are re-normalised.
    _STEP_RATIOS = {
        "default_algorithms": 0.3,
        "not_so_random": 0.35,
        "mix_encoding": 0.05,
        "golden_features": 0.05,
        "kmeans_features": 0.05,
        "insert_random_feature": 0.05,
        "features_selection": 0.05,
        "hill_climbing_1": 0.2,  # one entry stands for all hill climbing steps
        "boost_on_errors": 0.05,
        "stack": 0.2,
    }
    # Steps that are skipped when the cumulative budget is computed.
    _UNBUDGETED_STEPS = (
        "adjust_validation",
        "simple_algorithms",
        "ensemble",
        "ensemble_stacked",
    )
    # Algorithms that are left out when a step budget is divided per algorithm.
    _NOT_TUNED = ("Baseline", "Linear", "Decision Tree", "Nearest Neighbors")
    # Value returned as "time to use" when no total limit is set (7 days).
    _NO_LIMIT_SECONDS = 7 * 24 * 3600

    def __init__(
        self, start_time, total_time_limit, model_time_limit, steps, algorithms
    ):
        """Store the limits and the planned steps and algorithms.

        Parameters
        ----------
        start_time : float
            Timestamp (as from ``time.time()``) at which training started.
        total_time_limit : float or None
            Budget in seconds for the whole run; ``None`` disables the limit.
        model_time_limit : float or None
            Budget in seconds for a single model; ``None`` disables it.
        steps : list of str
            Names of the planned steps.
        algorithms : list of str
            Names of the algorithms to train.
        """
        self._start_time = start_time
        self._total_time_limit = total_time_limit
        self._model_time_limit = model_time_limit
        self._steps = steps
        self._algorithms = algorithms
        self._spend = []  # one record per trained model, see log_time
        self._is_hill_climbing = "hill_climbing_1" in steps
        self._is_stacking = "stack" in steps

    def to_json(self):
        """Return the controller state as a dict (the start time is not kept)."""
        return {
            "total_time_limit": self._total_time_limit,
            "model_time_limit": self._model_time_limit,
            "steps": self._steps,
            "algorithms": self._algorithms,
            "spend": self._spend,
            "is_hill_climbing": self._is_hill_climbing,
            "is_stacking": self._is_stacking,
        }

    @staticmethod
    def from_json(data):
        """Rebuild a controller from ``to_json`` output.

        The start time is set to "now minus the training time already
        logged". Returns ``None`` when ``data`` is ``None`` or cannot be
        loaded. A missing or empty ``"spend"`` entry means no time was spent.
        """
        if data is None:
            return None
        try:
            tc = TimeController(
                time.time(),
                data.get("total_time_limit"),
                data.get("model_time_limit"),
                data.get("steps"),
                data.get("algorithms"),
            )
            tc._spend = data.get("spend") or []
            tc._start_time -= tc.already_spend()  # update time with already spend
            return tc
        except Exception as e:
            logger.error(f"Cant load TimeController from json, {str(e)}")
        return None

    def already_spend(self):
        """Return the summed training time of all logged models."""
        return np.sum([s["train_time"] for s in self._spend])

    def time_should_use(self, fit_level):
        """Return the number of seconds budgeted for the step ``fit_level``.

        Without a total limit this is 7 days. For a step with a budget share
        it is ``total_time_limit * share / sum(shares of planned steps)``;
        every ``hill_climbing_*`` step gets an equal part of the hill
        climbing share. Steps without a share get 0, and so does any step
        when none of the planned steps has a share.
        """
        if self._total_time_limit is None:
            return self._NO_LIMIT_SECONDS

        is_hill_climbing = "hill_climbing" in fit_level
        if not is_hill_climbing and fit_level not in self._STEP_RATIOS:
            return 0

        planned = sum(v for k, v in self._STEP_RATIOS.items() if k in self._steps)
        if planned == 0:
            # nothing with a budget share is planned; avoid dividing by zero
            return 0

        share_key = "hill_climbing_1" if is_hill_climbing else fit_level
        ratio = self._STEP_RATIOS[share_key] / planned

        if is_hill_climbing:
            hill_climbing_cnt = sum(1 for s in self._steps if "hill_climbing" in s)
            # guard against a hill climbing level that is not among the steps
            ratio /= float(max(hill_climbing_cnt, 1))

        return self._total_time_limit * ratio

    def compound_time_should_use(self, fit_level):
        """Return the budget summed over the planned steps up to ``fit_level``.

        Steps are visited in their planned order; ``fit_level`` itself is
        included. Steps listed in ``_UNBUDGETED_STEPS`` are skipped.
        """
        compound = 0
        for step in self._steps:
            if step in self._UNBUDGETED_STEPS:
                continue
            compound += self.time_should_use(step)
            if fit_level == step:
                break
        return compound

    def enough_time_for_step(self, fit_level):
        """Return ``False`` once the elapsed time exceeds the cumulative budget.

        ``ensemble``, ``ensemble_stacked`` and ``fairness`` always get time.
        """
        if fit_level in ["ensemble", "ensemble_stacked", "fairness"]:
            return True
        total_time_spend = time.time() - self._start_time
        # past the cumulative budget -> do not train more in this step
        return not total_time_spend > self.compound_time_should_use(fit_level)

    def enough_time_for_model(self, model_type):
        """Return whether an average model of ``model_type`` fits the time left.

        The time left is the total limit minus the logged training time. A
        model type without logged trainings is treated as costing nothing, so
        it is allowed whenever the time left is not negative.
        """
        if self._total_time_limit is None:
            return True

        time_left = self._total_time_limit - self.already_spend()
        spend = [s["train_time"] for s in self._spend if s["model_type"] == model_type]
        if not spend:
            # np.mean([]) would be NaN (with a RuntimeWarning) and reject
            # every model type that has not been trained yet
            return bool(time_left >= 0)
        return bool(np.mean(spend) <= time_left)

    def enough_time(self, model_type, step):
        """
        Check if there is enough time to train the next model.

        Parameters
        ----------
        model_type : str
            String with type of the model.

        step: str
            String with name of the step in the process of AutoML training.


        Returns
        -------
        bool
            `True` if there is time for training next model, `False` otherwise.
        """
        if step in ["ensemble", "ensemble_stacked"]:
            return True
        # if model_time_limit is set, train every model
        # do not apply total_time_limit
        if self._model_time_limit is not None:
            return True
        # no total time limit, just train, dont ask
        if self._total_time_limit is None:
            return True

        total_time_spend = time.time() - self._start_time
        time_left = self._total_time_limit - total_time_spend
        # no time left, do not train any more models
        if time_left < 0:
            return False

        # we dont want to spend too much time on one step
        if not self.enough_time_for_step(step):
            return False

        # there is still time and model_type was not tested yet, try it
        if time_left > 0 and self.model_spend(model_type) == 0:
            return True

        # stacked models converge faster, no need to check further
        if step == "stack":
            return True

        # check if there is enough time for model to train
        return self.enough_time_for_model(model_type)

    def learner_time_limit(self, model_type, fit_level, k_folds):
        """Return the time limit in seconds for one learner (one fold).

        Returns 7 days without a total limit, ``model_time_limit / k_folds``
        when a model limit is set, and otherwise the step budget divided by
        the number of tuned algorithms and by ``k_folds``. ``None`` (no
        limit) is returned for ``simple_algorithms``, ``default_algorithms``,
        when no tuned algorithm is configured, and for any other step.

        NOTE(review): the total-limit check runs first, so a model limit is
        ignored when ``total_time_limit`` is ``None`` -- confirm intended.
        """
        if self._total_time_limit is None:
            return self._NO_LIMIT_SECONDS

        if self._model_time_limit is not None:
            return self._model_time_limit / k_folds

        # just train them ...
        if fit_level in ("simple_algorithms", "default_algorithms"):
            return None

        tune_algs_cnt = len(
            [a for a in self._algorithms if a not in self._NOT_TUNED]
        )
        if tune_algs_cnt == 0:
            return None

        if fit_level == "not_so_random" or "hill_climbing" in fit_level:
            tt = self.time_should_use(fit_level)
        elif self._is_stacking and fit_level == "stack":
            # stacking gets whatever wall-clock time is left
            tt = self._total_time_limit - (time.time() - self._start_time)
        else:
            return None

        tt /= tune_algs_cnt  # give time equally for each algorithm
        tt /= k_folds  # time is per learner (per fold)
        return tt

    def log_time(self, model_name, model_type, fit_level, train_time):
        """Record the training time of one model."""
        self._spend += [
            {
                "model_name": model_name,
                "model_type": model_type,
                "fit_level": fit_level,
                "train_time": train_time,
            }
        ]

    def step_spend(self, step):
        """Return the summed training time logged for ``step``."""
        return np.sum([s["train_time"] for s in self._spend if s["fit_level"] == step])

    def model_spend(self, model_type):
        """Return the summed training time logged for ``model_type``."""
        return np.sum(
            [s["train_time"] for s in self._spend if s["model_type"] == model_type]
        )
self.best_models[learner.uid] = None 47 | self.best_model_paths[learner.uid] = None 48 | self.best_y_predicted[learner.uid] = None 49 | 50 | def on_learner_train_start(self, logs): 51 | self.no_improvement_cnt = 0 52 | 53 | def on_framework_train_end(self, logs): 54 | # aggregate predictions from all learners 55 | # it has two columns: 'prediction', 'target' 56 | logger.debug("early stopping on framework train end") 57 | self.best_y_oof = pd.concat(list(self.best_y_predicted.values())) 58 | self.best_y_oof.sort_index(inplace=True) 59 | # check for duplicates in index -> repeats of validation 60 | if np.sum(self.best_y_oof.index.duplicated()): 61 | # we need to aggregate predictions from multiple repeats 62 | target_cols = [c for c in self.best_y_oof.columns if "prediction" not in c] 63 | prediction_cols = [c for c in self.best_y_oof.columns if "prediction" in c] 64 | 65 | aggs = {} 66 | for t in target_cols: 67 | aggs[t] = "first" 68 | for p in prediction_cols: 69 | aggs[p] = "mean" 70 | # aggregate predictions from repeats 71 | self.best_y_oof = self.best_y_oof.groupby( 72 | target_cols + prediction_cols, level=0 73 | ).agg(aggs) 74 | 75 | sample_weight = None 76 | if "sample_weight" in self.best_y_oof.columns: 77 | sample_weight = self.best_y_oof["sample_weight"] 78 | 79 | if "prediction" in self.best_y_oof: 80 | self.final_loss = self.metric( 81 | self.best_y_oof[self.target_columns], 82 | self.best_y_oof["prediction"], 83 | sample_weight=sample_weight, 84 | ) 85 | else: 86 | prediction_cols = [c for c in self.best_y_oof.columns if "prediction" in c] 87 | self.final_loss = self.metric( 88 | self.best_y_oof[self.target_columns], 89 | self.best_y_oof[prediction_cols], 90 | sample_weight=sample_weight, 91 | ) 92 | 93 | def on_iteration_end(self, logs, predictions): 94 | train_loss = 0 95 | if predictions.get("y_train_predicted") is not None: 96 | train_loss = self.metric( 97 | predictions.get("y_train_true"), 98 | predictions.get("y_train_predicted"), 99 | 
predictions.get("sample_weight"), 100 | ) 101 | 102 | validation_loss = self.metric( 103 | predictions.get("y_validation_true"), 104 | predictions.get("y_validation_predicted"), 105 | predictions.get("sample_weight_validation"), 106 | ) 107 | self.loss_values[self.learner.uid]["train"] += [train_loss] 108 | self.loss_values[self.learner.uid]["validation"] += [validation_loss] 109 | self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")] 110 | 111 | if self.metric.improvement( 112 | previous=self.best_loss[self.learner.uid], current=validation_loss 113 | ): 114 | y_validation_true = predictions.get("y_validation_true") 115 | self.no_improvement_cnt = 0 116 | self.best_iter[self.learner.uid] = logs.get("iter_cnt") 117 | self.best_loss[self.learner.uid] = validation_loss 118 | 119 | if len(y_validation_true.shape) == 1 or y_validation_true.shape[1] == 1: 120 | self.best_y_predicted[self.learner.uid] = pd.DataFrame( 121 | { 122 | "target": np.array(y_validation_true) 123 | # y_validation_true.values.reshape( 124 | # y_validation_true.shape[0] 125 | # ) 126 | }, 127 | index=predictions.get("validation_index"), 128 | ) 129 | self.multiple_target = False 130 | self.target_columns = "target" 131 | else: 132 | # in case of Neural Networks and multi-class classification with one-hot encoding 133 | self.best_y_predicted[self.learner.uid] = pd.DataFrame( 134 | y_validation_true, index=predictions.get("validation_index") 135 | ) 136 | self.multiple_target = True 137 | self.target_columns = y_validation_true.columns 138 | 139 | y_validation_predicted = predictions.get("y_validation_predicted") 140 | 141 | if len(y_validation_predicted.shape) == 1: 142 | # only one prediction column (binary classification or regression) 143 | col = predictions.get("validation_columns", "prediction") 144 | self.best_y_predicted[self.learner.uid][col] = np.array( 145 | y_validation_predicted 146 | ) 147 | else: 148 | # several columns in multiclass classification 149 | cols = 
predictions.get("validation_columns") 150 | for i_col in range(y_validation_predicted.shape[1]): 151 | self.best_y_predicted[self.learner.uid][ 152 | # "prediction_{}".format(i_col) 153 | cols[i_col] 154 | ] = y_validation_predicted[:, i_col] 155 | 156 | # store sample_weight 157 | sample_weight_validation = predictions.get("sample_weight_validation") 158 | if sample_weight_validation is not None: 159 | self.best_y_predicted[self.learner.uid]["sample_weight"] = np.array( 160 | sample_weight_validation 161 | ) 162 | # store sensitive features 163 | sensitive_features_validation = predictions.get( 164 | "sensitive_features_validation" 165 | ) 166 | 167 | if sensitive_features_validation is not None: 168 | for col in list(sensitive_features_validation.columns): 169 | self.best_y_predicted[self.learner.uid][ 170 | f"sensitive_{col}" 171 | ] = np.array(sensitive_features_validation[col]) 172 | 173 | self.best_models[self.learner.uid] = self.learner.copy() 174 | # if local copy is not available, save model and keep path 175 | if self.best_models[self.learner.uid] is None: 176 | self.best_model_paths[self.learner.uid] = self.learner.save() 177 | else: 178 | self.no_improvement_cnt += 1 179 | 180 | if self.no_improvement_cnt > self.max_no_improvement_cnt: 181 | self.learner.stop_training = True 182 | 183 | logger.info( 184 | "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, " 185 | "no improvement cnt {}, iters {}".format( 186 | train_loss, 187 | validation_loss, 188 | self.no_improvement_cnt, 189 | len(self.loss_values[self.learner.uid]["iters"]), 190 | ) 191 | ) 192 | 193 | if self.log_to_dir is not None and self.learner.algorithm_short_name not in [ 194 | "Xgboost", 195 | "Random Forest", 196 | "Extra Trees", 197 | "LightGBM", 198 | "CatBoost", 199 | "Neural Network", 200 | ]: 201 | sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0 202 | with open( 203 | os.path.join(self.log_to_dir, f"{self.learner.name}_training.log"), "a" 204 | ) 
as fout: 205 | iteration = len(self.loss_values[self.learner.uid]["iters"]) 206 | fout.write(f"{iteration},{sign*train_loss},{sign*validation_loss}\n") 207 | 208 | def get_status(self): 209 | return "Train loss: {}, Validation loss: {} @ iteration {}".format( 210 | self.loss_values[self.learner.uid]["train"][-1], 211 | self.loss_values[self.learner.uid]["validation"][-1], 212 | len(self.loss_values[self.learner.uid]["iters"]), 213 | ) 214 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/decision_tree.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | import warnings 4 | 5 | import numpy as np 6 | import sklearn 7 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin 8 | from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor 9 | 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.algorithms.sklearn import SklearnAlgorithm 17 | from supervised.utils.config import LOG_LEVEL 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(LOG_LEVEL) 21 | 22 | import dtreeviz 23 | from sklearn.tree import _tree 24 | 25 | from supervised.utils.subsample import subsample 26 | 27 | 28 | def get_rules(tree, feature_names, class_names): 29 | tree_ = tree.tree_ 30 | feature_name = [ 31 | feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" 
def get_rules(tree, feature_names, class_names):
    """Describe every leaf of a fitted decision tree as a readable rule.

    Each rule has the form ``if <cond> and <cond> then <outcome> | based on
    <n> samples``. With ``class_names=None`` the outcome is the rounded leaf
    response; otherwise it is the name of the class with the largest leaf
    value together with its share in percent. Rules are ordered from the
    leaf with the most samples to the leaf with the fewest.
    """
    fitted = tree.tree_
    split_names = [
        "undefined!" if idx == _tree.TREE_UNDEFINED else feature_names[idx]
        for idx in fitted.feature
    ]

    # Walk the tree with an explicit stack. The right child is pushed first
    # so that the left subtree is visited first, i.e. leaves are collected
    # in the same order as a left-first recursive traversal.
    leaves = []
    pending = [(0, [])]
    while pending:
        node, conditions = pending.pop()
        if fitted.feature[node] == _tree.TREE_UNDEFINED:
            leaf_info = (fitted.value[node], fitted.n_node_samples[node])
            leaves.append(conditions + [leaf_info])
            continue
        name = split_names[node]
        threshold = fitted.threshold[node]
        pending.append(
            (
                fitted.children_right[node],
                conditions + [f"({name} > {np.round(threshold, 3)})"],
            )
        )
        pending.append(
            (
                fitted.children_left[node],
                conditions + [f"({name} <= {np.round(threshold, 3)})"],
            )
        )

    # sort by samples count, largest leaf first
    samples_count = [leaf[-1][1] for leaf in leaves]
    ordered = [leaves[pos] for pos in np.argsort(samples_count)[::-1]]

    rules = []
    for leaf in ordered:
        *conditions, (value, n_samples) = leaf
        text = "if " + " and ".join(str(cond) for cond in conditions) + " then "
        if class_names is None:
            text += "response: " + str(np.round(value[0][0], 3))
        else:
            classes = value[0]
            top = np.argmax(classes)
            text += f"class: {class_names[top]} (proba: {np.round(100.0*classes[top]/np.sum(classes),2)}%)"
        text += f" | based on {n_samples:,} samples"
        rules.append(text)

    return rules


def save_rules(tree, feature_names, class_names, model_file_path, learner_name):
    """Write the rules of ``tree`` to ``<learner_name>_rules.txt``.

    The file is created in ``model_file_path`` with one rule per paragraph.
    Any failure is logged and swallowed, so a problem here never interrupts
    the caller.
    """
    try:
        rules = get_rules(tree, feature_names, class_names)
        rules_path = os.path.join(model_file_path, f"{learner_name}_rules.txt")
        with open(rules_path, "w") as fout:
            fout.writelines(rule + "\n\n" for rule in rules)
    except Exception as e:
        logger.info(f"Problem with extracting decision tree rules. {str(e)}")
class DecisionTreeAlgorithm(ClassifierMixin, SklearnAlgorithm):
    """Decision tree classifier built on ``DecisionTreeClassifier``.

    Keys read from ``params``: ``criterion`` (default ``"gini"``),
    ``max_depth`` (default 3) and ``seed`` (default 1).
    """

    algorithm_name = "Decision Tree"
    algorithm_short_name = "Decision Tree"

    def __init__(self, params):
        super().__init__(params)
        logger.debug("DecisionTreeAlgorithm.__init__")
        self.library_version = sklearn.__version__
        # `additional` is the module-level dict defined further down this file.
        self.max_iters = additional.get("max_steps", 1)
        tree_kwargs = {
            "criterion": params.get("criterion", "gini"),
            "max_depth": params.get("max_depth", 3),
            "random_state": params.get("seed", 1),
        }
        self.model = DecisionTreeClassifier(**tree_kwargs)

    def file_extension(self):
        """Return the file extension used for this algorithm."""
        return "decision_tree"

    def interpret(
        self,
        X_train,
        y_train,
        X_validation,
        y_validation,
        model_file_path,
        learner_name,
        target_name=None,
        class_names=None,
        metric_name=None,
        ml_task=None,
        explain_level=2,
    ):
        """Run the parent interpretation, then save a tree plot and text rules.

        Nothing more is done when ``explain_level`` is 0. Otherwise the tree
        is rendered with dtreeviz to ``<learner_name>_tree.svg`` and its
        rules are written with ``save_rules``; both files go to
        ``model_file_path``. Plotting problems are logged, not raised.
        """
        super().interpret(
            X_train,
            y_train,
            X_validation,
            y_validation,
            model_file_path,
            learner_name,
            target_name,
            class_names,
            metric_name,
            ml_task,
            explain_level,
        )
        if explain_level == 0:
            return

        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore")
            try:
                # dtreeviz does not support more than 10 classes
                # NOTE(review): this return also skips save_rules() below -
                # confirm that is intended.
                if len(class_names) > 10:
                    return

                tree_plot = dtreeviz.model(
                    self.model,
                    X_train,
                    y_train,
                    target_name="target",
                    feature_names=X_train.columns,
                    class_names=class_names,
                )
                svg_path = os.path.join(model_file_path, learner_name + "_tree.svg")
                tree_plot.view().save(svg_path)
            except Exception as e:
                logger.info(f"Problem when visualizing decision tree. {str(e)}")

            save_rules(
                self.model, X_train.columns, class_names, model_file_path, learner_name
            )
class DecisionTreeRegressorAlgorithm(RegressorMixin, SklearnAlgorithm):
    """Decision tree regressor built on ``DecisionTreeRegressor``.

    Keys read from ``params``: ``criterion`` (default ``"squared_error"``),
    ``max_depth`` (default 3) and ``seed`` (default 1).
    """

    algorithm_name = "Decision Tree"
    algorithm_short_name = "Decision Tree"

    def __init__(self, params):
        super().__init__(params)
        logger.debug("DecisionTreeRegressorAlgorithm.__init__")
        self.library_version = sklearn.__version__
        # `additional` is the module-level dict defined further down this file.
        self.max_iters = additional.get("max_steps", 1)
        self.model = DecisionTreeRegressor(
            criterion=params.get("criterion", "squared_error"),
            max_depth=params.get("max_depth", 3),
            random_state=params.get("seed", 1),
        )

    def file_extension(self):
        """Return the file extension used for this algorithm."""
        return "decision_tree"

    def interpret(
        self,
        X_train,
        y_train,
        X_validation,
        y_validation,
        model_file_path,
        learner_name,
        target_name=None,
        class_names=None,
        metric_name=None,
        ml_task=None,
        explain_level=2,
    ):
        """Run the parent interpretation, then save a tree plot and text rules.

        Nothing more is done when ``explain_level`` is 0. Otherwise the tree
        is rendered with dtreeviz to ``<learner_name>_tree.svg`` (using at
        most 250 training rows) and its rules are written with
        ``save_rules``; both files go to ``model_file_path``. Plotting
        problems are logged, not raised.
        """
        super().interpret(
            X_train,
            y_train,
            X_validation,
            y_validation,
            model_file_path,
            learner_name,
            target_name,
            class_names,
            metric_name,
            ml_task,
            explain_level,
        )
        if explain_level == 0:
            return

        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore")
            try:
                # 250 is hard limit for number of points used in visualization
                # if too many points are used then final SVG plot is very large
                # (can be > 100MB)
                x_plot, y_plot = X_train, y_train
                if X_train.shape[0] > 250:
                    x_plot, _, y_plot, _ = subsample(X_train, y_train, REGRESSION, 250)

                # `dtreeviz` is the imported module, so the plot has to be
                # built with dtreeviz.model(); calling the module itself
                # raised TypeError and no plot was produced for large data.
                viz = dtreeviz.model(
                    self.model,
                    x_plot,
                    y_plot,
                    target_name="target",
                    feature_names=x_plot.columns,
                )
                tree_file_plot = os.path.join(
                    model_file_path, learner_name + "_tree.svg"
                )
                viz.view().save(tree_file_plot)
            except Exception as e:
                logger.info(
                    f"Problem when visuzalizin decision tree regressor. {str(e)}"
                )

            save_rules(self.model, X_train.columns, None, model_file_path, learner_name)
# Parameter values registered for the classification tree.
dt_params = dict(criterion=["gini", "entropy"], max_depth=[2, 3, 4])

classification_default_params = dict(criterion="gini", max_depth=3)

# Shared by both tree classes; their constructors read "max_steps" from it.
additional = dict(
    trees_in_step=1,
    train_cant_improve_limit=0,
    max_steps=1,
    max_rows_limit=None,
    max_cols_limit=None,
)

required_preprocessing = [
    "missing_values_inputation",
    "convert_categorical",
    "datetime_transform",
    "text_transform",
    "target_as_integer",
]

# The classifier is registered identically for both classification tasks.
for _task in (BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION):
    AlgorithmsRegistry.add(
        _task,
        DecisionTreeAlgorithm,
        dt_params,
        required_preprocessing,
        additional,
        classification_default_params,
    )
del _task  # keep the module namespace free of the loop variable

# "mae" is left out because it slows down a lot, see
# https://github.com/scikit-learn/scikit-learn/issues/9626
dt_regression_params = dict(
    criterion=["squared_error", "friedman_mse"],
    max_depth=[2, 3, 4],
)

# Same steps as for classification, without "target_as_integer".
regression_required_preprocessing = [
    "missing_values_inputation",
    "convert_categorical",
    "datetime_transform",
    "text_transform",
]

regression_default_params = dict(criterion="squared_error", max_depth=3)

AlgorithmsRegistry.add(
    REGRESSION,
    DecisionTreeRegressorAlgorithm,
    dt_regression_params,
    regression_required_preprocessing,
    additional,
    regression_default_params,
)