This is page 3 of 16. Use http://codebase.md/mljar/mljar-supervised?lines=false&page={x} to view the full context.

# Directory Structure

```
├── .github
│   └── workflows
│       ├── run-tests.yml
│       ├── test-installation-with-conda.yml
│       └── test-installation-with-pip-on-windows.yml
├── .gitignore
├── CITATION
├── examples
│   ├── notebooks
│   │   ├── basic_run.ipynb
│   │   └── Titanic.ipynb
│   └── scripts
│       ├── binary_classifier_adult_fairness.py
│       ├── binary_classifier_ensemble.py
│       ├── binary_classifier_marketing.py
│       ├── binary_classifier_random.py
│       ├── binary_classifier_Titanic.py
│       ├── binary_classifier.py
│       ├── multi_class_classifier_digits.py
│       ├── multi_class_classifier_MNIST.py
│       ├── multi_class_classifier.py
│       ├── multi_class_drug_fairness.py
│       ├── regression_acs_fairness.py
│       ├── regression_crime_fairness.py
│       ├── regression_housing_fairness.py
│       ├── regression_law_school_fairness.py
│       ├── regression.py
│       └── tabular_mar_2021.py
├── LICENSE
├── MANIFEST.in
├── pytest.ini
├── README.md
├── requirements_dev.txt
├── requirements.txt
├── setup.py
├── supervised
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── algorithm.py
│   │   ├── baseline.py
│   │   ├── catboost.py
│   │   ├── decision_tree.py
│   │   ├── extra_trees.py
│   │   ├── factory.py
│   │   ├── knn.py
│   │   ├── lightgbm.py
│   │   ├── linear.py
│   │   ├── nn.py
│   │   ├── random_forest.py
│   │   ├── registry.py
│   │   ├── sklearn.py
│   │   └── xgboost.py
│   ├── automl.py
│   ├── base_automl.py
│   ├── callbacks
│   │   ├── __init__.py
│   │   ├── callback_list.py
│   │   ├── callback.py
│   │   ├── early_stopping.py
│   │   ├── learner_time_constraint.py
│   │   ├── max_iters_constraint.py
│   │   ├── metric_logger.py
│   │   ├── terminate_on_nan.py
│   │   └── total_time_constraint.py
│   ├── ensemble.py
│   ├── exceptions.py
│   ├── fairness
│   │   ├── __init__.py
│   │   ├── metrics.py
│   │   ├── optimization.py
│   │   ├── plots.py
│   │   ├── report.py
│   │   └── utils.py
│   ├── model_framework.py
│   ├── preprocessing
│   │   ├── __init__.py
│   │   ├── datetime_transformer.py
│   │   ├── encoding_selector.py
│   │   ├── exclude_missing_target.py
│   │   ├── goldenfeatures_transformer.py
│   │   ├── kmeans_transformer.py
│   │   ├── label_binarizer.py
│   │   ├── label_encoder.py
│   │   ├── preprocessing_categorical.py
│   │   ├── preprocessing_missing.py
│   │   ├── preprocessing_utils.py
│   │   ├── preprocessing.py
│   │   ├── scale.py
│   │   └── text_transformer.py
│   ├── tuner
│   │   ├── __init__.py
│   │   ├── data_info.py
│   │   ├── hill_climbing.py
│   │   ├── mljar_tuner.py
│   │   ├── optuna
│   │   │   ├── __init__.py
│   │   │   ├── catboost.py
│   │   │   ├── extra_trees.py
│   │   │   ├── knn.py
│   │   │   ├── lightgbm.py
│   │   │   ├── nn.py
│   │   │   ├── random_forest.py
│   │   │   ├── tuner.py
│   │   │   └── xgboost.py
│   │   ├── preprocessing_tuner.py
│   │   ├── random_parameters.py
│   │   └── time_controller.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── additional_metrics.py
│   │   ├── additional_plots.py
│   │   ├── automl_plots.py
│   │   ├── common.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── data_validation.py
│   │   ├── importance.py
│   │   ├── jsonencoder.py
│   │   ├── leaderboard_plots.py
│   │   ├── learning_curves.py
│   │   ├── metric.py
│   │   ├── shap.py
│   │   ├── subsample.py
│   │   └── utils.py
│   └── validation
│       ├── __init__.py
│       ├── validation_step.py
│       ├── validator_base.py
│       ├── validator_custom.py
│       ├── validator_kfold.py
│       └── validator_split.py
└── tests
    ├── __init__.py
    ├── checks
    │   ├── __init__.py
    │   ├── check_automl_with_regression.py
    │   ├── run_ml_tests.py
    │   └── run_performance_tests.py
    ├── conftest.py
    ├── data
    │   ├── 179.csv
    │   ├── 24.csv
    │   ├── 3.csv
    │   ├── 31.csv
    │   ├── 38.csv
    │   ├── 44.csv
    │   ├── 720.csv
    │   ├── 737.csv
    │   ├── acs_income_1k.csv
    │   ├── adult_missing_values_missing_target_500rows.csv
    │   ├── boston_housing.csv
    │   ├── CrimeData
    │   │   ├── cities.json
    │   │   ├── crimedata.csv
    │   │   └── README.md
    │   ├── Drug
    │   │   ├── Drug_Consumption.csv
    │   │   └── README.md
    │   ├── housing_regression_missing_values_missing_target.csv
    │   ├── iris_classes_missing_values_missing_target.csv
    │   ├── iris_missing_values_missing_target.csv
    │   ├── LawSchool
    │   │   ├── bar_pass_prediction.csv
    │   │   └── README.md
    │   ├── PortugeseBankMarketing
    │   │   └── Data_FinalProject.csv
    │   └── Titanic
    │       ├── test_with_Survived.csv
    │       └── train.csv
    ├── README.md
    ├── tests_algorithms
    │   ├── __init__.py
    │   ├── test_baseline.py
    │   ├── test_catboost.py
    │   ├── test_decision_tree.py
    │   ├── test_extra_trees.py
    │   ├── test_factory.py
    │   ├── test_knn.py
    │   ├── test_lightgbm.py
    │   ├── test_linear.py
    │   ├── test_nn.py
    │   ├── test_random_forest.py
    │   ├── test_registry.py
    │   └── test_xgboost.py
    ├── tests_automl
    │   ├── __init__.py
    │   ├── test_adjust_validation.py
    │   ├── test_automl_init.py
    │   ├── test_automl_report.py
    │   ├── test_automl_sample_weight.py
    │   ├── test_automl_time_constraints.py
    │   ├── test_automl.py
    │   ├── test_data_types.py
    │   ├── test_dir_change.py
    │   ├── test_explain_levels.py
    │   ├── test_golden_features.py
    │   ├── test_handle_imbalance.py
    │   ├── test_integration.py
    │   ├── test_joblib_version.py
    │   ├── test_models_needed_for_predict.py
    │   ├── test_prediction_after_load.py
    │   ├── test_repeated_validation.py
    │   ├── test_restore.py
    │   ├── test_stack_models_constraints.py
    │   ├── test_targets.py
    │   └── test_update_errors_report.py
    ├── tests_callbacks
    │   ├── __init__.py
    │   └── test_total_time_constraint.py
    ├── tests_ensemble
    │   ├── __init__.py
    │   └── test_save_load.py
    ├── tests_fairness
    │   ├── __init__.py
    │   ├── test_binary_classification.py
    │   ├── test_multi_class_classification.py
    │   └── test_regression.py
    ├── tests_preprocessing
    │   ├── __init__.py
    │   ├── disable_eda.py
    │   ├── test_categorical_integers.py
    │   ├── test_datetime_transformer.py
    │   ├── test_encoding_selector.py
    │   ├── test_exclude_missing.py
    │   ├── test_goldenfeatures_transformer.py
    │   ├── test_label_binarizer.py
    │   ├── test_label_encoder.py
    │   ├── test_preprocessing_missing.py
    │   ├── test_preprocessing_utils.py
    │   ├── test_preprocessing.py
    │   ├── test_scale.py
    │   └── test_text_transformer.py
    ├── tests_tuner
    │   ├── __init__.py
    │   ├── test_hill_climbing.py
    │   ├── test_time_controller.py
    │   └── test_tuner.py
    ├── tests_utils
    │   ├── __init__.py
    │   ├── test_compute_additional_metrics.py
    │   ├── test_importance.py
    │   ├── test_learning_curves.py
    │   ├── test_metric.py
    │   ├── test_shap.py
    │   └── test_subsample.py
    └── tests_validation
        ├── __init__.py
        ├── test_validator_kfold.py
        └── test_validator_split.py
```

# Files

--------------------------------------------------------------------------------
/supervised/utils/additional_plots.py:
--------------------------------------------------------------------------------

```python
import os

import numpy as np
import scikitplot as skplt
from matplotlib import pyplot as plt


class AdditionalPlots:
    @staticmethod
    def plots_binary(target, predicted_labels, predicted_probas):
        figures = []
        try:
            #
            fig = plt.figure(figsize=(10, 7))
            ax1 = fig.add_subplot(1, 1, 1)
            _ = skplt.metrics.plot_confusion_matrix(
                target, predicted_labels, normalize=False, ax=ax1
            )
            figures += [
                {
                    "title": "Confusion Matrix",
                    "fname": "confusion_matrix.png",
                    "figure": fig,
                }
            ]
            #
            fig = plt.figure(figsize=(10, 7))
            ax1 = fig.add_subplot(1, 1, 1)
            _ = skplt.metrics.plot_confusion_matrix(
                target, predicted_labels, normalize=True, ax=ax1
            )
            figures += [
                {
                    "title": "Normalized Confusion Matrix",
                    "fname":
"confusion_matrix_normalized.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_roc(target, predicted_probas, ax=ax1) figures += [{"title": "ROC Curve", "fname": "roc_curve.png", "figure": fig}] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_ks_statistic(target, predicted_probas, ax=ax1) figures += [ { "title": "Kolmogorov-Smirnov Statistic", "fname": "ks_statistic.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_precision_recall(target, predicted_probas, ax=ax1) figures += [ { "title": "Precision-Recall Curve", "fname": "precision_recall_curve.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) # transform target if needed to be {0, 1} target_uniq_values = np.unique(target) target_transformed = target.values.ravel() if not (0 in target_uniq_values and 1 in target_uniq_values): mapping = {target_uniq_values[0]: 0, target_uniq_values[1]: 1} target_transformed = target.map(mapping) # create a plot _ = skplt.metrics.plot_calibration_curve( target_transformed, [predicted_probas], ["Classifier"], ax=ax1 ) figures += [ { "title": "Calibration Curve", "fname": "calibration_curve_curve.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_cumulative_gain(target, predicted_probas, ax=ax1) figures += [ { "title": "Cumulative Gains Curve", "fname": "cumulative_gains_curve.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_lift_curve(target, predicted_probas, ax=ax1) figures += [ {"title": "Lift Curve", "fname": "lift_curve.png", "figure": fig} ] except Exception as e: print(str(e)) return figures @staticmethod def plots_multiclass(target, predicted_labels, predicted_probas): figures = [] try: # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_confusion_matrix( target, predicted_labels, normalize=False, ax=ax1 ) figures += [ { "title": "Confusion Matrix", "fname": "confusion_matrix.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_confusion_matrix( target, predicted_labels, normalize=True, ax=ax1 ) figures += [ { "title": "Normalized Confusion Matrix", "fname": "confusion_matrix_normalized.png", "figure": fig, } ] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_roc(target, predicted_probas, ax=ax1) figures += [{"title": "ROC Curve", "fname": "roc_curve.png", "figure": fig}] # fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) _ = skplt.metrics.plot_precision_recall(target, predicted_probas, ax=ax1) figures += [ { "title": "Precision Recall Curve", "fname": "precision_recall_curve.png", "figure": fig, } ] plt.close("all") except Exception as e: print(str(e)) return figures @staticmethod def plots_regression(target, predictions): figures = [] try: MAX_SAMPLES = 5000 fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) samples = target.shape[0] if samples > MAX_SAMPLES: samples = MAX_SAMPLES ax1.scatter( target[:samples], predictions[:samples], c="tab:blue", alpha=0.2 ) plt.xlabel("True values") plt.ylabel("Predicted values") plt.title(f"Target values vs Predicted values (samples={samples})") plt.tight_layout(pad=5.0) figures += [ { "title": "True vs Predicted", "fname": "true_vs_predicted.png", 
"figure": fig, } ] # residual plot fig = plt.figure(figsize=(10, 7)) ax1 = fig.add_subplot(1, 1, 1) residuals = target[:samples].values - predictions[:samples].values ax1.scatter(predictions[:samples], residuals, c="tab:blue", alpha=0.2) plt.xlabel("Predicted values") plt.ylabel("Residuals") plt.title(f"Predicted values vs Residuals (samples={samples})") plt.tight_layout(pad=5.0) bb = ax1.get_position() ax2 = fig.add_axes((bb.x0 + bb.size[0], bb.y0, 0.05, bb.size[1])) ax2.set_xticklabels([]) ax2.set_yticklabels([]) ax2.hist(residuals, 50, orientation="horizontal", alpha=0.5) ax2.axis("off") figures += [ { "title": "Predicted vs Residuals", "fname": "predicted_vs_residuals.png", "figure": fig, } ] plt.close("all") except Exception as e: print(str(e)) return figures @staticmethod def append(fout, model_path, plots): try: for plot in plots: fname = plot.get("fname") fig = plot.get("figure") title = plot.get("title", "") fig.savefig(os.path.join(model_path, fname)) fout.write(f"\n## {title}\n\n") fout.write(f"\n\n") except Exception as e: print(str(e)) ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_label_binarizer.py: -------------------------------------------------------------------------------- ```python import json import unittest import numpy as np import pandas as pd from supervised.preprocessing.label_binarizer import LabelBinarizer class LabelBinarizerTest(unittest.TestCase): def test_fit(self): # training data d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} df = pd.DataFrame(data=d) lb = LabelBinarizer() # check first column lb.fit(df, "col1") data_json = lb.to_json() self.assertTrue("new_columns" in data_json) # we take alphabetical order self.assertTrue("col1_c" in data_json["new_columns"]) self.assertTrue("col1_a" not in data_json["new_columns"]) self.assertTrue("unique_values" in data_json) self.assertTrue("a" in data_json["unique_values"]) self.assertTrue("c" in data_json["unique_values"]) lb = LabelBinarizer() # check second column lb.fit(df, "col2") data_json = lb.to_json() self.assertTrue("new_columns" in data_json) self.assertTrue("col2_w" in data_json["new_columns"]) self.assertTrue("col2_e" in data_json["new_columns"]) self.assertTrue("col2_d" in data_json["new_columns"]) self.assertTrue("unique_values" in data_json) self.assertTrue("w" in data_json["unique_values"]) self.assertTrue("e" in data_json["unique_values"]) self.assertTrue("d" in data_json["unique_values"]) def test_transform(self): # training data d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} df = pd.DataFrame(data=d) # fit binarizer lb1 = LabelBinarizer() lb1.fit(df, "col1") lb2 = LabelBinarizer() lb2.fit(df, "col2") # test data d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]} df_test = pd.DataFrame(data=d_test) # transform df_test = lb1.transform(df_test, "col1") df_test = lb2.transform(df_test, "col2") # for binary column, only one value is left, old column should be deleted self.assertTrue("col1_c" in df_test.columns) self.assertTrue("col1" not in df_test.columns) self.assertEqual(2, np.sum(df_test["col1_c"])) # for multiple value colum, all columns should be added self.assertTrue("col2_w" in df_test.columns) self.assertTrue("col2_e" in df_test.columns) self.assertTrue("col2_d" in df_test.columns) self.assertTrue("col2" not in df_test.columns) self.assertEqual(1, np.sum(df_test["col2_w"])) self.assertEqual(1, np.sum(df_test["col2_e"])) self.assertEqual(1, np.sum(df_test["col2_d"])) # do not 
touch continuous attribute self.assertTrue("col3" in df_test.columns) def test_transform_with_new_values(self): # training data d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} df = pd.DataFrame(data=d) # fit binarizer lb1 = LabelBinarizer() lb1.fit(df, "col1") lb2 = LabelBinarizer() lb2.fit(df, "col2") # test data d_test = {"col1": ["c", "d", "d"], "col2": ["g", "e", "f"], "col3": [2, 3, 4]} df_test = pd.DataFrame(data=d_test) # transform df_test = lb1.transform(df_test, "col1") df_test = lb2.transform(df_test, "col2") self.assertTrue("col1_c" in df_test.columns) self.assertTrue("col1_d" not in df_test.columns) self.assertTrue("col2_w" in df_test.columns) self.assertTrue("col2_e" in df_test.columns) self.assertTrue("col2_d" in df_test.columns) self.assertTrue("col2_g" not in df_test.columns) self.assertTrue("col2_f" not in df_test.columns) self.assertEqual(df_test["col1_c"][0], 1) self.assertEqual(df_test["col1_c"][1], 0) self.assertEqual(df_test["col1_c"][2], 0) self.assertEqual(np.sum(df_test["col2_w"]), 0) self.assertEqual(np.sum(df_test["col2_d"]), 0) self.assertEqual(df_test["col2_e"][0], 0) self.assertEqual(df_test["col2_e"][1], 1) self.assertEqual(df_test["col2_e"][2], 0) def test_to_and_from_json(self): # training data d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} df = pd.DataFrame(data=d) # fit binarizer lb1 = LabelBinarizer() lb1.fit(df, "col1") lb2 = LabelBinarizer() lb2.fit(df, "col2") # test data d_test = {"col1": ["c", "c", "a"], "col2": ["e", "d", "w"], "col3": [2, 3, 4]} df_test = pd.DataFrame(data=d_test) # to json and from json new_lb1 = LabelBinarizer() new_lb2 = LabelBinarizer() new_lb1.from_json(lb1.to_json()) new_lb2.from_json(lb2.to_json()) # transform df_test = new_lb1.transform(df_test, "col1") df_test = new_lb2.transform(df_test, "col2") # for binary column, only one value is left, old column should be deleted self.assertTrue("col1_c" in df_test.columns) self.assertTrue("col1" not in df_test.columns) self.assertEqual(2, np.sum(df_test["col1_c"])) # for multiple value colum, all columns should be added self.assertTrue("col2_w" in df_test.columns) self.assertTrue("col2_e" in df_test.columns) self.assertTrue("col2_d" in df_test.columns) self.assertTrue("col2" not in df_test.columns) self.assertEqual(1, np.sum(df_test["col2_w"])) self.assertEqual(1, np.sum(df_test["col2_e"])) self.assertEqual(1, np.sum(df_test["col2_d"])) # do not touch continuous attribute self.assertTrue("col3" in df_test.columns) def test_to_and_from_json_booleans(self): # training data d = {"col1": ["a", "a", "c"], "col2": [True, True, False]} df = pd.DataFrame(data=d) # fit binarizer lb1 = LabelBinarizer() lb1.fit(df, "col1") lb2 = LabelBinarizer() lb2.fit(df, "col2") # test data d_test = { "col1": ["c", "c", "a"], "col2": [False, False, True], "col3": [2, 3, 4], } df_test = pd.DataFrame(data=d_test) # to json and from json new_lb1 = LabelBinarizer() new_lb2 = LabelBinarizer() new_lb1.from_json(lb1.to_json()) new_lb2.from_json(json.loads(json.dumps(lb2.to_json(), indent=4))) # transform df_test = new_lb1.transform(df_test, "col1") df_test = new_lb2.transform(df_test, "col2") # for binary column, only one value is left, old column should be deleted self.assertTrue("col1_c" in df_test.columns) self.assertTrue("col1" not in df_test.columns) self.assertEqual(2, np.sum(df_test["col1_c"])) # for multiple value colum, all columns should be added self.assertTrue("col2_True" in df_test.columns) self.assertTrue("col2" not in df_test.columns) self.assertEqual(1, np.sum(df_test["col2_True"])) 
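        # The boolean column behaves like any two-valued categorical here: LabelBinarizer
        # keeps a single indicator column ("col2_True"), and the JSON round-trip
        # (to_json -> json.dumps -> json.loads -> from_json) is expected to reproduce the
        # same mapping, which is what the assertions above verify.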
# do not touch continuous attribute self.assertTrue("col3" in df_test.columns) def test_inverse_transform_2_unique_strings(self): d = {"col1": ["a", "a", "c"]} df = pd.DataFrame(data=d) lb = LabelBinarizer() lb.fit(df, "col1") bb = lb.transform(df, "col1") self.assertTrue("col1_c" in bb.columns) self.assertTrue(np.sum(bb["col1_c"]) == 1) bb = lb.inverse_transform(bb) self.assertTrue("col1_c" not in bb.columns) def test_inverse_transform_strings(self): d = {"col2": ["w", "e", "d"]} df = pd.DataFrame(data=d) lb = LabelBinarizer() lb.fit(df, "col2") bb = lb.transform(df, "col2") self.assertTrue("col2_w" in bb.columns) self.assertTrue("col2_e" in bb.columns) self.assertTrue("col2_d" in bb.columns) self.assertTrue(np.sum(bb["col2_w"]) == 1) bb = lb.inverse_transform(bb) self.assertTrue("col2_w" not in bb.columns) def test_inverse_transform_booleans(self): d = {"col1": [True, False, True, True]} df = pd.DataFrame(data=d) lb = LabelBinarizer() lb.fit(df, "col1") bb = lb.transform(df, "col1") self.assertTrue("col1_True" in bb.columns) self.assertEqual(bb["col1_True"].dtype, "int64") self.assertEqual(bb["col1_True"][0], 1) self.assertEqual(bb["col1_True"][1], 0) self.assertEqual(bb["col1_True"][2], 1) self.assertEqual(bb["col1_True"][3], 1) bb = lb.inverse_transform(bb) self.assertTrue("col1_True" not in bb.columns) self.assertEqual(bb["col1"].dtype, "bool") self.assertEqual(bb["col1"][0], True) self.assertEqual(bb["col1"][1], False) self.assertEqual(bb["col1"][2], True) self.assertEqual(bb["col1"][3], True) if __name__ == "__main__": unittest.main() ``` -------------------------------------------------------------------------------- /supervised/tuner/time_controller.py: -------------------------------------------------------------------------------- ```python import logging import time import numpy as np from supervised.utils.config import LOG_LEVEL logger = logging.getLogger(__name__) logger.setLevel(LOG_LEVEL) class TimeController: def __init__( self, start_time, total_time_limit, model_time_limit, steps, algorithms ): self._start_time = start_time self._total_time_limit = total_time_limit self._model_time_limit = model_time_limit self._steps = steps self._algorithms = algorithms self._spend = [] self._is_hill_climbing = "hill_climbing_1" in steps self._is_stacking = "stack" in steps def to_json(self): return { "total_time_limit": self._total_time_limit, "model_time_limit": self._model_time_limit, "steps": self._steps, "algorithms": self._algorithms, "spend": self._spend, "is_hill_climbing": self._is_hill_climbing, "is_stacking": self._is_stacking, } @staticmethod def from_json(data): if data is None: return None try: total_time_limit = data.get("total_time_limit") model_time_limit = data.get("model_time_limit") steps = data.get("steps") algorithms = data.get("algorithms") tc = TimeController( time.time(), total_time_limit, model_time_limit, steps, algorithms ) tc._spend = data.get("spend") tc._start_time -= tc.already_spend() # update time with already spend return tc except Exception as e: logger.error(f"Cant load TimeController from json, {str(e)}") pass return None def already_spend(self): return np.sum([s["train_time"] for s in self._spend]) def time_should_use(self, fit_level): if self._total_time_limit is None: return 7 * 24 * 3600 # 7 days ratios = { "default_algorithms": 0.3, "not_so_random": 0.35, "mix_encoding": 0.05, "golden_features": 0.05, "kmeans_features": 0.05, "insert_random_feature": 0.05, "features_selection": 0.05, "hill_climbing_1": 0.2, # enough to have only first step 
from hill climbing "boost_on_errors": 0.05, "stack": 0.2, } if ( fit_level in [ "default_algorithms", "not_so_random", "boost_on_errors", "mix_encoding", "golden_features", "kmeans_features", "insert_random_feature", "features_selection", "stack", ] or "hill_climbing" in fit_level ): ratio = 0 for k, v in ratios.items(): if k in self._steps: ratio += v fl = fit_level if "hill_climbing" in fit_level: fl = "hill_climbing_1" ratio = ratios[fl] / ratio if "hill_climbing" in fit_level: # print("before hill climbing scale", ratio) hill_climbing_cnt = len( [i for i in self._steps if "hill_climbing" in i] ) ratio /= float(hill_climbing_cnt) should_use = self._total_time_limit * ratio return should_use return 0 def compound_time_should_use(self, fit_level): compound = 0 for step in self._steps: if step in [ "adjust_validation", "simple_algorithms", # "default_algorithms", "ensemble", "ensemble_stacked", ]: continue time_should_use = self.time_should_use(step) compound += time_should_use if fit_level == step: break # if fit_level == "stack": # compound -= 120 # leave time for ensemble # maybe not needed return compound def enough_time_for_step(self, fit_level): if fit_level in ["ensemble", "ensemble_stacked", "fairness"]: return True total_time_spend = time.time() - self._start_time compound = self.compound_time_should_use(fit_level) # print("Enough time for step", fit_level, np.round(total_time_spend,2), np.round(compound,2)) if total_time_spend > compound: # dont train more return False return True def enough_time_for_model(self, model_type): if self._total_time_limit is None: return True time_left = self._total_time_limit - self.already_spend() spend = [s["train_time"] for s in self._spend if s["model_type"] == model_type] model_mean_spend = np.mean(spend) return model_mean_spend <= time_left def enough_time(self, model_type, step): """ Check if there is enough time to train the next model. Parameters ---------- model_type : str String with type of the model. step: str String with name of the step in the process of AutoML training. Returns ------- bool `True` if there is time for training next model, `False` otherwise. """ if step in ["ensemble", "ensemble_stacked"]: return True # if model_time_limit is set, train every model # do not apply total_time_limit if self._model_time_limit is not None: return True # no total time limit, just train, dont ask if self._total_time_limit is None: return True total_time_spend = time.time() - self._start_time time_left = self._total_time_limit - total_time_spend # no time left, do not train any more models, sorry ... if time_left < 0: # print("No time left", time_left) return False # check the fit level type # we dont want to spend too much time on one step if not self.enough_time_for_step(step): # print("Not enough time for step", step) return False # there is still time and model_type was not tested yet # we should try it if time_left > 0 and self.model_spend(model_type) == 0: return True # stacked models converge faster # dont need to check ... if step == "stack": return True # check if there is enough time for model to train return self.enough_time_for_model(model_type) def learner_time_limit(self, model_type, fit_level, k_folds): if self._total_time_limit is None: return 7 * 24 * 3600 if self._model_time_limit is not None: return self._model_time_limit / k_folds # just train them ... 
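        # For the tuned steps handled below, the per-learner (per-fold) budget is roughly:
        #     time_should_use(step) = total_time_limit * ratio(step) / sum(ratios of active steps)
        # then divided by the number of tuned algorithms and by k_folds.
        # Hypothetical numbers for illustration: total_time_limit=3600s, a normalized step
        # ratio of 0.25, 4 tuned algorithms, 5 folds -> 3600 * 0.25 / 4 / 5 = 45s per learner.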
if fit_level == "simple_algorithms": return None if fit_level == "default_algorithms": return None tune_algorithms = [ a for a in self._algorithms if a not in ["Baseline", "Linear", "Decision Tree", "Nearest Neighbors"] ] tune_algs_cnt = len(tune_algorithms) if tune_algs_cnt == 0: return None time_elapsed = time.time() - self._start_time time_left = self._total_time_limit - time_elapsed if fit_level == "not_so_random": tt = self.time_should_use(fit_level) tt /= tune_algs_cnt # give time equally for each algorithm tt /= k_folds # time is per learner (per fold) return tt if "hill_climbing" in fit_level: tt = self.time_should_use(fit_level) tt /= tune_algs_cnt # give time equally for each algorithm tt /= k_folds # time is per learner (per fold) return tt if self._is_stacking and fit_level == "stack": tt = time_left tt /= tune_algs_cnt # give time equally for each algorithm tt /= k_folds # time is per learner (per fold) return tt def log_time(self, model_name, model_type, fit_level, train_time): self._spend += [ { "model_name": model_name, "model_type": model_type, "fit_level": fit_level, "train_time": train_time, } ] # print(pd.DataFrame(self._spend)) # print("Already spend", self.already_spend()) def step_spend(self, step): return np.sum([s["train_time"] for s in self._spend if s["fit_level"] == step]) def model_spend(self, model_type): return np.sum( [s["train_time"] for s in self._spend if s["model_type"] == model_type] ) ``` -------------------------------------------------------------------------------- /supervised/callbacks/early_stopping.py: -------------------------------------------------------------------------------- ```python import logging import os import numpy as np import pandas as pd from supervised.callbacks.callback import Callback from supervised.utils.config import LOG_LEVEL from supervised.utils.metric import Metric logger = logging.getLogger(__name__) logger.setLevel(LOG_LEVEL) class EarlyStopping(Callback): def __init__(self, params): super(EarlyStopping, self).__init__(params) self.name = params.get("name", "early_stopping") self.metric = Metric(params.get("metric")) self.max_no_improvement_cnt = params.get("max_no_improvement_cnt", 5) self.log_to_dir = params.get("log_to_dir") self.keep_best_model = params.get("keep_best_model", True) self.best_iter = {} self.best_loss = {} self.loss_values = {} self.best_models = {} self.best_y_predicted = {} self.best_y_oof = ( None # predictions computed on out of folds or on validation set ) self.final_loss = ( None # final score computed on combined predictions from all learners ) # path to best model local copy, only used if cannot deep copy self.best_model_paths = {} self.multiple_target = False self.target_columns = None def add_and_set_learner(self, learner): self.learners += [learner] self.learner = learner self.best_iter[learner.uid] = None self.best_loss[learner.uid] = self.metric.worst_value() self.loss_values[learner.uid] = {"train": [], "validation": [], "iters": []} self.best_models[learner.uid] = None self.best_model_paths[learner.uid] = None self.best_y_predicted[learner.uid] = None def on_learner_train_start(self, logs): self.no_improvement_cnt = 0 def on_framework_train_end(self, logs): # aggregate predictions from all learners # it has two columns: 'prediction', 'target' logger.debug("early stopping on framework train end") self.best_y_oof = pd.concat(list(self.best_y_predicted.values())) self.best_y_oof.sort_index(inplace=True) # check for duplicates in index -> repeats of validation if 
np.sum(self.best_y_oof.index.duplicated()): # we need to aggregate predictions from multiple repeats target_cols = [c for c in self.best_y_oof.columns if "prediction" not in c] prediction_cols = [c for c in self.best_y_oof.columns if "prediction" in c] aggs = {} for t in target_cols: aggs[t] = "first" for p in prediction_cols: aggs[p] = "mean" # aggregate predictions from repeats self.best_y_oof = self.best_y_oof.groupby( target_cols + prediction_cols, level=0 ).agg(aggs) sample_weight = None if "sample_weight" in self.best_y_oof.columns: sample_weight = self.best_y_oof["sample_weight"] if "prediction" in self.best_y_oof: self.final_loss = self.metric( self.best_y_oof[self.target_columns], self.best_y_oof["prediction"], sample_weight=sample_weight, ) else: prediction_cols = [c for c in self.best_y_oof.columns if "prediction" in c] self.final_loss = self.metric( self.best_y_oof[self.target_columns], self.best_y_oof[prediction_cols], sample_weight=sample_weight, ) def on_iteration_end(self, logs, predictions): train_loss = 0 if predictions.get("y_train_predicted") is not None: train_loss = self.metric( predictions.get("y_train_true"), predictions.get("y_train_predicted"), predictions.get("sample_weight"), ) validation_loss = self.metric( predictions.get("y_validation_true"), predictions.get("y_validation_predicted"), predictions.get("sample_weight_validation"), ) self.loss_values[self.learner.uid]["train"] += [train_loss] self.loss_values[self.learner.uid]["validation"] += [validation_loss] self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")] if self.metric.improvement( previous=self.best_loss[self.learner.uid], current=validation_loss ): y_validation_true = predictions.get("y_validation_true") self.no_improvement_cnt = 0 self.best_iter[self.learner.uid] = logs.get("iter_cnt") self.best_loss[self.learner.uid] = validation_loss if len(y_validation_true.shape) == 1 or y_validation_true.shape[1] == 1: self.best_y_predicted[self.learner.uid] = pd.DataFrame( { "target": np.array(y_validation_true) # y_validation_true.values.reshape( # y_validation_true.shape[0] # ) }, index=predictions.get("validation_index"), ) self.multiple_target = False self.target_columns = "target" else: # in case of Neural Networks and multi-class classification with one-hot encoding self.best_y_predicted[self.learner.uid] = pd.DataFrame( y_validation_true, index=predictions.get("validation_index") ) self.multiple_target = True self.target_columns = y_validation_true.columns y_validation_predicted = predictions.get("y_validation_predicted") if len(y_validation_predicted.shape) == 1: # only one prediction column (binary classification or regression) col = predictions.get("validation_columns", "prediction") self.best_y_predicted[self.learner.uid][col] = np.array( y_validation_predicted ) else: # several columns in multiclass classification cols = predictions.get("validation_columns") for i_col in range(y_validation_predicted.shape[1]): self.best_y_predicted[self.learner.uid][ # "prediction_{}".format(i_col) cols[i_col] ] = y_validation_predicted[:, i_col] # store sample_weight sample_weight_validation = predictions.get("sample_weight_validation") if sample_weight_validation is not None: self.best_y_predicted[self.learner.uid]["sample_weight"] = np.array( sample_weight_validation ) # store sensitive features sensitive_features_validation = predictions.get( "sensitive_features_validation" ) if sensitive_features_validation is not None: for col in list(sensitive_features_validation.columns): 
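                    # keep each sensitive feature aligned with the out-of-fold predictions
                    # of the best iteration, so it is available later (e.g. for fairness reporting)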
self.best_y_predicted[self.learner.uid][ f"sensitive_{col}" ] = np.array(sensitive_features_validation[col]) self.best_models[self.learner.uid] = self.learner.copy() # if local copy is not available, save model and keep path if self.best_models[self.learner.uid] is None: self.best_model_paths[self.learner.uid] = self.learner.save() else: self.no_improvement_cnt += 1 if self.no_improvement_cnt > self.max_no_improvement_cnt: self.learner.stop_training = True logger.info( "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, " "no improvement cnt {}, iters {}".format( train_loss, validation_loss, self.no_improvement_cnt, len(self.loss_values[self.learner.uid]["iters"]), ) ) if self.log_to_dir is not None and self.learner.algorithm_short_name not in [ "Xgboost", "Random Forest", "Extra Trees", "LightGBM", "CatBoost", "Neural Network", ]: sign = -1.0 if Metric.optimize_negative(self.metric.name) else 1.0 with open( os.path.join(self.log_to_dir, f"{self.learner.name}_training.log"), "a" ) as fout: iteration = len(self.loss_values[self.learner.uid]["iters"]) fout.write(f"{iteration},{sign*train_loss},{sign*validation_loss}\n") def get_status(self): return "Train loss: {}, Validation loss: {} @ iteration {}".format( self.loss_values[self.learner.uid]["train"][-1], self.loss_values[self.learner.uid]["validation"][-1], len(self.loss_values[self.learner.uid]["iters"]), ) ``` -------------------------------------------------------------------------------- /supervised/algorithms/decision_tree.py: -------------------------------------------------------------------------------- ```python import logging import os import warnings import numpy as np import sklearn from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from supervised.algorithms.registry import ( BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION, AlgorithmsRegistry, ) from supervised.algorithms.sklearn import SklearnAlgorithm from supervised.utils.config import LOG_LEVEL logger = logging.getLogger(__name__) logger.setLevel(LOG_LEVEL) import dtreeviz from sklearn.tree import _tree from supervised.utils.subsample import subsample def get_rules(tree, feature_names, class_names): tree_ = tree.tree_ feature_name = [ feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" 
for i in tree_.feature ] paths = [] path = [] def recurse(node, path, paths): if tree_.feature[node] != _tree.TREE_UNDEFINED: name = feature_name[node] threshold = tree_.threshold[node] p1, p2 = list(path), list(path) p1 += [f"({name} <= {np.round(threshold, 3)})"] recurse(tree_.children_left[node], p1, paths) p2 += [f"({name} > {np.round(threshold, 3)})"] recurse(tree_.children_right[node], p2, paths) else: path += [(tree_.value[node], tree_.n_node_samples[node])] paths += [path] recurse(0, path, paths) # sort by samples count samples_count = [p[-1][1] for p in paths] ii = list(np.argsort(samples_count)) paths = [paths[i] for i in reversed(ii)] rules = [] for path in paths: rule = "if " for p in path[:-1]: if rule != "if ": rule += " and " rule += str(p) rule += " then " if class_names is None: rule += "response: " + str(np.round(path[-1][0][0][0], 3)) else: classes = path[-1][0][0] l = np.argmax(classes) rule += f"class: {class_names[l]} (proba: {np.round(100.0*classes[l]/np.sum(classes),2)}%)" rule += f" | based on {path[-1][1]:,} samples" rules += [rule] return rules def save_rules(tree, feature_names, class_names, model_file_path, learner_name): try: rules = get_rules(tree, feature_names, class_names) fname = os.path.join(model_file_path, f"{learner_name}_rules.txt") with open(fname, "w") as fout: for r in rules: fout.write(r + "\n\n") except Exception as e: logger.info(f"Problem with extracting decision tree rules. {str(e)}") class DecisionTreeAlgorithm(ClassifierMixin, SklearnAlgorithm): algorithm_name = "Decision Tree" algorithm_short_name = "Decision Tree" def __init__(self, params): super(DecisionTreeAlgorithm, self).__init__(params) logger.debug("DecisionTreeAlgorithm.__init__") self.library_version = sklearn.__version__ self.max_iters = additional.get("max_steps", 1) self.model = DecisionTreeClassifier( criterion=params.get("criterion", "gini"), max_depth=params.get("max_depth", 3), random_state=params.get("seed", 1), ) def file_extension(self): return "decision_tree" def interpret( self, X_train, y_train, X_validation, y_validation, model_file_path, learner_name, target_name=None, class_names=None, metric_name=None, ml_task=None, explain_level=2, ): super(DecisionTreeAlgorithm, self).interpret( X_train, y_train, X_validation, y_validation, model_file_path, learner_name, target_name, class_names, metric_name, ml_task, explain_level, ) if explain_level == 0: return with warnings.catch_warnings(): warnings.simplefilter(action="ignore") try: if len(class_names) > 10: # dtreeviz does not support more than 10 classes return viz = dtreeviz.model( self.model, X_train, y_train, target_name="target", feature_names=X_train.columns, class_names=class_names, ) tree_file_plot = os.path.join( model_file_path, learner_name + "_tree.svg" ) viz.view().save(tree_file_plot) except Exception as e: logger.info(f"Problem when visualizing decision tree. 
{str(e)}") save_rules( self.model, X_train.columns, class_names, model_file_path, learner_name ) class DecisionTreeRegressorAlgorithm(RegressorMixin, SklearnAlgorithm): algorithm_name = "Decision Tree" algorithm_short_name = "Decision Tree" def __init__(self, params): super(DecisionTreeRegressorAlgorithm, self).__init__(params) logger.debug("DecisionTreeRegressorAlgorithm.__init__") self.library_version = sklearn.__version__ self.max_iters = additional.get("max_steps", 1) self.model = DecisionTreeRegressor( criterion=params.get("criterion", "squared_error"), max_depth=params.get("max_depth", 3), random_state=params.get("seed", 1), ) def file_extension(self): return "decision_tree" def interpret( self, X_train, y_train, X_validation, y_validation, model_file_path, learner_name, target_name=None, class_names=None, metric_name=None, ml_task=None, explain_level=2, ): super(DecisionTreeRegressorAlgorithm, self).interpret( X_train, y_train, X_validation, y_validation, model_file_path, learner_name, target_name, class_names, metric_name, ml_task, explain_level, ) if explain_level == 0: return with warnings.catch_warnings(): warnings.simplefilter(action="ignore") try: # 250 is hard limit for number of points used in visualization # if too many points are used then final SVG plot is very large (can be > 100MB) if X_train.shape[0] > 250: x, _, y, _ = subsample(X_train, y_train, REGRESSION, 250) viz = dtreeviz( self.model, x, y, target_name="target", feature_names=x.columns, ) else: viz = dtreeviz.model( self.model, X_train, y_train, target_name="target", feature_names=X_train.columns, ) tree_file_plot = os.path.join( model_file_path, learner_name + "_tree.svg" ) viz.view().save(tree_file_plot) except Exception as e: logger.info( f"Problem when visuzalizin decision tree regressor. 
{str(e)}" ) save_rules(self.model, X_train.columns, None, model_file_path, learner_name) dt_params = {"criterion": ["gini", "entropy"], "max_depth": [2, 3, 4]} classification_default_params = {"criterion": "gini", "max_depth": 3} additional = { "trees_in_step": 1, "train_cant_improve_limit": 0, "max_steps": 1, "max_rows_limit": None, "max_cols_limit": None, } required_preprocessing = [ "missing_values_inputation", "convert_categorical", "datetime_transform", "text_transform", "target_as_integer", ] AlgorithmsRegistry.add( BINARY_CLASSIFICATION, DecisionTreeAlgorithm, dt_params, required_preprocessing, additional, classification_default_params, ) AlgorithmsRegistry.add( MULTICLASS_CLASSIFICATION, DecisionTreeAlgorithm, dt_params, required_preprocessing, additional, classification_default_params, ) dt_regression_params = { "criterion": [ "squared_error", "friedman_mse", ], # remove "mae" because it slows down a lot https://github.com/scikit-learn/scikit-learn/issues/9626 "max_depth": [2, 3, 4], } regression_required_preprocessing = [ "missing_values_inputation", "convert_categorical", "datetime_transform", "text_transform", ] regression_default_params = {"criterion": "squared_error", "max_depth": 3} AlgorithmsRegistry.add( REGRESSION, DecisionTreeRegressorAlgorithm, dt_regression_params, regression_required_preprocessing, additional, regression_default_params, ) ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_explain_levels.py: -------------------------------------------------------------------------------- ```python import os import shutil import unittest import pandas as pd from sklearn import datasets from supervised import AutoML from supervised.algorithms.random_forest import additional additional["max_steps"] = 3 additional["trees_in_step"] = 1 from supervised.algorithms.xgboost import additional additional["max_rounds"] = 1 class AutoMLExplainLevelsTest(unittest.TestCase): automl_dir = "AutoMLExplainLevelsTest" def setUp(self): shutil.rmtree(self.automl_dir, ignore_errors=True) def tearDown(self): shutil.rmtree(self.automl_dir, ignore_errors=True) def run_explain_default(self, task, alg): shutil.rmtree(self.automl_dir, ignore_errors=True) a = AutoML( results_path=self.automl_dir, total_time_limit=10, algorithms=[alg], train_ensemble=False, validation_strategy={ "validation_type": "kfold", "k_folds": 2, "shuffle": True, "stratify": True, }, start_random_models=1, ) if task == "binary": X, y = datasets.make_classification( n_samples=100, n_features=5, n_informative=4, n_redundant=1, n_classes=2, n_clusters_per_class=3, n_repeated=0, shuffle=False, random_state=0, ) elif task == "multi": X, y = datasets.make_classification( n_samples=100, n_features=5, n_informative=4, n_redundant=1, n_classes=5, n_clusters_per_class=3, n_repeated=0, shuffle=False, random_state=0, ) else: X, y = datasets.make_regression( n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0, ) X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) a.fit(X, y) result_files = os.listdir( os.path.join(self.automl_dir, f'1_Default_{alg.replace(" ", "")}') ) # There should be files with: # - permutation importance # - shap importance # - shap dependence # - shap decisions # Check permutation importance produced = False for f in result_files: if "importance.csv" in f and "shap" not in f: produced = True break self.assertTrue(produced) # Check shap importance produced = False for f in result_files: if "importance.csv" in f and "shap" 
in f: produced = True break self.assertTrue(produced) # Check shap dependence produced = False for f in result_files: if "shap_dependence" in f: produced = True break self.assertTrue(produced) # Check shap decisions produced = False for f in result_files: if "decisions.png" in f: produced = True break self.assertTrue(produced) # def test_explain_default(self): # for task in ["binary", "multi", "regression"]: # for alg in ["Xgboost", "Random Forest", "LightGBM"]: # self.run_explain_default(task, alg) def test_no_explain_linear(self): a = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Linear"], train_ensemble=False, validation_strategy={ "validation_type": "kfold", "k_folds": 2, "shuffle": True, "stratify": True, }, explain_level=0, start_random_models=1, ) X, y = datasets.make_regression( n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 ) X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) a.fit(X, y) result_files = os.listdir(os.path.join(self.automl_dir, "1_Linear")) # There should be no files with: # - permutation importance # - shap importance # - shap dependence # - shap decisions # Check permutation importance produced = False for f in result_files: if "importance.csv" in f and "shap" not in f: produced = True break self.assertFalse(produced) # Check shap importance produced = False for f in result_files: if "importance.csv" in f and "shap" in f: produced = True break self.assertFalse(produced) # Check shap dependence produced = False for f in result_files: if "dependence.png" in f: produced = True break self.assertFalse(produced) # Check shap decisions produced = False for f in result_files: if "decisions.png" in f: produced = True break self.assertFalse(produced) # Check coefficients produced = False for f in result_files: if "coefs.csv" in f: produced = True break self.assertFalse(produced) def test_explain_just_permutation_importance(self): a = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, validation_strategy={ "validation_type": "kfold", "k_folds": 2, "shuffle": True, "stratify": True, }, explain_level=1, start_random_models=1, ) X, y = datasets.make_regression( n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 ) X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) a.fit(X, y) result_files = os.listdir(os.path.join(self.automl_dir, "1_Default_Xgboost")) # There should be no files with: # - permutation importance # - shap importance # - shap dependence # - shap decisions # Check permutation importance produced = False for f in result_files: if "importance.csv" in f and "shap" not in f: produced = True break self.assertTrue(produced) # Check shap importance produced = False for f in result_files: if "importance.csv" in f and "shap" in f: produced = True break self.assertFalse(produced) # Check shap dependence produced = False for f in result_files: if "dependence.png" in f: produced = True break self.assertFalse(produced) # Check shap decisions produced = False for f in result_files: if "decisions.png" in f: produced = True break self.assertFalse(produced) def test_build_decision_tree(self): a = AutoML( results_path=self.automl_dir, total_time_limit=10, algorithms=["Decision Tree"], train_ensemble=False, validation_strategy={ "validation_type": "kfold", "k_folds": 2, "shuffle": True, "stratify": True, }, explain_level=2, start_random_models=1, ) X, y = datasets.make_regression( n_samples=100, n_features=5, 
n_informative=4, shuffle=False, random_state=0 ) X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) a.fit(X, y) result_files = os.listdir(os.path.join(self.automl_dir, "1_DecisionTree")) # There should be files with: # - decision tree visualization # - permutation importance # - shap importance # - shap dependence # - shap decisions # Check Decision Tree visualization produced = False for f in result_files: if "tree.svg" in f: produced = True break # disable ??? TODO # self.assertTrue(produced) # Check permutation importance produced = False for f in result_files: if "importance.csv" in f and "shap" not in f: produced = True break self.assertTrue(produced) # Check shap importance produced = False for f in result_files: if "importance.csv" in f and "shap" in f: produced = True break self.assertTrue(produced) # Check shap dependence produced = False for f in result_files: if "dependence.png" in f: produced = True break self.assertTrue(produced) # Check shap decisions produced = False for f in result_files: if "decisions.png" in f: produced = True break self.assertTrue(produced) ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_targets.py: -------------------------------------------------------------------------------- ```python import shutil import unittest import pytest import numpy as np import pandas as pd from supervised import AutoML from supervised.algorithms.xgboost import additional from supervised.exceptions import AutoMLException additional["max_rounds"] = 1 class AutoMLTargetsTest(unittest.TestCase): automl_dir = "automl_tests" rows = 50 def tearDown(self): shutil.rmtree(self.automl_dir, ignore_errors=True) def test_bin_class_01(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.randint(0, 2, self.rows) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(0 in u or 1 in u) self.assertTrue(len(u) <= 2) def test_bin_class_11(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.randint(0, 2, self.rows) * 2 - 1 automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) p = automl.predict(X) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(-1 in u or 1 in u) self.assertTrue(0 not in u) self.assertTrue(len(u) <= 2) def test_bin_class_AB(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.permutation(["a", "B"] * int(self.rows / 2)) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) p = automl.predict(X) pred = automl.predict(X) u = np.unique(pred) self.assertTrue("a" in u or "B" in u) self.assertTrue(len(u) <= 2) def test_bin_class_AB_missing_targets(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = pd.Series( np.random.permutation(["a", "B"] * int(self.rows / 2)), name="target" ) y.iloc[1] = None y.iloc[3] = np.NaN y.iloc[13] = np.nan automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, 
start_random_models=1, ) with pytest.warns( expected_warning=UserWarning, match="There are samples with missing target values in the data which will be excluded for further analysis", ) as record: automl.fit(X, y) # check that only one warning was raised self.assertEqual(len(record), 1) p = automl.predict(X) pred = automl.predict(X) u = np.unique(pred) self.assertTrue("a" in u or "B" in u) self.assertTrue(len(u) <= 2) def test_multi_class_0123_floats(self): X = np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.randint(0, 4, self.rows * 4) y = y.astype(float) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(0.0 in u or 1.0 in u or 2.0 in u or 3.0 in u) self.assertTrue(len(u) <= 4) def test_multi_class_0123(self): X = np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.randint(0, 4, self.rows * 4) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(0 in u or 1 in u or 2 in u or 3 in u) self.assertTrue(len(u) <= 4) def test_multi_class_0123_strings(self): X = np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.randint(0, 4, self.rows * 4) y = y.astype(str) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue("0" in u or "1" in u or "2" in u or "3" in u) self.assertTrue(len(u) <= 4) def test_multi_class_abcd(self): X = np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = pd.Series( np.random.permutation(["a", "B", "CC", "d"] * self.rows), name="target" ) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) self.assertTrue(len(u) <= 4) def test_multi_class_abcd_np_array(self): X = np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.permutation([None, "B", "CC", "d"] * self.rows) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) self.assertTrue(len(u) <= 4) def test_multi_class_abcd_mixed_int(self): X = np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = pd.Series( np.random.permutation([1, "B", "CC", "d"] * self.rows), name="target" ) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) self.assertTrue(len(u) <= 4) def test_multi_class_abcd_missing_target(self): X = 
np.random.rand(self.rows * 4, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = pd.Series( np.random.permutation(["a", "B", "CC", "d"] * self.rows), name="target" ) y.iloc[0] = None y.iloc[1] = None automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) with pytest.warns( expected_warning=UserWarning, match="There are samples with missing target values in the data which will be excluded for further analysis", ) as record: automl.fit(X, y) # check that only one warning was raised self.assertEqual(len(record), 1) pred = automl.predict(X) u = np.unique(pred) self.assertTrue(np.intersect1d(u, ["a", "B", "CC", "d"]).shape[0] > 0) self.assertTrue(len(u) <= 4) def test_regression(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = np.random.rand(self.rows) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) pred = automl.predict(X) self.assertIsInstance(pred, np.ndarray) self.assertEqual(len(pred), X.shape[0]) def test_regression_missing_target(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = pd.Series(np.random.rand(self.rows), name="target") y.iloc[1] = None automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) with pytest.warns( match="There are samples with missing target values in the data which will be excluded for further analysis" ) as record: automl.fit(X, y) self.assertEqual(len(record), 1) pred = automl.predict(X) self.assertIsInstance(pred, np.ndarray) self.assertEqual(len(pred), X.shape[0]) def test_predict_on_empty_dataframe(self): X = np.random.rand(self.rows, 3) X = pd.DataFrame(X, columns=[f"f{i}" for i in range(3)]) y = pd.Series(np.random.rand(self.rows), name="target") automl = AutoML( results_path=self.automl_dir, total_time_limit=1, algorithms=["Xgboost"], train_ensemble=False, explain_level=0, start_random_models=1, ) automl.fit(X, y) with self.assertRaises(AutoMLException) as context: pred = automl.predict(pd.DataFrame()) with self.assertRaises(AutoMLException) as context: pred = automl.predict(np.empty(shape=(0, 3))) ``` -------------------------------------------------------------------------------- /supervised/preprocessing/goldenfeatures_transformer.py: -------------------------------------------------------------------------------- ```python import itertools import json import os import time import numpy as np import pandas as pd from joblib import Parallel, delayed from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from supervised.algorithms.registry import ( BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION, ) from supervised.exceptions import AutoMLException from supervised.utils.jsonencoder import MLJSONEncoder def get_binary_score(X_train, y_train, X_test, y_test): clf = DecisionTreeClassifier(max_depth=3) clf.fit(X_train, y_train) pred = clf.predict_proba(X_test)[:, 1] ll = log_loss(y_test, pred) return ll def get_regression_score(X_train, y_train, X_test, y_test): clf = DecisionTreeRegressor(max_depth=3) clf.fit(X_train, y_train) pred = clf.predict(X_test) ll = mean_squared_error(y_test, pred) 
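    # a lower MSE from this shallow (depth-3) tree means the single candidate feature
    # predicts the target better, so it ranks higher when golden features are selected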
return ll def get_multiclass_score(X_train, y_train, X_test, y_test): clf = DecisionTreeClassifier(max_depth=3) clf.fit(X_train, y_train) pred = clf.predict_proba(X_test) ll = log_loss(y_test, pred) return ll def get_score(item): col1 = item[0] col2 = item[1] X_train = item[2] y_train = item[3] X_test = item[4] y_test = item[5] scorer = item[6] try: x_train = np.array(X_train[col1] - X_train[col2]).reshape(-1, 1) x_test = np.array(X_test[col1] - X_test[col2]).reshape(-1, 1) diff_score = scorer(x_train, y_train, x_test, y_test) except Exception as e: diff_score = None print(str(e)) try: a, b = ( np.array(X_train[col1], dtype=float), np.array(X_train[col2], dtype=float), ) x_train = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) a, b = np.array(X_test[col1], dtype=float), np.array(X_test[col2], dtype=float) x_test = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) ratio_1_score = scorer(x_train, y_train, x_test, y_test) except Exception as e: print(str(e)) ratio_1_score = None try: b, a = ( np.array(X_train[col1], dtype=float), np.array(X_train[col2], dtype=float), ) x_train = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) b, a = np.array(X_test[col1], dtype=float), np.array(X_test[col2], dtype=float) x_test = np.divide(a, b, out=np.zeros_like(a), where=b != 0).reshape(-1, 1) ratio_2_score = scorer(x_train, y_train, x_test, y_test) except Exception as e: print(str(e)) ratio_2_score = None try: x_train = np.array(X_train[col1] + X_train[col2]).reshape(-1, 1) x_test = np.array(X_test[col1] + X_test[col2]).reshape(-1, 1) sum_score = scorer(x_train, y_train, x_test, y_test) except Exception as e: sum_score = None print(str(e)) try: x_train = np.array(X_train[col1] * X_train[col2]).reshape(-1, 1) x_test = np.array(X_test[col1] * X_test[col2]).reshape(-1, 1) multiply_score = scorer(x_train, y_train, x_test, y_test) except Exception as e: multiply_score = None print(str(e)) return (diff_score, ratio_1_score, ratio_2_score, sum_score, multiply_score) class GoldenFeaturesTransformer(object): def __init__(self, results_path=None, ml_task=None, features_count=None, n_jobs=-1): self._new_features = [] self._new_columns = [] self._ml_task = ml_task self._features_count = features_count self._n_jobs = n_jobs self._scorer = None if self._ml_task == BINARY_CLASSIFICATION: self._scorer = get_binary_score elif self._ml_task == MULTICLASS_CLASSIFICATION: self._scorer = get_multiclass_score else: self._scorer = get_regression_score self._error = None self._result_file = "golden_features.json" if results_path is not None: self._result_path = os.path.join(results_path, self._result_file) if os.path.exists(self._result_path): with open(self._result_path, "r") as file: self.from_json(json.load(file), results_path) def fit(self, X, y): if self._new_features: return if self._error is not None and self._error: raise AutoMLException( "Golden Features not created due to error (please check errors.md). " + self._error ) return if X.shape[1] == 0: self._error = f"Golden Features not created. No continous features. Input data shape: {X.shape}, {y.shape}" self.save() raise AutoMLException("Golden Features not created. 
No continuous features.")

        start_time = time.time()
        combinations = itertools.combinations(X.columns, r=2)
        items = [i for i in combinations]
        if len(items) > 250000:
            si = np.random.choice(len(items), 250000, replace=False)
            items = [items[i] for i in si]

        X_train, X_test, y_train, y_test = self._subsample(X, y)

        for i in range(len(items)):
            items[i] += (X_train, y_train, X_test, y_test, self._scorer)

        scores = []
        # parallel version
        scores = Parallel(n_jobs=self._n_jobs, backend="loky")(
            delayed(get_score)(i) for i in items
        )

        # single process version
        # for item in items:
        #     scores += [get_score(item)]

        if not scores:
            self._error = f"Golden Features not created. Empty scores. Input data shape: {X.shape}, {y.shape}"
            self.save()
            raise AutoMLException("Golden Features not created. Empty scores.")

        result = []
        for i in range(len(items)):
            if scores[i][0] is not None:
                result += [(items[i][0], items[i][1], "diff", scores[i][0])]
            if scores[i][1] is not None:
                result += [(items[i][0], items[i][1], "ratio", scores[i][1])]
            if scores[i][2] is not None:
                result += [(items[i][1], items[i][0], "ratio", scores[i][2])]
            if scores[i][3] is not None:
                result += [(items[i][1], items[i][0], "sum", scores[i][3])]
            if scores[i][4] is not None:
                result += [(items[i][1], items[i][0], "multiply", scores[i][4])]

        df = pd.DataFrame(
            result, columns=["feature1", "feature2", "operation", "score"]
        )
        df.sort_values(by="score", inplace=True)

        new_cols_cnt = np.min([100, np.max([10, int(0.1 * X.shape[1])])])

        if (
            self._features_count is not None
            and self._features_count > 0
            and self._features_count < df.shape[0]
        ):
            new_cols_cnt = self._features_count
            print(self._features_count, new_cols_cnt)

        self._new_features = json.loads(df.head(new_cols_cnt).to_json(orient="records"))

        for new_feature in self._new_features:
            new_col = "_".join(
                [
                    new_feature["feature1"],
                    new_feature["operation"],
                    new_feature["feature2"],
                ]
            )
            self._new_columns += [new_col]
            print(f"Add Golden Feature: {new_col}")

        self.save()

        print(
            f"Created {len(self._new_features)} Golden Features in {np.round(time.time() - start_time,2)} seconds."
) def transform(self, X): for new_feature in self._new_features: new_col = "_".join( [ new_feature["feature1"], new_feature["operation"], new_feature["feature2"], ] ) if new_feature["operation"] == "diff": X[new_col] = X[new_feature["feature1"]] - X[new_feature["feature2"]] elif new_feature["operation"] == "ratio": a, b = ( np.array(X[new_feature["feature1"]], dtype=float), np.array(X[new_feature["feature2"]], dtype=float), ) X[new_col] = np.divide( a, b, out=np.zeros_like(a), where=b != 0 ).reshape(-1, 1) elif new_feature["operation"] == "sum": X[new_col] = X[new_feature["feature1"]] + X[new_feature["feature2"]] elif new_feature["operation"] == "multiply": X[new_col] = X[new_feature["feature1"]] * X[new_feature["feature2"]] return X def to_json(self): data_json = { "new_features": self._new_features, "new_columns": self._new_columns, "ml_task": self._ml_task, } if self._error is not None and self._error: data_json["error"] = self._error return data_json def from_json(self, data_json, results_path): self._new_features = data_json.get("new_features", []) self._new_columns = data_json.get("new_columns", []) self._ml_task = data_json.get("ml_task") self._error = data_json.get("error") self._result_path = os.path.join(results_path, self._result_file) def save(self): with open(self._result_path, "w") as fout: fout.write(json.dumps(self.to_json(), indent=4, cls=MLJSONEncoder)) def _subsample(self, X, y): MAX_SIZE = 10000 TRAIN_SIZE = 2500 shuffle = True stratify = None if X.shape[0] > MAX_SIZE: if self._ml_task != REGRESSION: stratify = y X_train, _, y_train, _ = train_test_split( X, y, train_size=MAX_SIZE, shuffle=shuffle, stratify=stratify, random_state=1, ) if self._ml_task != REGRESSION: stratify = y_train X_train, X_test, y_train, y_test = train_test_split( X_train, y_train, train_size=TRAIN_SIZE, shuffle=shuffle, stratify=stratify, random_state=1, ) else: if self._ml_task != REGRESSION: stratify = y train_size = X.shape[0] // 4 X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=train_size, shuffle=shuffle, stratify=stratify, random_state=1, ) return X_train, X_test, y_train, y_test ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/tuner.py: -------------------------------------------------------------------------------- ```python import json import os import warnings import joblib import matplotlib import optuna from matplotlib import pyplot as plt from supervised.exceptions import AutoMLException from supervised.preprocessing.preprocessing_utils import PreprocessingUtils from supervised.tuner.optuna.catboost import CatBoostObjective from supervised.tuner.optuna.extra_trees import ExtraTreesObjective from supervised.tuner.optuna.knn import KNNObjective from supervised.tuner.optuna.lightgbm import LightgbmObjective from supervised.tuner.optuna.nn import NeuralNetworkObjective from supervised.tuner.optuna.random_forest import RandomForestObjective from supervised.tuner.optuna.xgboost import XgboostObjective from supervised.utils.jsonencoder import MLJSONEncoder from supervised.utils.metric import Metric class OptunaTuner: def __init__( self, results_path, ml_task, eval_metric, time_budget=3600, init_params={}, verbose=True, n_jobs=-1, random_state=42, ): if eval_metric.name not in [ "auc", "logloss", "rmse", "mse", "mae", "mape", "r2", "spearman", "pearson", "f1", "average_precision", "accuracy", "user_defined_metric", ]: raise AutoMLException(f"Metric {eval_metric.name} is not supported") self.study_dir = 
os.path.join(results_path, "optuna") if not os.path.exists(self.study_dir): try: os.mkdir(self.study_dir) except Exception as e: print("Problem while creating directory for optuna studies.", str(e)) self.tuning_fname = os.path.join(self.study_dir, "optuna.json") self.tuning = init_params self.eval_metric = eval_metric self.direction = ( "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize" ) self.n_warmup_steps = ( 500 # set large enough to give small learning rates a chance ) self.time_budget = time_budget self.verbose = verbose self.ml_task = ml_task self.n_jobs = n_jobs self.random_state = random_state self.cat_features_indices = [] self.load() if not self.verbose: optuna.logging.set_verbosity(optuna.logging.CRITICAL) @staticmethod def is_optimizable(algorithm_name): return algorithm_name in [ "Extra Trees", "Random Forest", "CatBoost", "Xgboost", "LightGBM", "Nearest Neighbors", "Neural Network", ] def optimize( self, algorithm, data_type, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, learner_params, ): # only tune models with original data type if data_type != "original": return learner_params key = f"{data_type}_{algorithm}" if key in self.tuning: return self.update_learner_params(learner_params, self.tuning[key]) if self.verbose: print( f"Optuna optimizes {algorithm} with time budget {self.time_budget} seconds " f"eval_metric {self.eval_metric.name} ({self.direction})" ) self.cat_features_indices = [] for i in range(X_train.shape[1]): if PreprocessingUtils.is_categorical(X_train.iloc[:, i]): self.cat_features_indices += [i] study = optuna.create_study( direction=self.direction, sampler=optuna.samplers.TPESampler(seed=self.random_state), pruner=optuna.pruners.MedianPruner(n_warmup_steps=self.n_warmup_steps), ) obejctive = None if algorithm == "LightGBM": objective = LightgbmObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.cat_features_indices, self.n_jobs, self.random_state, ) elif algorithm == "Xgboost": objective = XgboostObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.n_jobs, self.random_state, ) elif algorithm == "CatBoost": objective = CatBoostObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.cat_features_indices, self.n_jobs, self.random_state, ) elif algorithm == "Random Forest": objective = RandomForestObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.n_jobs, self.random_state, ) elif algorithm == "Extra Trees": objective = ExtraTreesObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.n_jobs, self.random_state, ) elif algorithm == "Nearest Neighbors": objective = KNNObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.n_jobs, self.random_state, ) elif algorithm == "Neural Network": objective = NeuralNetworkObjective( self.ml_task, X_train, y_train, sample_weight, X_validation, y_validation, sample_weight_validation, self.eval_metric, self.n_jobs, self.random_state, ) study.optimize( objective, n_trials=5000, timeout=self.time_budget, gc_after_trial=True ) self.plot_study(algorithm, data_type, study) joblib.dump(study, 
os.path.join(self.study_dir, key + ".joblib")) best = study.best_params if algorithm == "LightGBM": best["metric"] = objective.eval_metric_name best["custom_eval_metric_name"] = objective.custom_eval_metric_name best["num_boost_round"] = objective.rounds best["early_stopping_rounds"] = objective.early_stopping_rounds # best["learning_rate"] = objective.learning_rate best["cat_feature"] = self.cat_features_indices best["feature_pre_filter"] = False best["seed"] = objective.seed elif algorithm == "CatBoost": best["eval_metric"] = objective.eval_metric_name best["num_boost_round"] = objective.rounds best["early_stopping_rounds"] = objective.early_stopping_rounds # best["bootstrap_type"] = "Bernoulli" # best["learning_rate"] = objective.learning_rate best["seed"] = objective.seed elif algorithm == "Xgboost": best["objective"] = objective.objective best["eval_metric"] = objective.eval_metric_name # best["eta"] = objective.learning_rate best["max_rounds"] = objective.rounds best["early_stopping_rounds"] = objective.early_stopping_rounds best["seed"] = objective.seed elif algorithm == "Extra Trees": # Extra Trees are not using early stopping best["max_steps"] = objective.max_steps # each step has 100 trees best["seed"] = objective.seed best["eval_metric_name"] = self.eval_metric.name elif algorithm == "Random Forest": # Random Forest is not using early stopping best["max_steps"] = objective.max_steps # each step has 100 trees best["seed"] = objective.seed best["eval_metric_name"] = self.eval_metric.name elif algorithm == "Nearest Neighbors": best["rows_limit"] = 100000 elif algorithm == "Neural Network": pass self.tuning[key] = best self.save() return self.update_learner_params(learner_params, best) def update_learner_params(self, learner_params, best): for k, v in best.items(): learner_params[k] = v return learner_params def save(self): with open(self.tuning_fname, "w") as fout: fout.write(json.dumps(self.tuning, indent=4, cls=MLJSONEncoder)) def load(self): if os.path.exists(self.tuning_fname): params = json.loads(open(self.tuning_fname).read()) for k, v in params.items(): self.tuning[k] = v def plot_study(self, algorithm, data_type, study): key = f"{data_type}_{algorithm}" plots = [ ( optuna.visualization.matplotlib.plot_optimization_history, "optimization_history", ), ( optuna.visualization.matplotlib.plot_parallel_coordinate, "parallel_coordinate", ), ( optuna.visualization.matplotlib.plot_param_importances, "param_importances", ), # (optuna.visualization.matplotlib.plot_slice, "slice"), ] matplotlib_default_figsize = matplotlib.rcParams["figure.figsize"] matplotlib.rcParams["figure.figsize"] = (11, 7) md = f"# Optuna tuning for {algorithm} on {data_type} data\n\n" for plot, title in plots: try: with warnings.catch_warnings(): warnings.simplefilter("ignore") plt.figure() plt.rcParams["axes.grid"] = title != "parallel_coordinate" plot(study) plt.tight_layout(pad=2.0) fname = f"{key}_{title}.png" plt.savefig(os.path.join(self.study_dir, fname)) plt.close("all") md += f'## {algorithm} {title.replace("_", " ").title()}\n\n' md += f"\n\n" except Exception as e: print(str(e)) matplotlib.rcParams["figure.figsize"] = matplotlib_default_figsize plt.style.use("default") with open(os.path.join(self.study_dir, "README.md"), "a") as fout: fout.write(md) fout.write("\n\n[<< Go back](../README.md)\n") ``` -------------------------------------------------------------------------------- /supervised/algorithms/lightgbm.py: -------------------------------------------------------------------------------- 
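The file below wraps LightGBM. Two module-level helpers near the top, `lightgbm_objective` and `lightgbm_eval_metric`, translate the AutoML task and metric into LightGBM's objective and metric names, returning `"custom"` together with the metric name whenever a hand-written evaluation function has to be plugged in via `feval`. The snippet is an editor's illustrative sketch, not part of the repository, and assumes the package is importable as `supervised`:

```python
# Illustrative sketch only (not repository code); assumes mljar-supervised is installed.
from supervised.algorithms.lightgbm import lightgbm_eval_metric, lightgbm_objective
from supervised.algorithms.registry import BINARY_CLASSIFICATION, REGRESSION

print(lightgbm_objective(BINARY_CLASSIFICATION, "auc"))    # -> "binary"
print(lightgbm_eval_metric(BINARY_CLASSIFICATION, "auc"))  # -> ("auc", None)
print(lightgbm_eval_metric(REGRESSION, "r2"))              # -> ("custom", "r2")
```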
```python import contextlib import copy import logging import os import lightgbm as lgb import numpy as np import pandas as pd from sklearn.base import ClassifierMixin, RegressorMixin from supervised.algorithms.algorithm import BaseAlgorithm from supervised.algorithms.registry import ( BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION, AlgorithmsRegistry, ) from supervised.utils.config import LOG_LEVEL from supervised.utils.metric import ( lightgbm_eval_metric_accuracy, lightgbm_eval_metric_average_precision, lightgbm_eval_metric_f1, lightgbm_eval_metric_pearson, lightgbm_eval_metric_r2, lightgbm_eval_metric_spearman, lightgbm_eval_metric_user_defined, ) logger = logging.getLogger(__name__) logger.setLevel(LOG_LEVEL) def lightgbm_objective(ml_task, automl_eval_metric): objective = "regression" if ml_task == BINARY_CLASSIFICATION: objective = "binary" elif ml_task == MULTICLASS_CLASSIFICATION: objective = "multiclass" else: # ml_task == REGRESSION objective = "regression" return objective def lightgbm_eval_metric(ml_task, automl_eval_metric): if automl_eval_metric == "user_defined_metric": return "custom", automl_eval_metric metric_name_mapping = { BINARY_CLASSIFICATION: { "auc": "auc", "logloss": "binary_logloss", "f1": "custom", "average_precision": "custom", "accuracy": "custom", }, MULTICLASS_CLASSIFICATION: { "logloss": "multi_logloss", "f1": "custom", "accuracy": "custom", }, REGRESSION: { "rmse": "rmse", "mse": "l2", "mae": "l1", "mape": "mape", "r2": "custom", "spearman": "custom", "pearson": "custom", }, } metric = metric_name_mapping[ml_task][automl_eval_metric] custom_eval_metric = None if automl_eval_metric in [ "r2", "spearman", "pearson", "f1", "average_precision", "accuracy", ]: custom_eval_metric = automl_eval_metric return metric, custom_eval_metric class LightgbmAlgorithm(BaseAlgorithm): algorithm_name = "LightGBM" algorithm_short_name = "LightGBM" def __init__(self, params): super(LightgbmAlgorithm, self).__init__(params) self.library_version = lgb.__version__ self.explain_level = params.get("explain_level", 0) self.rounds = additional.get("max_rounds", 10000) self.max_iters = 1 self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) n_jobs = self.params.get("n_jobs", 0) # 0 is the default for LightGBM to use all cores if n_jobs == -1: n_jobs = 0 self.learner_params = { "boosting_type": "gbdt", "objective": self.params.get("objective", "binary"), "metric": self.params.get("metric", "binary_logloss"), "num_leaves": self.params.get("num_leaves", 31), "learning_rate": self.params.get("learning_rate", 0.1), "feature_fraction": self.params.get("feature_fraction", 1.0), "bagging_fraction": self.params.get("bagging_fraction", 1.0), "min_data_in_leaf": self.params.get("min_data_in_leaf", 20), "num_threads": n_jobs, "verbose": -1, "seed": self.params.get("seed", 1), "extra_trees": self.params.get("extra_trees", False), } for extra_param in [ "lambda_l1", "lambda_l2", "bagging_freq", "feature_pre_filter", "cat_feature", "cat_l2", "cat_smooth", "max_bin", ]: if extra_param in self.params: self.learner_params[extra_param] = self.params[extra_param] if "num_boost_round" in self.params: self.rounds = self.params["num_boost_round"] if "early_stopping_rounds" in self.params: self.early_stopping_rounds = self.params["early_stopping_rounds"] if "num_class" in self.params: # multiclass classification self.learner_params["num_class"] = self.params.get("num_class") self.custom_eval_metric = None if self.params.get("custom_eval_metric_name") is not None: if 
self.params["custom_eval_metric_name"] == "r2": self.custom_eval_metric = lightgbm_eval_metric_r2 elif self.params["custom_eval_metric_name"] == "spearman": self.custom_eval_metric = lightgbm_eval_metric_spearman elif self.params["custom_eval_metric_name"] == "pearson": self.custom_eval_metric = lightgbm_eval_metric_pearson elif self.params["custom_eval_metric_name"] == "f1": self.custom_eval_metric = lightgbm_eval_metric_f1 elif self.params["custom_eval_metric_name"] == "average_precision": self.custom_eval_metric = lightgbm_eval_metric_average_precision elif self.params["custom_eval_metric_name"] == "accuracy": self.custom_eval_metric = lightgbm_eval_metric_accuracy elif self.params["custom_eval_metric_name"] == "user_defined_metric": self.custom_eval_metric = lightgbm_eval_metric_user_defined logger.debug("LightgbmLearner __init__") def file_extension(self): return "lightgbm" def update(self, update_params): pass """ def get_boosting_rounds(self, lgb_train, valid_sets, esr, max_time): if max_time is None: max_time = 3600.0 start_time = time.time() evals_result = {} model = lgb.train( self.learner_params, lgb_train, num_boost_round=2, valid_sets=valid_sets, early_stopping_rounds=esr, evals_result=evals_result, verbose_eval=False, ) time_1_iter = (time.time() - start_time) / 2.0 # 2.0 is just a scaling factor # purely heuristic iters = int(max_time / time_1_iter * 2.0) iters = max(iters, 100) iters = min(iters, 10000) return iters """ def fit( self, X, y, sample_weight=None, X_validation=None, y_validation=None, sample_weight_validation=None, log_to_file=None, max_time=None, ): lgb_train = lgb.Dataset( X.values if isinstance(X, pd.DataFrame) else X, y, weight=sample_weight, ) valid_sets = None if self.early_stopping_rounds == 0: self.model = lgb.train( self.learner_params, lgb_train, num_boost_round=self.rounds, init_model=self.model, ) else: valid_names = None esr = None if X_validation is not None and y_validation is not None: valid_sets = [ lgb_train, lgb.Dataset( X_validation.values if isinstance(X_validation, pd.DataFrame) else X_validation, y_validation, weight=sample_weight_validation, ), ] valid_names = ["train", "validation"] esr = self.early_stopping_rounds evals_result = {} # disable for now ... 
# boosting_rounds = self.get_boosting_rounds(lgb_train, valid_sets, esr, max_time) self.model = lgb.train( self.learner_params, lgb_train, num_boost_round=self.rounds, valid_sets=valid_sets, valid_names=valid_names, feval=self.custom_eval_metric, callbacks=[ lgb.early_stopping(esr, verbose=False), lgb.record_evaluation(evals_result), ], ) del lgb_train if valid_sets is not None: del valid_sets[0] del valid_sets if log_to_file is not None: metric_name = list(evals_result["train"].keys())[0] result = pd.DataFrame( { "iteration": range(len(evals_result["train"][metric_name])), "train": evals_result["train"][metric_name], "validation": evals_result["validation"][metric_name], } ) result.to_csv(log_to_file, index=False, header=False) if self.params["ml_task"] != REGRESSION: self.classes_ = np.unique(y) def is_fitted(self): return self.model is not None def predict(self, X): self.reload() return self.model.predict(X.values if isinstance(X, pd.DataFrame) else X) def copy(self): with open(os.devnull, "w") as f, contextlib.redirect_stdout(f): return copy.deepcopy(self) def save(self, model_file_path): self.model.save_model(model_file_path) self.model_file_path = model_file_path logger.debug("LightgbmAlgorithm save model to %s" % model_file_path) def load(self, model_file_path): logger.debug("LightgbmAlgorithm load model from %s" % model_file_path) self.model_file_path = model_file_path self.model = lgb.Booster(model_file=model_file_path) def get_metric_name(self): metric = self.params.get("metric") custom_metric = self.params.get("custom_eval_metric_name") if metric is None: return None if metric == "custom": return custom_metric if metric == "binary_logloss": return "logloss" elif metric == "multi_logloss": return "logloss" return metric lgbm_bin_params = { "objective": ["binary"], "num_leaves": [15, 31, 63, 95, 127], "learning_rate": [0.05, 0.1, 0.2], "feature_fraction": [0.5, 0.8, 0.9, 1.0], "bagging_fraction": [0.5, 0.8, 0.9, 1.0], "min_data_in_leaf": [5, 10, 15, 20, 30, 50], } classification_bin_default_params = { "objective": "binary", "num_leaves": 63, "learning_rate": 0.05, "feature_fraction": 0.9, "bagging_fraction": 0.9, "min_data_in_leaf": 10, } additional = { "max_rounds": 10000, "early_stopping_rounds": 50, "max_rows_limit": None, "max_cols_limit": None, } required_preprocessing = [ "missing_values_inputation", "convert_categorical", "datetime_transform", "text_transform", "target_as_integer", ] lgbm_multi_params = copy.deepcopy(lgbm_bin_params) lgbm_multi_params["objective"] = ["multiclass"] classification_multi_default_params = { "objective": "multiclass", "num_leaves": 63, "learning_rate": 0.05, "feature_fraction": 0.9, "bagging_fraction": 0.9, "min_data_in_leaf": 10, } lgbr_params = copy.deepcopy(lgbm_bin_params) lgbr_params["objective"] = ["regression"] class LgbmClassifier(ClassifierMixin, LightgbmAlgorithm): pass AlgorithmsRegistry.add( BINARY_CLASSIFICATION, LgbmClassifier, lgbm_bin_params, required_preprocessing, additional, classification_bin_default_params, ) AlgorithmsRegistry.add( MULTICLASS_CLASSIFICATION, LgbmClassifier, lgbm_multi_params, required_preprocessing, additional, classification_multi_default_params, ) regression_required_preprocessing = [ "missing_values_inputation", "convert_categorical", "datetime_transform", "text_transform", "target_scale", ] regression_default_params = { "objective": "regression", "num_leaves": 63, "learning_rate": 0.05, "feature_fraction": 0.9, "bagging_fraction": 0.9, "min_data_in_leaf": 10, } class LgbmRegressor(RegressorMixin, 
LightgbmAlgorithm):
    pass


AlgorithmsRegistry.add(
    REGRESSION,
    LgbmRegressor,
    lgbr_params,
    regression_required_preprocessing,
    additional,
    regression_default_params,
)
```
--------------------------------------------------------------------------------
/supervised/utils/shap.py:
--------------------------------------------------------------------------------

```python
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

shap_package_available = False
try:
    # I'm tired of all shap dependency hell
    # ugh
    import shap

    shap_package_available = True
except Exception:
    pass

from sklearn.preprocessing import OneHotEncoder

from supervised.algorithms.registry import (
    BINARY_CLASSIFICATION,
    MULTICLASS_CLASSIFICATION,
    REGRESSION,
)

logger = logging.getLogger(__name__)
from supervised.utils.config import LOG_LEVEL

logger.setLevel(LOG_LEVEL)

import warnings


class PlotSHAP:
    @staticmethod
    def is_available(algorithm, X_train, y_train, ml_task):
        if not shap_package_available:
            return False
        # https://github.com/mljar/mljar-supervised/issues/112 disable for NN
        # https://github.com/mljar/mljar-supervised/issues/114 disable for CatBoost
        if algorithm.algorithm_short_name in ["Baseline", "Neural Network", "CatBoost"]:
            return False
        if (
            algorithm.algorithm_short_name == "Xgboost"
            and algorithm.learner_params["booster"] == "gblinear"
        ):
            # Xgboost gblinear is not supported by SHAP
            return False
        # disable for large number of columns
        if X_train.shape[1] > 500:
            warnings.warn(
                "Disable SHAP explanations because of number of columns > 500."
            )
            return False
        if ml_task == MULTICLASS_CLASSIFICATION and len(np.unique(y_train)) > 100:
            warnings.warn(
                "Disable SHAP explanations because of large number of classes (> 100)."
            )
            return False
        if X_train.shape[0] < 20:
            warnings.warn(
                "Disable SHAP explanations because of small number of samples (< 20)."
) return False return True @staticmethod def get_explainer(algorithm, X_train): explainer = None if algorithm.algorithm_short_name in [ "Xgboost", "Decision Tree", "Random Forest", "LightGBM", "Extra Trees", "CatBoost", ]: explainer = shap.TreeExplainer(algorithm.model) elif algorithm.algorithm_short_name in ["Linear"]: explainer = shap.LinearExplainer(algorithm.model, X_train) # elif algorithm.algorithm_short_name in ["Neural Network"]: # explainer = shap.KernelExplainer(algorithm.model.predict, X_train) # slow return explainer @staticmethod def get_sample(X_validation, y_validation): # too many samples in the data, down-sample it SAMPLES_LIMIT = 1000 if X_validation.shape[0] > SAMPLES_LIMIT: X_validation.reset_index(inplace=True, drop=True) y_validation.reset_index(inplace=True, drop=True) X_vald = X_validation.sample(SAMPLES_LIMIT) y_vald = y_validation[X_vald.index] else: X_vald = X_validation y_vald = y_validation return X_vald, y_vald def get_predictions(algorithm, X_vald, y_vald, ml_task): # compute predictions on down-sampled data predictions = algorithm.predict(X_vald) if ml_task == MULTICLASS_CLASSIFICATION: oh = OneHotEncoder(sparse_output=False) y_encoded = oh.fit_transform(np.array(y_vald).reshape(-1, 1)) residua = np.sum(np.abs(np.array(y_encoded) - predictions), axis=1) else: residua = np.abs(np.array(y_vald) - predictions) df_preds = pd.DataFrame( {"res": residua, "lp": range(residua.shape[0]), "target": np.array(y_vald)}, index=X_vald.index, ) df_preds = df_preds.sort_values(by="res", ascending=False) return df_preds @staticmethod def summary(shap_values, X_vald, model_file_path, learner_name, class_names): fig = plt.gcf() classes = None if class_names is not None and len(class_names): classes = class_names with warnings.catch_warnings(): warnings.simplefilter("ignore") shap.summary_plot( shap_values, X_vald, plot_type="bar", show=False, class_names=classes ) fig.tight_layout(pad=2.0) fig.savefig(os.path.join(model_file_path, f"{learner_name}_shap_summary.png")) plt.close("all") vals = None if isinstance(shap_values, list): for sh in shap_values: v = np.abs(sh).mean(0) vals = v if vals is None else vals + v else: vals = np.abs(shap_values).mean(0) feature_importance = pd.DataFrame( list(zip(X_vald.columns, vals)), columns=["feature", "shap_importance"] ) feature_importance.sort_values( by=["shap_importance"], ascending=False, inplace=True ) feature_importance.to_csv( os.path.join(model_file_path, f"{learner_name}_shap_importance.csv"), index=False, ) @staticmethod def dependence(shap_values, X_vald, model_file_path, learner_name, file_postfix=""): with warnings.catch_warnings(): warnings.simplefilter("ignore") fig = plt.figure(figsize=(14, 7)) plots_cnt = np.min([9, X_vald.shape[1]]) cols_cnt = 3 rows_cnt = 3 if plots_cnt < 4: rows_cnt = 1 elif plots_cnt < 7: rows_cnt = 2 for i in range(plots_cnt): ax = fig.add_subplot(rows_cnt, cols_cnt, i + 1) shap.dependence_plot( f"rank({i})", shap_values, X_vald, show=False, title=f"Importance #{i+1}", ax=ax, ) fig.tight_layout(pad=2.0) fig.savefig( os.path.join( model_file_path, f"{learner_name}_shap_dependence{file_postfix}.png" ) ) plt.close("all") @staticmethod def compute( algorithm, X_train, y_train, X_validation, y_validation, model_file_path, learner_name, class_names, ml_task, ): if not PlotSHAP.is_available(algorithm, X_train, y_train, ml_task): return try: with warnings.catch_warnings(): warnings.simplefilter("ignore") explainer = PlotSHAP.get_explainer(algorithm, X_train) X_vald, y_vald = PlotSHAP.get_sample(X_validation, 
y_validation) shap_values = explainer.shap_values(X_vald) # fix problem with 1 or 2 dimensions for binary classification expected_value = explainer.expected_value if ml_task == BINARY_CLASSIFICATION and isinstance(shap_values, list): shap_values = shap_values[1] expected_value = explainer.expected_value[1] # Summary SHAP plot PlotSHAP.summary( shap_values, X_vald, model_file_path, learner_name, class_names ) # Dependence SHAP plots if ml_task == MULTICLASS_CLASSIFICATION: for t in np.unique(y_vald): PlotSHAP.dependence( shap_values[t], X_vald, model_file_path, learner_name, f"_class_{class_names[t]}", ) else: PlotSHAP.dependence(shap_values, X_vald, model_file_path, learner_name) # Decision SHAP plots df_preds = PlotSHAP.get_predictions(algorithm, X_vald, y_vald, ml_task) if ml_task == REGRESSION: PlotSHAP.decisions_regression( df_preds, shap_values, expected_value, X_vald, y_vald, model_file_path, learner_name, ) elif ml_task == BINARY_CLASSIFICATION: PlotSHAP.decisions_binary( df_preds, shap_values, expected_value, X_vald, y_vald, model_file_path, learner_name, ) else: PlotSHAP.decisions_multiclass( df_preds, shap_values, expected_value, X_vald, y_vald, model_file_path, learner_name, class_names, ) except Exception as e: pass # print( # f"Exception while producing SHAP explanations. {str(e)}\nContinuing ..." # ) @staticmethod def decisions_regression( df_preds, shap_values, expected_value, X_vald, y_vald, model_file_path, learner_name, ): fig = plt.gcf() shap.decision_plot( expected_value, shap_values[df_preds.lp[:10], :], X_vald.loc[df_preds.index[:10]], show=False, ) fig.tight_layout(pad=2.0) fig.savefig( os.path.join(model_file_path, f"{learner_name}_shap_worst_decisions.png") ) plt.close("all") fig = plt.gcf() shap.decision_plot( expected_value, shap_values[df_preds.lp[-10:], :], X_vald.loc[df_preds.index[-10:]], show=False, ) fig.tight_layout(pad=2.0) fig.savefig( os.path.join(model_file_path, f"{learner_name}_shap_best_decisions.png") ) plt.close("all") @staticmethod def decisions_binary( df_preds, shap_values, expected_value, X_vald, y_vald, model_file_path, learner_name, ): # classes are from 0 ... 
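        # For every class value two decision plots are saved: the 10 validation samples
        # with the largest residuals ("worst" decisions, head of df_preds) and the 10
        # with the smallest residuals ("best" decisions, tail of df_preds).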
for t in np.unique(y_vald): fig = plt.gcf() shap.decision_plot( expected_value, shap_values[df_preds[df_preds.target == t].lp[:10], :], X_vald.loc[df_preds[df_preds.target == t].index[:10]], show=False, ) fig.tight_layout(pad=2.0) fig.savefig( os.path.join( model_file_path, f"{learner_name}_shap_class_{t}_worst_decisions.png", ) ) plt.close("all") fig = plt.gcf() shap.decision_plot( expected_value, shap_values[df_preds[df_preds.target == t].lp[-10:], :], X_vald.loc[df_preds[df_preds.target == t].index[-10:]], show=False, ) fig.tight_layout(pad=2.0) fig.savefig( os.path.join( model_file_path, f"{learner_name}_shap_class_{t}_best_decisions.png" ) ) plt.close("all") @staticmethod def decisions_multiclass( df_preds, shap_values, expected_value, X_vald, y_vald, model_file_path, learner_name, class_names, ): for decision_type in ["worst", "best"]: m = 1 if decision_type == "worst" else -1 for i in range(4): fig = plt.gcf() shap.multioutput_decision_plot( list(expected_value), shap_values, row_index=df_preds.lp.iloc[m * i], show=False, legend_labels=class_names, title=f"It should be {class_names[df_preds.target.iloc[m*i]]}", ) fig.tight_layout(pad=2.0) fig.savefig( os.path.join( model_file_path, f"{learner_name}_sample_{i}_{decision_type}_decisions.png", ) ) plt.close("all") ``` -------------------------------------------------------------------------------- /supervised/algorithms/xgboost.py: -------------------------------------------------------------------------------- ```python import copy import logging import numpy as np import pandas as pd import xgboost as xgb from sklearn.base import ClassifierMixin, RegressorMixin from supervised.algorithms.algorithm import BaseAlgorithm from supervised.algorithms.registry import ( BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION, AlgorithmsRegistry, ) from supervised.utils.config import LOG_LEVEL from supervised.utils.metric import ( xgboost_eval_metric_accuracy, xgboost_eval_metric_average_precision, xgboost_eval_metric_f1, xgboost_eval_metric_mse, xgboost_eval_metric_pearson, xgboost_eval_metric_r2, xgboost_eval_metric_spearman, xgboost_eval_metric_user_defined, ) logger = logging.getLogger(__name__) logger.setLevel(LOG_LEVEL) class XgbAlgorithmException(Exception): def __init__(self, message): super(XgbAlgorithmException, self).__init__(message) logger.error(message) def time_constraint(env): # print("time constraint") pass def xgboost_eval_metric(ml_task, automl_eval_metric): # the mapping is almost the same eval_metric_name = automl_eval_metric if ml_task == MULTICLASS_CLASSIFICATION: if automl_eval_metric == "logloss": eval_metric_name = "mlogloss" return eval_metric_name def xgboost_objective(ml_task, automl_eval_metric): objective = "reg:squarederror" if ml_task == BINARY_CLASSIFICATION: objective = "binary:logistic" elif ml_task == MULTICLASS_CLASSIFICATION: objective = "multi:softprob" else: # ml_task == REGRESSION objective = "reg:squarederror" return objective class XgbAlgorithm(BaseAlgorithm): """ This is a wrapper over xgboost algorithm. 
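    It trains through the native ``xgb.train`` API on ``DMatrix`` inputs, supports
    sample weights, optional early stopping on a validation set, and the custom
    evaluation metrics defined in ``supervised.utils.metric``.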
""" algorithm_name = "Extreme Gradient Boosting" algorithm_short_name = "Xgboost" def __init__(self, params): super(XgbAlgorithm, self).__init__(params) self.library_version = xgb.__version__ self.explain_level = params.get("explain_level", 0) self.boosting_rounds = additional.get("max_rounds", 10000) self.max_iters = 1 self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) self.learner_params = { "tree_method": "hist", "booster": "gbtree", "objective": self.params.get("objective"), "eval_metric": self.params.get("eval_metric"), "eta": self.params.get("eta", 0.01), "max_depth": self.params.get("max_depth", 1), "min_child_weight": self.params.get("min_child_weight", 1), "subsample": self.params.get("subsample", 0.8), "colsample_bytree": self.params.get("colsample_bytree", 0.8), "n_jobs": self.params.get("n_jobs", -1), # "silent": self.params.get("silent", 1), "seed": self.params.get("seed", 1), "verbosity": 0, } if "lambda" in self.params: self.learner_params["lambda"] = self.params["lambda"] if "alpha" in self.params: self.learner_params["alpha"] = self.params["alpha"] # check https://github.com/dmlc/xgboost/issues/5637 if self.learner_params["seed"] > 2147483647: self.learner_params["seed"] = self.learner_params["seed"] % 2147483647 if "num_class" in self.params: # multiclass classification self.learner_params["num_class"] = self.params.get("num_class") if "max_rounds" in self.params: self.boosting_rounds = self.params["max_rounds"] self.custom_eval_metric = None if self.params.get("eval_metric", "") == "r2": self.custom_eval_metric = xgboost_eval_metric_r2 elif self.params.get("eval_metric", "") == "spearman": self.custom_eval_metric = xgboost_eval_metric_spearman elif self.params.get("eval_metric", "") == "pearson": self.custom_eval_metric = xgboost_eval_metric_pearson elif self.params.get("eval_metric", "") == "f1": self.custom_eval_metric = xgboost_eval_metric_f1 elif self.params.get("eval_metric", "") == "average_precision": self.custom_eval_metric = xgboost_eval_metric_average_precision elif self.params.get("eval_metric", "") == "accuracy": self.custom_eval_metric = xgboost_eval_metric_accuracy elif self.params.get("eval_metric", "") == "mse": self.custom_eval_metric = xgboost_eval_metric_mse elif self.params.get("eval_metric", "") == "user_defined_metric": self.custom_eval_metric = xgboost_eval_metric_user_defined logger.debug("XgbLearner __init__") """ def get_boosting_rounds(self, dtrain, evals, esr, max_time): if max_time is None: return self.boosting_rounds start_time = time.time() evals_result = {} model = xgb.train( self.learner_params, dtrain, 2, evals=evals, early_stopping_rounds=esr, evals_result=evals_result, verbose_eval=False, ) time_1_iter = (time.time() - start_time) / 2.0 # 2.0 is just a scaling factor # purely heuristic iters = int(max_time / time_1_iter * 2.0) iters = max(iters, 100) iters = min(iters, 10000) return iters """ def fit( self, X, y, sample_weight=None, X_validation=None, y_validation=None, sample_weight_validation=None, log_to_file=None, max_time=None, ): dtrain = xgb.DMatrix( X.values if isinstance(X, pd.DataFrame) else X, label=y, missing=np.NaN, weight=sample_weight, ) if X_validation is not None and y_validation is not None: dvalidation = xgb.DMatrix( X_validation.values if isinstance(X_validation, pd.DataFrame) else X_validation, label=y_validation, missing=np.NaN, weight=sample_weight_validation, ) else: dvalidation = None evals_result = {} evals = [] esr = None if X_validation is not None and y_validation is not None: evals = 
[(dtrain, "train"), (dvalidation, "validation")] esr = self.early_stopping_rounds # disable for now, dont have better idea how to handle time limit ... # looks like there is better not to limit the algorithm # just wait till they converge ... # boosting_rounds = self.get_boosting_rounds(dtrain, evals, esr, max_time) if self.custom_eval_metric is not None: del self.learner_params["eval_metric"] self.model = xgb.train( self.learner_params, dtrain, self.boosting_rounds, evals=evals, early_stopping_rounds=esr, evals_result=evals_result, verbose_eval=False, custom_metric=self.custom_eval_metric # callbacks=[time_constraint] # callback slows down by factor ~8 ) del dtrain del dvalidation if log_to_file is not None: metric_name = list(evals_result["train"].keys())[-1] result = pd.DataFrame( { "iteration": range(len(evals_result["train"][metric_name])), "train": evals_result["train"][metric_name], "validation": evals_result["validation"][metric_name], } ) # it a is custom metric # that is always minimized # we need to revert it if metric_name in [ "r2", "spearman", "pearson", "f1", "average_precision", "accuracy", ]: result["train"] *= -1.0 result["validation"] *= -1.0 result.to_csv(log_to_file, index=False, header=False) if self.params["ml_task"] != REGRESSION: self.classes_ = np.unique(y) # fix high memory consumption in xgboost, # waiting for release with fix # https://github.com/dmlc/xgboost/issues/5474 """ # disable, for now all learners are saved to hard disk and then deleted from RAM with tempfile.NamedTemporaryFile() as tmp: self.model.save_model(tmp.name) del self.model self.model = xgb.Booster() self.model.load_model(tmp.name) """ def is_fitted(self): return self.model is not None def predict(self, X): self.reload() if self.model is None: raise XgbAlgorithmException("Xgboost model is None") dtrain = xgb.DMatrix( X.values if isinstance(X, pd.DataFrame) else X, missing=np.NaN ) # xgboost > 2.0.0 version if hasattr(self.model, "best_iteration"): a = self.model.predict( dtrain, iteration_range=(0, self.model.best_iteration + 1) ) else: a = self.model.predict(dtrain) return a def copy(self): return copy.deepcopy(self) def save(self, model_file_path): self.model.save_model(model_file_path) self.model_file_path = model_file_path logger.debug("XgbAlgorithm save model to %s" % model_file_path) def load(self, model_file_path): logger.debug("XgbLearner load model from %s" % model_file_path) self.model = xgb.Booster() # init model self.model.load_model(model_file_path) self.model_file_path = model_file_path def file_extension(self): # we need to keep models as json files # to keep information about best_iteration return "xgboost.json" def get_metric_name(self): metric = self.params.get("eval_metric") if metric is None: return None if metric == "mlogloss": return "logloss" return metric # For binary classification target should be 0, 1. There should be no NaNs in target. 
xgb_bin_class_params = { "objective": ["binary:logistic"], "eta": [0.05, 0.075, 0.1, 0.15], "max_depth": [4, 5, 6, 7, 8, 9], "min_child_weight": [1, 5, 10, 25, 50], "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0], } classification_bin_default_params = { "objective": "binary:logistic", "eta": 0.075, "max_depth": 6, "min_child_weight": 1, "subsample": 1.0, "colsample_bytree": 1.0, } xgb_regression_params = dict(xgb_bin_class_params) xgb_regression_params["objective"] = ["reg:squarederror"] # xgb_regression_params["eval_metric"] = ["rmse", "mae", "mape"] xgb_regression_params["max_depth"] = [4, 5, 6, 7, 8, 9] xgb_multi_class_params = dict(xgb_bin_class_params) xgb_multi_class_params["objective"] = ["multi:softprob"] # xgb_multi_class_params["eval_metric"] = ["mlogloss"] classification_multi_default_params = { "objective": "multi:softprob", "eta": 0.075, "max_depth": 6, "min_child_weight": 1, "subsample": 1.0, "colsample_bytree": 1.0, } regression_default_params = { "objective": "reg:squarederror", "eta": 0.075, "max_depth": 6, "min_child_weight": 1, "subsample": 1.0, "colsample_bytree": 1.0, } additional = { "max_rounds": 10000, "early_stopping_rounds": 50, "max_rows_limit": None, "max_cols_limit": None, } required_preprocessing = [ "missing_values_inputation", "convert_categorical", "datetime_transform", "text_transform", "target_as_integer", ] class XgbClassifier(ClassifierMixin, XgbAlgorithm): pass AlgorithmsRegistry.add( BINARY_CLASSIFICATION, XgbClassifier, xgb_bin_class_params, required_preprocessing, additional, classification_bin_default_params, ) AlgorithmsRegistry.add( MULTICLASS_CLASSIFICATION, XgbClassifier, xgb_multi_class_params, required_preprocessing, additional, classification_multi_default_params, ) regression_required_preprocessing = [ "missing_values_inputation", "convert_categorical", "datetime_transform", "text_transform", "target_scale", ] class XgbRegressor(RegressorMixin, XgbAlgorithm): pass AlgorithmsRegistry.add( REGRESSION, XgbRegressor, xgb_regression_params, regression_required_preprocessing, additional, regression_default_params, ) ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl.py: -------------------------------------------------------------------------------- ```python import os import shutil import unittest from pathlib import Path import numpy as np import pandas as pd import pytest from sklearn import datasets from sklearn.decomposition import PCA from sklearn.pipeline import make_pipeline from supervised import AutoML from supervised.exceptions import AutoMLException iris = datasets.load_iris() housing = datasets.fetch_california_housing() # limit data size for faster tests housing.data = housing.data[:500] housing.target = housing.target[:500] breast_cancer = datasets.load_breast_cancer() @pytest.mark.usefixtures("data_folder") class AutoMLTest(unittest.TestCase): automl_dir = "AutoMLTest" data_folder: Path def tearDown(self): shutil.rmtree(self.automl_dir, ignore_errors=True) def setUp(self): shutil.rmtree(self.automl_dir, ignore_errors=True) def test_new_directory(self): """Directory does not exist, create it""" # Assert directory does not exist self.assertTrue(not os.path.exists(self.automl_dir)) # Create model with dir model = AutoML(results_path=self.automl_dir) # Generate data X, y = datasets.make_classification(n_samples=30) # Fit data model.fit(X, y) # AutoML only validates constructor params on `fit()` call # Assert directory was 
created self.assertTrue(os.path.exists(self.automl_dir)) def test_empty_directory(self): """Directory exists and is empty, use it""" # Assert directory does not exist self.assertTrue(not os.path.exists(self.automl_dir)) # Make dir os.mkdir(self.automl_dir) # Assert dir exists self.assertTrue(os.path.exists(self.automl_dir)) # Create automl with dir model = AutoML(results_path=self.automl_dir) # Generate data X, y = datasets.make_classification(n_samples=30) # Fit data model.fit(X, y) # AutoML only validates constructor params on `fit()` call self.assertTrue(os.path.exists(self.automl_dir)) def test_not_empty_directory(self): """ Directory exists and is not empty, there is no params.json file in it, dont use it, raise exception """ # Assert directory does not exist self.assertTrue(not os.path.exists(self.automl_dir)) # Create directory os.mkdir(self.automl_dir) # Write some content to directory open(os.path.join(self.automl_dir, "test.file"), "w").close() # Assert directory exists self.assertTrue(os.path.exists(self.automl_dir)) # Generate data X, y = datasets.make_classification(n_samples=30) # Assert than an Exception is raised with self.assertRaises(AutoMLException) as context: a = AutoML(results_path=self.automl_dir) a.fit(X, y) # AutoML only validates constructor params on `fit()` call self.assertTrue("not empty" in str(context.exception)) def test_use_directory_if_non_empty_exists_with_params_json(self): """ Directory exists and is not empty, there is params.json in it, try to load it, raise exception because of fake params.json """ # Assert directory does not exist self.assertTrue(not os.path.exists(self.automl_dir)) # Create dir os.mkdir(self.automl_dir) # Write `params.json` to directory open(os.path.join(self.automl_dir, "params.json"), "w").close() # Assert directory exists self.assertTrue(os.path.exists(self.automl_dir)) # Generate data X, y = datasets.make_classification(n_samples=30) with self.assertRaises(AutoMLException) as context: a = AutoML(results_path=self.automl_dir) a.predict(X) # AutoML tries to load on predict call self.assertTrue("Cannot load" in str(context.exception)) def test_get_params(self): """ Passes params in AutoML constructor and uses `get_params()` after fitting. Initial params must be equal to the ones returned by `get_params()`. 
""" # Create model model = AutoML( hill_climbing_steps=3, start_random_models=1, results_path=self.automl_dir ) # Get params before fit params_before_fit = model.get_params() # Generate data X, y = datasets.make_classification(n_samples=30) # Fit data model.fit(X, y) # Get params after fit params_after_fit = model.get_params() # Assert before and after params are equal self.assertEqual(params_before_fit, params_after_fit) def test_scikit_learn_pipeline_integration(self): """ Tests if AutoML is working on a scikit-learn's pipeline """ # Create dataset X, y = datasets.make_classification(n_samples=30) # apply PCA to X new_X = PCA(random_state=0).fit_transform(X) # Create default model default_model = AutoML( algorithms=["Linear"], random_state=0, results_path=self.automl_dir ) # Fit default model with transformed X and y, and predict transformed X y_pred_default = default_model.fit(new_X, y).predict(new_X) # Create pipeline with PCA and AutoML pipeline = make_pipeline( PCA(random_state=0), AutoML(algorithms=["Linear"], random_state=0) ) # Fit with original X and y and predict X y_pred_pipe = pipeline.fit(X, y).predict(X) # y_pred_default must be equal to y_pred_pipe self.assertTrue((y_pred_pipe == y_pred_default).all()) def test_predict_proba_in_regression(self): model = AutoML( explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir ) model.fit(housing.data, housing.target) with self.assertRaises(AutoMLException) as context: # Try to call predict_proba in regression task model.predict_proba(housing.data) def test_iris_dataset(self): """Tests AutoML in the iris dataset (Multiclass classification)""" model = AutoML( explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir ) score = model.fit(iris.data, iris.target).score(iris.data, iris.target) self.assertGreater(score, 0.5) def test_housing_dataset(self): """Tests AutoML in the housing dataset (Regression)""" model = AutoML( explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir ) score = model.fit(housing.data, housing.target).score( housing.data, housing.target ) self.assertGreater(score, 0.5) def test_breast_cancer_dataset(self): """Tests AutoML in the breast cancer (binary classification)""" model = AutoML( explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir ) score = model.fit(breast_cancer.data, breast_cancer.target).score( breast_cancer.data, breast_cancer.target ) self.assertGreater(score, 0.5) def test_titatic_dataset(self): """Tets AutoML in the titanic dataset (binary classification) with categorial features""" data_folder = self.data_folder automl = AutoML( algorithms=["Xgboost"], mode="Explain", results_path=self.automl_dir ) df = pd.read_csv((data_folder / "Titanic/train.csv")) X = df[df.columns[2:]] y = df["Survived"] automl.fit(X, y) test = pd.read_csv(data_folder / "Titanic/test_with_Survived.csv") test_cols = [ "Parch", "Ticket", "Fare", "Pclass", "Name", "Sex", "Age", "SibSp", "Cabin", "Embarked", ] score = automl.score(test[test_cols], test["Survived"]) self.assertGreater(score, 0.5) def test_score_without_y(self): """Tests the use of `score()` without passing y. 
Should raise AutoMLException""" model = AutoML( explain_level=0, verbose=0, random_state=1, results_path=self.automl_dir ) # Assert than an Exception is raised with self.assertRaises(AutoMLException) as context: # Try to score without passing 'y' score = model.fit(breast_cancer.data, breast_cancer.target).score( breast_cancer.data ) self.assertTrue("y must be specified" in str(context.exception)) def test_no_constructor_args(self): """Tests the use of AutoML without passing any args. Should work without any arguments""" # Create model with no arguments model = AutoML() model.results_path = self.automl_dir # Assert than an Exception is raised score = model.fit(iris.data, iris.target).score(iris.data, iris.target) self.assertGreater(score, 0.5) def test_fit_returns_self(self): """Tests if the `fit()` method returns `self`. This allows to quickly implement one-liners with AutoML""" model = AutoML() model.results_path = self.automl_dir self.assertTrue( isinstance(model.fit(iris.data, iris.target), AutoML), "`fit()` method must return 'self'", ) def test_invalid_mode(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"mode": "invalid_mode"} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_ml_task(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"ml_task": "invalid_task"} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_results_path(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"results_path": 2} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_total_time_limit(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"total_time_limit": -1} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_model_time_limit(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"model_time_limit": -1} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_algorithm_name(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"algorithms": ["Baseline", "Neural Netrk"]} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_train_ensemble(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"train_ensemble": "not bool"} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_stack_models(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"stack_models": "not bool"} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_eval_metric(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"eval_metric": "not_real_metric"} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_validation_strategy(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"validation_strategy": "test"} model.set_params(**param) with 
self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_invalid_verbose(self): model = AutoML(explain_level=0, verbose=0, results_path=self.automl_dir) param = {"verbose": -1} model.set_params(**param) with self.assertRaises(ValueError) as context: model.fit(iris.data, iris.target) def test_too_small_time_limit(self): rows = 1000000 X = np.random.uniform(size=(rows, 100)) y = np.random.randint(0, 2, size=(rows,)) automl = AutoML( results_path=self.automl_dir, total_time_limit=1, train_ensemble=False ) with self.assertRaises(AutoMLException) as context: automl.fit(X, y) ``` -------------------------------------------------------------------------------- /supervised/utils/metric.py: -------------------------------------------------------------------------------- ```python import logging log = logging.getLogger(__name__) import numpy as np import pandas as pd import scipy as sp from sklearn.metrics import ( accuracy_score, average_precision_score, f1_score, log_loss, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, ) def logloss(y_true, y_predicted, sample_weight=None): # convert predicted values to float32 to avoid warnings ll = log_loss(y_true, y_predicted.astype(np.float32), sample_weight=sample_weight) return ll def rmse(y_true, y_predicted, sample_weight=None): val = mean_squared_error(y_true, y_predicted, sample_weight=sample_weight) return np.sqrt(val) if val > 0 else -np.Inf def rmsle(y_true, y_predicted, sample_weight=None): val = mean_squared_log_error(y_true, y_predicted, sample_weight=sample_weight) return np.sqrt(val) if val > 0 else -np.Inf def negative_auc(y_true, y_predicted, sample_weight=None): val = roc_auc_score(y_true, y_predicted, sample_weight=sample_weight) return -1.0 * val def negative_r2(y_true, y_predicted, sample_weight=None): val = r2_score(y_true, y_predicted, sample_weight=sample_weight) return -1.0 * val def negative_f1(y_true, y_predicted, sample_weight=None): if isinstance(y_true, pd.DataFrame): y_true = np.array(y_true) if isinstance(y_predicted, pd.DataFrame): y_predicted = np.array(y_predicted) if len(y_predicted.shape) == 2 and y_predicted.shape[1] == 1: y_predicted = y_predicted.ravel() average = None if len(y_predicted.shape) == 1: y_predicted = (y_predicted > 0.5).astype(int) average = "binary" else: y_predicted = np.argmax(y_predicted, axis=1) average = "micro" val = f1_score(y_true, y_predicted, sample_weight=sample_weight, average=average) return -val def negative_accuracy(y_true, y_predicted, sample_weight=None): if isinstance(y_true, pd.DataFrame): y_true = np.array(y_true) if isinstance(y_predicted, pd.DataFrame): y_predicted = np.array(y_predicted) if len(y_predicted.shape) == 2 and y_predicted.shape[1] == 1: y_predicted = y_predicted.ravel() if len(y_predicted.shape) == 1: y_predicted = (y_predicted > 0.5).astype(int) else: y_predicted = np.argmax(y_predicted, axis=1) val = accuracy_score(y_true, y_predicted, sample_weight=sample_weight) return -val def negative_average_precision(y_true, y_predicted, sample_weight=None): if isinstance(y_true, pd.DataFrame): y_true = np.array(y_true) if isinstance(y_predicted, pd.DataFrame): y_predicted = np.array(y_predicted) val = average_precision_score(y_true, y_predicted, sample_weight=sample_weight) return -val def negative_spearman(y_true, y_predicted, sample_weight=None): # sample weight is ignored c, _ = sp.stats.spearmanr(y_true, y_predicted) return -c def spearman(y_true, y_predicted, 
sample_weight=None): # sample weight is ignored c, _ = sp.stats.spearmanr(y_true, y_predicted) return c def negative_pearson(y_true, y_predicted, sample_weight=None): # sample weight is ignored if isinstance(y_true, pd.DataFrame): y_true = np.array(y_true).ravel() if isinstance(y_predicted, pd.DataFrame): y_predicted = np.array(y_predicted).ravel() return -np.corrcoef(y_true, y_predicted)[0, 1] def pearson(y_true, y_predicted, sample_weight=None): return -negative_pearson(y_true, y_predicted, sample_weight) class MetricException(Exception): def __init__(self, message): Exception.__init__(self, message) log.error(message) def xgboost_eval_metric_r2(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() weight = dtrain.get_weight() if len(weight) == 0: weight = None return "r2", -r2_score(target, preds, sample_weight=weight) def xgboost_eval_metric_spearman(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() return "spearman", negative_spearman(target, preds) def xgboost_eval_metric_pearson(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() return "pearson", negative_pearson(target, preds) def xgboost_eval_metric_f1(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() weight = dtrain.get_weight() if len(weight) == 0: weight = None return "f1", negative_f1(target, preds, weight) def xgboost_eval_metric_average_precision(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() weight = dtrain.get_weight() if len(weight) == 0: weight = None return "average_precision", negative_average_precision(target, preds, weight) def xgboost_eval_metric_accuracy(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() weight = dtrain.get_weight() if len(weight) == 0: weight = None return "accuracy", negative_accuracy(target, preds, weight) def xgboost_eval_metric_mse(preds, dtrain): # Xgboost needs to minimize eval_metric target = dtrain.get_label() weight = dtrain.get_weight() if len(weight) == 0: weight = None return "mse", mean_squared_error(target, preds, sample_weight=weight) def lightgbm_eval_metric_r2(preds, dtrain): target = dtrain.get_label() weight = dtrain.get_weight() return "r2", r2_score(target, preds, sample_weight=weight), True def lightgbm_eval_metric_spearman(preds, dtrain): target = dtrain.get_label() return "spearman", -negative_spearman(target, preds), True def lightgbm_eval_metric_pearson(preds, dtrain): target = dtrain.get_label() return "pearson", -negative_pearson(target, preds), True def lightgbm_eval_metric_f1(preds, dtrain): target = dtrain.get_label() weight = dtrain.get_weight() unique_targets = np.unique(target) if len(unique_targets) > 2: cols = len(unique_targets) rows = int(preds.shape[0] / len(unique_targets)) preds = np.reshape(preds, (rows, cols), order="F") return "f1", -negative_f1(target, preds, weight), True def lightgbm_eval_metric_average_precision(preds, dtrain): target = dtrain.get_label() weight = dtrain.get_weight() return "average_precision", -negative_average_precision(target, preds, weight), True def lightgbm_eval_metric_accuracy(preds, dtrain): target = dtrain.get_label() weight = dtrain.get_weight() return "accuracy", -negative_accuracy(target, preds, weight), True class CatBoostEvalMetricSpearman(object): def get_final_error(self, error, weight): return error def is_max_optimal(self): return True def evaluate(self, approxes, target, weight): assert len(approxes) == 1 assert 
len(target) == len(approxes[0]) preds = np.array(approxes[0]) target = np.array(target) return -negative_spearman(target, preds), 0 class CatBoostEvalMetricPearson(object): def get_final_error(self, error, weight): return error def is_max_optimal(self): return True def evaluate(self, approxes, target, weight): assert len(approxes) == 1 assert len(target) == len(approxes[0]) preds = np.array(approxes[0]) target = np.array(target) return -negative_pearson(target, preds), 0 class CatBoostEvalMetricAveragePrecision(object): def get_final_error(self, error, weight): return error def is_max_optimal(self): return True def evaluate(self, approxes, target, weight): assert len(approxes) == 1 assert len(target) == len(approxes[0]) preds = np.array(approxes[0]) target = np.array(target) if weight is not None: weight = np.array(weight) return -negative_average_precision(target, preds, weight), 0 class CatBoostEvalMetricMSE(object): def get_final_error(self, error, weight): return error def is_max_optimal(self): return False def evaluate(self, approxes, target, weight): assert len(approxes) == 1 assert len(target) == len(approxes[0]) preds = np.array(approxes[0]) target = np.array(target) if weight is not None: weight = np.array(weight) return mean_squared_error(target, preds, sample_weight=weight), 0 class UserDefinedEvalMetric: # should always minimize eval_metric = mean_squared_error # set the default def set_metric(self, feval): UserDefinedEvalMetric.eval_metric = feval def __call__(self, y_true, y_predicted, sample_weight=None): return UserDefinedEvalMetric.eval_metric(y_true, y_predicted, sample_weight) def xgboost_eval_metric_user_defined(preds, dtrain): target = dtrain.get_label() weight = dtrain.get_weight() if len(weight) == 0: weight = None metric = UserDefinedEvalMetric() return "user_defined_metric", metric(target, preds, sample_weight=weight) def lightgbm_eval_metric_user_defined(preds, dtrain): target = dtrain.get_label() weight = dtrain.get_weight() metric = UserDefinedEvalMetric() return "user_defined_metric", metric(target, preds, sample_weight=weight), False class CatBoostEvalMetricUserDefined(object): def get_final_error(self, error, weight): return error def is_max_optimal(self): return False def evaluate(self, approxes, target, weight): assert len(approxes) == 1 assert len(target) == len(approxes[0]) preds = np.array(approxes[0]) target = np.array(target) if weight is not None: weight = np.array(weight) metric = UserDefinedEvalMetric() return metric(target, preds, sample_weight=weight), 0 class Metric(object): def __init__(self, params): if params is None: raise MetricException("Metric params not defined") self.params = params self.name = self.params.get("name") if self.name is None: raise MetricException("Metric name not defined") self.minimize_direction = self.name in [ "logloss", "auc", # negative auc "rmse", "mae", "mse", "r2", # negative r2 "mape", "spearman", # negative "pearson", # negative "f1", # negative "average_precision", # negative "accuracy", # negative "user_defined_metric", ] if self.name == "logloss": self.metric = logloss elif self.name == "auc": self.metric = negative_auc elif self.name == "acc": self.metric = accuracy_score elif self.name == "rmse": self.metric = rmse elif self.name == "mse": self.metric = mean_squared_error elif self.name == "mae": self.metric = mean_absolute_error elif self.name == "r2": self.metric = negative_r2 elif self.name == "mape": self.metric = mean_absolute_percentage_error elif self.name == "spearman": self.metric = 
negative_spearman elif self.name == "pearson": self.metric = negative_pearson elif self.name == "f1": self.metric = negative_f1 elif self.name == "average_precision": self.metric = negative_average_precision elif self.name == "accuracy": self.metric = negative_accuracy elif self.name == "user_defined_metric": self.metric = UserDefinedEvalMetric.eval_metric # elif self.name == "rmsle": # need to update target preprocessing # self.metric = rmsle # to assure that target is not negative ... else: raise MetricException(f"Unknown metric '{self.name}'") def __call__(self, y_true, y_predicted, sample_weight=None): return self.metric(y_true, y_predicted, sample_weight=sample_weight) def improvement(self, previous, current): if self.minimize_direction: return current < previous return current > previous def get_maximum(self): if self.minimize_direction: return 10e12 else: return -10e12 def worst_value(self): if self.minimize_direction: return np.Inf return -np.Inf def get_minimize_direction(self): return self.minimize_direction def is_negative(self): return self.name in [ "auc", "r2", "spearman", "pearson", "f1", "average_precision", "accuracy", ] @staticmethod def optimize_negative(metric_name): return metric_name in [ "auc", "r2", "spearman", "pearson", "f1", "average_precision", "accuracy", ] ``` -------------------------------------------------------------------------------- /supervised/algorithms/catboost.py: -------------------------------------------------------------------------------- ```python import copy import logging import time import numpy as np import pandas as pd from sklearn.base import ClassifierMixin, RegressorMixin from supervised.algorithms.algorithm import BaseAlgorithm from supervised.algorithms.registry import ( BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION, REGRESSION, AlgorithmsRegistry, ) from supervised.preprocessing.preprocessing_utils import PreprocessingUtils from supervised.utils.config import LOG_LEVEL from supervised.utils.metric import ( CatBoostEvalMetricAveragePrecision, CatBoostEvalMetricMSE, CatBoostEvalMetricPearson, CatBoostEvalMetricSpearman, CatBoostEvalMetricUserDefined, ) logger = logging.getLogger(__name__) logger.setLevel(LOG_LEVEL) import catboost from catboost import CatBoostClassifier, CatBoostRegressor, Pool def catboost_eval_metric(ml_task, eval_metric): if eval_metric == "user_defined_metric": return eval_metric metric_name_mapping = { BINARY_CLASSIFICATION: { "auc": "AUC", "logloss": "Logloss", "f1": "F1", "average_precision": "average_precision", "accuracy": "Accuracy", }, MULTICLASS_CLASSIFICATION: { "logloss": "MultiClass", "f1": "TotalF1:average=Micro", "accuracy": "Accuracy", }, REGRESSION: { "rmse": "RMSE", "mse": "mse", "mae": "MAE", "mape": "MAPE", "r2": "R2", "spearman": "spearman", "pearson": "pearson", }, } return metric_name_mapping[ml_task][eval_metric] def catboost_objective(ml_task, eval_metric): objective = "RMSE" if ml_task == BINARY_CLASSIFICATION: objective = "Logloss" elif ml_task == MULTICLASS_CLASSIFICATION: objective = "MultiClass" else: # ml_task == REGRESSION objective = catboost_eval_metric(REGRESSION, eval_metric) if objective in [ "mse", "R2", "spearman", "pearson", "user_defined_metric", ]: # cant optimize them directly objective = "RMSE" return objective class CatBoostAlgorithm(BaseAlgorithm): algorithm_name = "CatBoost" algorithm_short_name = "CatBoost" warmup_iterations = 20 def __init__(self, params): super(CatBoostAlgorithm, self).__init__(params) self.library_version = catboost.__version__ 
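# Note: the "additional" dict used just below (max_rounds / early_stopping_rounds defaults) is the module-level defaults dict defined near the bottom of this file; it is looked up at call time, so the forward reference is valid.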
self.snapshot_file_path = "training_snapshot" self.explain_level = params.get("explain_level", 0) self.rounds = additional.get("max_rounds", 10000) self.max_iters = 1 self.early_stopping_rounds = additional.get("early_stopping_rounds", 50) Algo = CatBoostClassifier loss_function = "Logloss" if self.params["ml_task"] == BINARY_CLASSIFICATION: loss_function = self.params.get("loss_function", "Logloss") elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: loss_function = self.params.get("loss_function", "MultiClass") elif self.params["ml_task"] == REGRESSION: loss_function = self.params.get("loss_function", "RMSE") Algo = CatBoostRegressor cat_params = { "iterations": self.params.get("num_boost_round", self.rounds), "learning_rate": self.params.get("learning_rate", 0.1), "depth": self.params.get("depth", 3), "rsm": self.params.get("rsm", 1.0), "l2_leaf_reg": self.params.get("l2_leaf_reg", 3.0), "random_strength": self.params.get("random_strength", 1.0), "loss_function": loss_function, "eval_metric": self.params.get("eval_metric", loss_function), # "custom_metric": self.params.get("eval_metric", loss_function), "thread_count": self.params.get("n_jobs", -1), "verbose": False, "allow_writing_files": False, "random_seed": self.params.get("seed", 1), } for extra_param in [ "min_data_in_leaf", "bootstrap_type", "bagging_temperature", "subsample", "border_count", ]: if extra_param in self.params: cat_params[extra_param] = self.params[extra_param] self.log_metric_name = cat_params["eval_metric"] if cat_params["eval_metric"] == "spearman": cat_params["eval_metric"] = CatBoostEvalMetricSpearman() self.log_metric_name = "CatBoostEvalMetricSpearman" elif cat_params["eval_metric"] == "pearson": cat_params["eval_metric"] = CatBoostEvalMetricPearson() self.log_metric_name = "CatBoostEvalMetricPearson" elif cat_params["eval_metric"] == "average_precision": cat_params["eval_metric"] = CatBoostEvalMetricAveragePrecision() self.log_metric_name = "CatBoostEvalMetricAveragePrecision" elif cat_params["eval_metric"] == "mse": cat_params["eval_metric"] = CatBoostEvalMetricMSE() self.log_metric_name = "CatBoostEvalMetricMSE" elif cat_params["eval_metric"] == "user_defined_metric": cat_params["eval_metric"] = CatBoostEvalMetricUserDefined() self.log_metric_name = "CatBoostEvalMetricUserDefined" self.model = Algo(**cat_params) self.cat_features = None self.best_ntree_limit = 0 logger.debug("CatBoostAlgorithm.__init__") def _assess_iterations(self, X, y, sample_weight, eval_set, max_time=None): if max_time is None: max_time = 3600 try: model = copy.deepcopy(self.model) model.set_params(iterations=self.warmup_iterations) start_time = time.time() model.fit( X, y, sample_weight=sample_weight, cat_features=self.cat_features, init_model=None if self.model.tree_count_ is None else self.model, eval_set=eval_set, early_stopping_rounds=self.early_stopping_rounds, verbose_eval=False, ) elapsed_time = (time.time() - start_time) / float(self.warmup_iterations) # print(max_time, elapsed_time, max_time / elapsed_time, np.round(time.time() - start_time, 2)) new_rounds = int(min(10000, max_time / elapsed_time)) new_rounds = max(new_rounds, 10) return model, new_rounds except Exception as e: # print(str(e)) return None, 1000 def fit( self, X, y, sample_weight=None, X_validation=None, y_validation=None, sample_weight_validation=None, log_to_file=None, max_time=None, ): if self.is_fitted(): print("CatBoost model already fitted. 
Skip fit().") return if self.cat_features is None: self.cat_features = [] for i in range(X.shape[1]): if PreprocessingUtils.is_categorical(X.iloc[:, i]): self.cat_features += [i] col_name = X.columns[i] X[col_name] = X[col_name].astype(str) if X_validation is not None: X_validation[col_name] = X_validation[col_name].astype(str) eval_set = None if X_validation is not None and y_validation is not None: eval_set = Pool( data=X_validation, label=y_validation, cat_features=self.cat_features, weight=sample_weight_validation, ) if self.params.get("num_boost_round") is None: model_init, new_iterations = self._assess_iterations( X, y, sample_weight, eval_set, max_time ) self.model.set_params(iterations=new_iterations) else: model_init = None self.model.set_params(iterations=self.params.get("num_boost_round")) self.early_stopping_rounds = self.params.get("early_stopping_rounds", 50) self.model.fit( X, y, sample_weight=sample_weight, cat_features=self.cat_features, init_model=model_init, eval_set=eval_set, early_stopping_rounds=self.early_stopping_rounds, verbose_eval=False, ) if self.model.best_iteration_ is not None: if model_init is not None: self.best_ntree_limit = ( self.model.best_iteration_ + model_init.tree_count_ + 1 ) else: self.best_ntree_limit = self.model.best_iteration_ + 1 else: # just take all the trees # the warm-up trees are already included # dont need to add +1 self.best_ntree_limit = self.model.tree_count_ if log_to_file is not None: train_scores = self.model.evals_result_["learn"].get(self.log_metric_name) validation_scores = self.model.evals_result_["validation"].get( self.log_metric_name ) if model_init is not None: if train_scores is not None: train_scores = ( model_init.evals_result_["learn"].get(self.log_metric_name) + train_scores ) if validation_scores is not None: validation_scores = ( model_init.evals_result_["validation"].get(self.log_metric_name) + validation_scores ) iteration = None if train_scores is not None: iteration = range(len(validation_scores)) elif validation_scores is not None: iteration = range(len(validation_scores)) result = pd.DataFrame( { "iteration": iteration, "train": train_scores, "validation": validation_scores, } ) result.to_csv(log_to_file, index=False, header=False) if self.params["ml_task"] != REGRESSION: self.classes_ = np.unique(y) def is_fitted(self): return self.model is not None and self.model.tree_count_ is not None def predict(self, X): self.reload() if self.params["ml_task"] == BINARY_CLASSIFICATION: return self.model.predict_proba(X, ntree_end=self.best_ntree_limit)[:, 1] elif self.params["ml_task"] == MULTICLASS_CLASSIFICATION: return self.model.predict_proba(X, ntree_end=self.best_ntree_limit) return self.model.predict(X, ntree_end=self.best_ntree_limit) def copy(self): return copy.deepcopy(self) def save(self, model_file_path): self.model.save_model(model_file_path) self.model_file_path = model_file_path logger.debug("CatBoostAlgorithm save model to %s" % model_file_path) def load(self, model_file_path): logger.debug("CatBoostLearner load model from %s" % model_file_path) # waiting for fix https://github.com/catboost/catboost/issues/696 Algo = CatBoostClassifier if self.params["ml_task"] == REGRESSION: Algo = CatBoostRegressor # loading might throw warnings in the case of custom eval_metric # check https://github.com/catboost/catboost/issues/1169 self.model = Algo().load_model(model_file_path) self.model_file_path = model_file_path def file_extension(self): return "catboost" def get_metric_name(self): metric = 
self.params.get("eval_metric") if metric is None: return None if metric == "Logloss": return "logloss" elif metric == "AUC": return "auc" elif metric == "MultiClass": return "logloss" elif metric == "RMSE": return "rmse" elif metric == "MSE": return "mse" elif metric == "MAE": return "mae" elif metric == "MAPE": return "mape" elif metric in ["F1", "TotalF1:average=Micro"]: return "f1" elif metric == "Accuracy": return "accuracy" return metric classification_params = { "learning_rate": [0.025, 0.05, 0.1, 0.2], "depth": [4, 5, 6, 7, 8, 9], "rsm": [0.7, 0.8, 0.9, 1], # random subspace method "loss_function": ["Logloss"], } classification_default_params = { "learning_rate": 0.1, "depth": 6, "rsm": 1, "loss_function": "Logloss", } additional = { "max_rounds": 10000, "early_stopping_rounds": 50, "max_rows_limit": None, "max_cols_limit": None, } required_preprocessing = [ "missing_values_inputation", "datetime_transform", "text_transform", "target_as_integer", ] class CBClassifier(ClassifierMixin, CatBoostAlgorithm): pass AlgorithmsRegistry.add( BINARY_CLASSIFICATION, CBClassifier, classification_params, required_preprocessing, additional, classification_default_params, ) multiclass_classification_params = copy.deepcopy(classification_params) multiclass_classification_params["loss_function"] = ["MultiClass"] multiclass_classification_params["depth"] = [3, 4, 5, 6] multiclass_classification_params["learning_rate"] = [0.1, 0.15, 0.2] multiclass_classification_default_params = copy.deepcopy(classification_default_params) multiclass_classification_default_params["loss_function"] = "MultiClass" multiclass_classification_default_params["depth"] = 5 multiclass_classification_default_params["learning_rate"] = 0.15 AlgorithmsRegistry.add( MULTICLASS_CLASSIFICATION, CBClassifier, multiclass_classification_params, required_preprocessing, additional, multiclass_classification_default_params, ) regression_params = copy.deepcopy(classification_params) regression_params["loss_function"] = ["RMSE", "MAE", "MAPE"] regression_required_preprocessing = [ "missing_values_inputation", "datetime_transform", "text_transform", "target_scale", ] regression_default_params = { "learning_rate": 0.1, "depth": 6, "rsm": 1, "loss_function": "RMSE", } class CBRegressor(RegressorMixin, CatBoostAlgorithm): pass AlgorithmsRegistry.add( REGRESSION, CBRegressor, regression_params, regression_required_preprocessing, additional, regression_default_params, ) ``` -------------------------------------------------------------------------------- /supervised/fairness/optimization.py: -------------------------------------------------------------------------------- ```python import numpy as np class FairnessOptimization: @staticmethod def binary_classification( target, predicted_labels, sensitive_features, fairness_metric, fairness_threshold, privileged_groups=[], underprivileged_groups=[], previous_fairness_optimization=None, min_selection_rate=None, max_selection_rate=None, ): target = np.array(target).ravel() preds = np.array(predicted_labels) # fairness optimization stats sensitive_values = {} for col in sensitive_features.columns: col_name = col[10:] # skip 'senstive_' values = list(sensitive_features[col].unique()) sensitive_values[col] = values for v in values: ii = sensitive_features[col] == v new_sensitive_values = {} for k, prev_values in sensitive_values.items(): if k == col: continue new_sensitive_values[f"{k}@{col}"] = [] for v in values: for pv in prev_values: if isinstance(pv, tuple): new_sensitive_values[f"{k}@{col}"] += [(*pv, 
v)] else: new_sensitive_values[f"{k}@{col}"] += [(pv, v)] sensitive_values = {**sensitive_values, **new_sensitive_values} # print(sensitive_values) sensitive_indices = {} for k, values_list in sensitive_values.items(): if k.count("@") == sensitive_features.shape[1] - 1: # print(k) # print("values_list",values_list) cols = k.split("@") for values in values_list: if not isinstance(values, tuple): values = (values,) # print("values", values) ii = None for i, c in enumerate(cols): if ii is None: ii = sensitive_features[c] == values[i] else: ii &= sensitive_features[c] == values[i] key = "@".join([str(s) for s in values]) # print(key, np.sum(ii)) sensitive_indices[key] = ii total_dp_ratio = min_selection_rate / max_selection_rate # print("total dp ratio", total_dp_ratio) c0 = np.sum(target == 0) c1 = np.sum(target == 1) selection_rates = {} weights = {} for key, indices in sensitive_indices.items(): selection_rates[key] = np.sum((preds == 1) & indices) / np.sum(indices) # print(key, np.sum(indices), selection_rates[key]) t = np.sum(indices) t0 = np.sum(indices & (target == 0)) t1 = np.sum(indices & (target == 1)) w0 = t / target.shape[0] * c0 / t0 w1 = t / target.shape[0] * c1 / t1 # print("----", key, w0, w1, t, t0, t1) weights[key] = [w0, w1] max_selection_rate = np.max(list(selection_rates.values())) min_selection_rate = np.min(list(selection_rates.values())) for k, v in selection_rates.items(): selection_rates[k] = v / max_selection_rate # print("previous fairness optimization") # print(previous_fairness_optimization) # print("********") previous_weights = {} if previous_fairness_optimization is not None: weights = previous_fairness_optimization.get("weights") for key, indices in sensitive_indices.items(): # print("Previous") # print(previous_fairness_optimization["selection_rates"][key], selection_rates[key]) direction = 0.0 if ( previous_fairness_optimization["selection_rates"][key] < selection_rates[key] ): # print("Improvement") direction = 1.0 elif selection_rates[key] > 0.8: # print("GOOD") direction = 0.0 else: # print("Decrease") direction = -0.5 # need to add previous weights instead 1.0 prev_weights = previous_fairness_optimization.get( "previous_weights", {} ).get(key, [1, 1]) # print("prev_weights", prev_weights) delta0 = weights[key][0] - prev_weights[0] delta1 = weights[key][1] - prev_weights[1] previous_weights[key] = [weights[key][0], weights[key][1]] # print("BEFORE") # print(weights[key]) weights[key][0] += direction * delta0 weights[key][1] += direction * delta1 # print("AFTER") # print(weights[key]) # print(previous_fairness_optimization["weights"][key]) step = None if previous_fairness_optimization is not None: step = previous_fairness_optimization.get("step") if step is None: step = 0 else: step += 1 return { "selection_rates": selection_rates, "previous_weights": previous_weights, "weights": weights, "total_dp_ratio": total_dp_ratio, "step": step, "fairness_threshold": fairness_threshold, } @staticmethod def regression( target, predictions, sensitive_features, fairness_metric, fairness_threshold, privileged_groups=[], underprivileged_groups=[], previous_fairness_optimization=None, performance_metric=None, performance_metric_name=None, ): target = np.array(target).ravel() preds = np.array(predictions) # fairness optimization stats sensitive_values = {} for col in sensitive_features.columns: col_name = col[10:] # skip 'senstive_' values = list(sensitive_features[col].unique()) sensitive_values[col] = values for v in values: ii = sensitive_features[col] == v 
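# The block below builds every cross-column combination of sensitive values: combined columns get keys like "col_a@col_b", and each concrete value combination is later keyed with "@".join(values), so every fully intersected subgroup can receive its own sample weight.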
new_sensitive_values = {} for k, prev_values in sensitive_values.items(): if k == col: continue new_sensitive_values[f"{k}@{col}"] = [] for v in values: for pv in prev_values: if isinstance(pv, tuple): new_sensitive_values[f"{k}@{col}"] += [(*pv, v)] else: new_sensitive_values[f"{k}@{col}"] += [(pv, v)] sensitive_values = {**sensitive_values, **new_sensitive_values} sensitive_indices = {} least_frequent_key = None least_frequency = sensitive_features.shape[0] for k, values_list in sensitive_values.items(): if k.count("@") == sensitive_features.shape[1] - 1: # print(k) # print("values_list",values_list) cols = k.split("@") for values in values_list: if not isinstance(values, tuple): values = (values,) # print("values", values) ii = None for i, c in enumerate(cols): if ii is None: ii = sensitive_features[c] == values[i] else: ii &= sensitive_features[c] == values[i] key = "@".join([str(s) for s in values]) if np.sum(ii) > 0: sensitive_indices[key] = ii if np.sum(ii) < least_frequency: least_frequency = np.sum(ii) least_frequent_key = key weights = {} performance = {} for key, indices in sensitive_indices.items(): w = target.shape[0] / len(sensitive_indices) / np.sum(indices) weights[key] = w performance[key] = performance_metric(target[indices], predictions[indices]) # try to upscale more the largest weight weights[least_frequent_key] *= 1.5 denominator = np.max(list(performance.values())) new_performance = {} for k, v in performance.items(): new_performance[k] = np.round(v / denominator, 4) performance = new_performance previous_weights = {} if previous_fairness_optimization is not None: weights = previous_fairness_optimization.get("weights") for key, indices in sensitive_indices.items(): direction = 0.0 if ( previous_fairness_optimization["performance"][key] < performance[key] ): direction = 1.0 elif performance[key] > fairness_threshold: direction = 0.0 else: direction = -0.5 # need to add previous weights instead 1.0 prev_weights = previous_fairness_optimization.get( "previous_weights", {} ).get(key, 1) delta0 = weights[key] - prev_weights previous_weights[key] = weights[key] weights[key] = max(weights[key] + direction * delta0, 0.01) no_weights_change = False if str(previous_weights) == str(weights): no_weights_change = True step = None if previous_fairness_optimization is not None: step = previous_fairness_optimization.get("step") if step is None: step = 0 else: if not no_weights_change: step += 1 return { "performance": performance, "previous_weights": previous_weights, "weights": weights, "step": step, "fairness_threshold": fairness_threshold, } @staticmethod def multiclass_classification( target, predicted_labels, sensitive_features, fairness_metric, fairness_threshold, privileged_groups=[], underprivileged_groups=[], previous_fairness_optimization=None, ): target = np.array(target).ravel() preds = np.array(predicted_labels) target_values = list(np.unique(target)) # fairness optimization stats sensitive_values = {} for col in sensitive_features.columns: col_name = col[10:] # skip 'senstive_' values = list(sensitive_features[col].unique()) sensitive_values[col] = values for v in values: ii = sensitive_features[col] == v new_sensitive_values = {} for k, prev_values in sensitive_values.items(): if k == col: continue new_sensitive_values[f"{k}@{col}"] = [] for v in values: for pv in prev_values: if isinstance(pv, tuple): new_sensitive_values[f"{k}@{col}"] += [(*pv, v)] else: new_sensitive_values[f"{k}@{col}"] += [(pv, v)] sensitive_values = {**sensitive_values, **new_sensitive_values} 
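# Next, only the keys that span all sensitive columns (k.count("@") == n_columns - 1) are kept, and each value combination is turned into a boolean row mask; for example (hypothetical values), columns "sensitive_sex" and "sensitive_race" would yield masks keyed like "Male@White".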
sensitive_indices = {} for k, values_list in sensitive_values.items(): if k.count("@") == sensitive_features.shape[1] - 1: cols = k.split("@") for values in values_list: if not isinstance(values, tuple): values = (values,) ii = None for i, c in enumerate(cols): if ii is None: ii = sensitive_features[c] == values[i] else: ii &= sensitive_features[c] == values[i] key = "@".join([str(s) for s in values]) sensitive_indices[key] = ii cs = {} for t in target_values: cs[t] = np.sum(target == t) selection_rates = {} weights = {} for key, indices in sensitive_indices.items(): weights[key] = [] sv = np.sum(indices) selection_rates[key] = {} for t in target_values: selection_rates[key][t] = np.sum((preds == t) & indices) / np.sum( indices ) t_k = np.sum(indices & (target == t)) w_k = sv / target.shape[0] * cs[t] / t_k weights[key] += [w_k] for t in target_values: values = [] for k, v in selection_rates.items(): values += [v[t]] max_selection_rate = np.max(values) for k, v in selection_rates.items(): v[t] /= max_selection_rate previous_weights = {} if previous_fairness_optimization is not None: weights = previous_fairness_optimization.get("weights") for key, indices in sensitive_indices.items(): previous_weights[key] = [1] * len(target_values) for i, t in enumerate(target_values): direction = 0.0 if ( previous_fairness_optimization["selection_rates"][key][t] < selection_rates[key][t] ): direction = 1.0 elif selection_rates[key][t] > 0.8: direction = 0.0 else: direction = -0.5 # need to add previous weights instead 1.0 prev_weights = previous_fairness_optimization.get( "previous_weights", {} ).get(key, [1] * len(target_values)) delta_i = weights[key][i] - prev_weights[i] previous_weights[key][i] = weights[key][i] weights[key][i] += direction * delta_i step = None if previous_fairness_optimization is not None: step = previous_fairness_optimization.get("step") if step is None: step = 0 else: step += 1 return { "selection_rates": selection_rates, "previous_weights": previous_weights, "weights": weights, "step": step, "fairness_threshold": fairness_threshold, "target_values": target_values, } ``` -------------------------------------------------------------------------------- /supervised/utils/automl_plots.py: -------------------------------------------------------------------------------- ```python import logging import os import traceback # For exception details import numpy as np import pandas as pd import scipy as sp # --- Added Import --- from sklearn.preprocessing import MinMaxScaler # -------------------- logger = logging.getLogger(__name__) from supervised.utils.config import LOG_LEVEL logger.setLevel(LOG_LEVEL) # Add a handler if running standalone for testing if not logger.hasHandlers(): handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) logger.addHandler(handler) import warnings import matplotlib.pyplot as plt warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) class AutoMLPlots: # Original filename definitions features_heatmap_fname = "features_heatmap.png" correlation_heatmap_fname = "correlation_heatmap.png" # Filename for Scaled Plot features_heatmap_scaled_fname = "features_heatmap_scaled.png" @staticmethod def _plot_feature_heatmap(data_df, title, plot_path, cmap="Blues", vmin=None, vmax=None, cbar_label='Importance'): """ Helper method to generate and save a feature importance heatmap. 
""" try: logger.info(f"Generating heatmap: '{title}'") # Adjust height dynamically based on number of features plot_height = max(7, len(data_df.index) * 0.35) fig, ax = plt.subplots(1, 1, figsize=(10, plot_height)) image = ax.imshow( data_df, interpolation="nearest", cmap=plt.cm.get_cmap(cmap), aspect="auto", vmin=vmin, # Use provided vmin vmax=vmax # Use provided vmax ) cbar = plt.colorbar(mappable=image) cbar.set_label(cbar_label) # Use provided label x_tick_marks = np.arange(len(data_df.columns)) y_tick_marks = np.arange(len(data_df.index)) ax.set_xticks(x_tick_marks) ax.set_xticklabels(data_df.columns, rotation=90) ax.set_yticks(y_tick_marks) ax.set_yticklabels(data_df.index) ax.set_title(title) plt.tight_layout(pad=2.0) plt.savefig(plot_path) logger.info(f"Saved heatmap to: {plot_path}") plt.close(fig) # Close the specific figure except Exception as e: logger.error(f"Failed to generate heatmap '{title}': {e}") logger.error(traceback.format_exc()) plt.close("all") # Close any potentially open plots on error @staticmethod def add(results_path, models, fout): """ Adds plots to the report file stream. Now includes both unscaled and scaled importance. Args: results_path (str): Path to results directory. models (list): List of model objects. fout (file object): Writable file object for the report. """ # Generate both feature importance plots AutoMLPlots.models_feature_importance(results_path, models) # --- Unscaled Feature Importance Section --- features_plot_path = os.path.join( results_path, AutoMLPlots.features_heatmap_fname # Use original filename ) if os.path.exists(features_plot_path): fout.write("\n\n### Features Importance (Original Scale)\n") # Updated title fout.write( f"\n\n" # Use original filename ) else: logger.warning(f"Original feature importance plot not found at: {features_plot_path}") # --- Scaled Feature Importance Section --- features_scaled_plot_path = os.path.join( results_path, AutoMLPlots.features_heatmap_scaled_fname # Use scaled filename ) if os.path.exists(features_scaled_plot_path): fout.write("\n\n### Scaled Features Importance (MinMax per Model)\n") # Title for scaled plot fout.write( f"\n\n" # Use scaled filename ) else: logger.warning(f"Scaled feature importance plot not found at: {features_scaled_plot_path}") # --- Correlation Section (remains the same) --- AutoMLPlots.models_correlation(results_path, models) correlation_plot_path = os.path.join( results_path, AutoMLPlots.correlation_heatmap_fname ) if os.path.exists(correlation_plot_path): fout.write("\n\n### Spearman Correlation of Models\n") fout.write( f"\n\n" ) else: logger.warning(f"Model correlation plot not found at: {correlation_plot_path}") @staticmethod def models_feature_importance(results_path, models): """ Generates and saves BOTH original and scaled feature importance heatmaps. """ logger.info("Starting feature importance generation (original and scaled).") try: # --- Data Aggregation (Common part) --- model_feature_imp = {} # (Same robust reading logic as before) for m in models: model_name = m.get_name() model_path = os.path.join(results_path, model_name) logger.debug(f"Processing model '{model_name}' in '{model_path}'") if not os.path.isdir(model_path): logger.warning(f"Directory not found for model '{model_name}'. Skipping.") continue try: all_files = os.listdir(model_path) except OSError as e: logger.error(f"Cannot list directory {model_path}: {e}. 
Skipping model '{model_name}'.") continue imp_data = [f for f in all_files if "_importance.csv" in f and "shap" not in f] if not imp_data: logger.warning(f"No suitable importance files found for model '{model_name}'. Skipping.") continue df_all = [] for fname in imp_data: file_path = os.path.join(model_path, fname) try: df = pd.read_csv(file_path, index_col=0) numeric_df = df.select_dtypes(include=np.number) if numeric_df.empty or numeric_df.isnull().all().all(): logger.warning(f"File {fname} (model '{model_name}') contains no valid numeric data. Skipping.") continue df_all.append(df) except Exception as read_e: logger.error(f"Error reading/processing file {fname} (model '{model_name}'): {read_e}. Skipping.") continue if not df_all: logger.warning(f"No valid importance dataframes read for model '{model_name}'. Skipping.") continue try: df_concat = pd.concat(df_all, axis=1, join='outer') numeric_df_concat = df_concat.select_dtypes(include=np.number) if not numeric_df_concat.empty: model_feature_imp[model_name] = numeric_df_concat.mean(axis=1).fillna(0) else: logger.warning(f"No numeric data after concat for model '{model_name}'. Skipping.") except Exception as concat_e: logger.error(f"Error aggregating importance for model '{model_name}': {concat_e}") continue logger.info(f"Collected feature importance for {len(model_feature_imp)} models.") if len(model_feature_imp) < 2: logger.warning("Feature importance heatmaps require at least 2 models with data. Skipping plot generation.") return mfi = pd.concat(model_feature_imp, axis=1, join='outer').fillna(0) logger.debug(f"Combined importance DataFrame shape: {mfi.shape}") # --- Sorting & Top N (Common part) --- mfi["m"] = mfi.mean(axis=1) mfi_sorted = mfi.sort_values(by="m", ascending=False) mfi_sorted = mfi_sorted.drop("m", axis=1) # Keep original mfi for potential later use if needed num_features_original = mfi_sorted.shape[0] mfi_plot_data = mfi_sorted # Default to using all sorted features title_suffix = "Feature Importance" scaled_title_suffix = "Scaled Feature Importance (MinMax per model)" if num_features_original > 25: mfi_plot_data = mfi_sorted.head(25) title_suffix = f"Top-25 ({num_features_original} total) Feature Importance" scaled_title_suffix = f"Top-25 ({num_features_original} total) Scaled Feature Importance (MinMax per model)" logger.info(f"Selecting top 25 features out of {num_features_original} for plotting.") else: logger.info(f"Using all {num_features_original} features for plotting.") # --- Plotting Unscaled Version --- unscaled_plot_path = os.path.join(results_path, AutoMLPlots.features_heatmap_fname) AutoMLPlots._plot_feature_heatmap( data_df=mfi_plot_data, title=title_suffix + " (Original Scale)", plot_path=unscaled_plot_path, cbar_label='Importance' # vmin/vmax are auto-detected by default ) # --- Scaling Data --- logger.debug("Applying Min-Max scaling for the second plot.") scaler = MinMaxScaler() mfi_scaled_array = scaler.fit_transform(mfi_plot_data) # Scale the potentially filtered data mfi_scaled = pd.DataFrame(mfi_scaled_array, index=mfi_plot_data.index, columns=mfi_plot_data.columns) # --- Plotting Scaled Version --- scaled_plot_path = os.path.join(results_path, AutoMLPlots.features_heatmap_scaled_fname) AutoMLPlots._plot_feature_heatmap( data_df=mfi_scaled, title=scaled_title_suffix, plot_path=scaled_plot_path, vmin=0, # Explicit range for scaled data vmax=1, cbar_label='Scaled Importance (MinMax per model)' ) logger.info("Finished generating feature importance plots.") except Exception as e: logger.error(f"An 
error occurred during feature importance processing: {e}") logger.error(traceback.format_exc()) plt.close("all") # Ensure plots are closed on unexpected error # --- correlation and models_correlation methods remain the same as in the previous version --- # (Include the improved versions from the previous response here) @staticmethod def correlation(oof1, oof2): """ Calculates mean Spearman correlation between prediction columns """ # (Original code - unchanged) cols = [c for c in oof1.columns if "prediction" in c] # Check if prediction columns exist if not cols or not all(c in oof2.columns for c in cols): logger.warning("Prediction columns mismatch or not found for correlation calculation.") return np.nan # Return NaN if predictions can't be compared with warnings.catch_warnings(): warnings.simplefilter(action="ignore") v = [] for c in cols: try: # Calculate Spearman correlation, ignore p-value corr_val, _ = sp.stats.spearmanr(oof1[c], oof2[c]) # Handle potential NaN result from spearmanr if input variance is zero if not np.isnan(corr_val): v.append(corr_val) else: logger.debug(f"NaN result from spearmanr for column {c}. Skipping.") except Exception as corr_e: logger.warning(f"Could not calculate Spearman correlation for column {c}: {corr_e}") # Return mean correlation, or NaN if no valid correlations were calculated return np.mean(v) if v else np.nan @staticmethod def models_correlation(results_path, models): """ Generates and saves model prediction correlation heatmap """ # (Original code - minor logging/error handling improvements) logger.info("Starting model correlation heatmap generation.") try: if len(models) < 2: logger.warning("Model correlation heatmap requires at least 2 models. Skipping.") return names = [] oofs = [] valid_models_indices = [] # Keep track of models with valid OOF data for i, m in enumerate(models): try: oof_data = m.get_out_of_folds() # Basic validation of OOF data if oof_data is None or oof_data.empty or not any("prediction" in c for c in oof_data.columns): logger.warning(f"Model '{m.get_name()}' has invalid or missing out-of-folds prediction data. Excluding from correlation.") continue names.append(m.get_name()) oofs.append(oof_data) valid_models_indices.append(i) # Store original index if valid logger.debug(f"Got valid OOF data for model '{m.get_name()}'.") except AttributeError: logger.warning(f"Model '{m.get_name()}' seems to be missing 'get_out_of_folds' method or it failed. Excluding from correlation.") continue except Exception as oof_e: logger.warning(f"Failed to get OOF data for model '{m.get_name()}': {oof_e}. Excluding from correlation.") continue num_valid_models = len(names) if num_valid_models < 2: logger.warning(f"Fewer than 2 models ({num_valid_models}) have valid OOF data for correlation. Skipping plot generation.") return logger.info(f"Calculating correlations for {num_valid_models} models.") corrs = np.ones((num_valid_models, num_valid_models)) # Use num_valid_models dimension for i in range(num_valid_models): for j in range(i + 1, num_valid_models): correlation_value = AutoMLPlots.correlation(oofs[i], oofs[j]) # Fill with NaN if correlation calculation failed corrs[i, j] = corrs[j, i] = correlation_value if not np.isnan(correlation_value) else np.nan # Check if all correlations are NaN if np.isnan(corrs[np.triu_indices(num_valid_models, k=1)]).all(): logger.warning("All pairwise model correlations resulted in NaN. 
Cannot generate heatmap.") return logger.info("Generating model correlation heatmap.") figsize = (15, 15) if num_valid_models > 15 else (10, 10) # Adjusted threshold fig, ax = plt.subplots(1, 1, figsize=figsize) image = ax.imshow( corrs, interpolation="nearest", cmap=plt.cm.get_cmap("Blues"), aspect="auto", vmin=np.nanmin(corrs), # Use nanmin/nanmax to handle potential NaNs vmax=np.nanmax(corrs) ) plt.colorbar(mappable=image) x_tick_marks = np.arange(num_valid_models) y_tick_marks = np.arange(num_valid_models) ax.set_xticks(x_tick_marks) ax.set_xticklabels(names, rotation=90) ax.set_yticks(y_tick_marks) ax.set_yticklabels(names) ax.set_title("Spearman Correlation of Models' OOF Predictions") # Slightly more descriptive title plt.tight_layout(pad=2.0) # --- Saving the Plot --- os.makedirs(results_path, exist_ok=True) # Ensure directory exists plot_path = os.path.join( results_path, AutoMLPlots.correlation_heatmap_fname ) plt.savefig(plot_path) logger.info(f"Saved model correlation heatmap to: {plot_path}") plt.close("all") # Close plot to free memory except Exception as e: # Log the exception with traceback logger.error(f"An error occurred during model correlation plotting: {e}") logger.error(traceback.format_exc()) # Ensure plot is closed if error occurred during saving/closing plt.close("all") ```