This is page 2 of 19. Use http://codebase.md/mljar/mljar-supervised?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ └── workflows │ ├── run-tests.yml │ ├── test-installation-with-conda.yml │ └── test-installation-with-pip-on-windows.yml ├── .gitignore ├── CITATION ├── examples │ ├── notebooks │ │ ├── basic_run.ipynb │ │ └── Titanic.ipynb │ └── scripts │ ├── binary_classifier_adult_fairness.py │ ├── binary_classifier_ensemble.py │ ├── binary_classifier_marketing.py │ ├── binary_classifier_random.py │ ├── binary_classifier_Titanic.py │ ├── binary_classifier.py │ ├── multi_class_classifier_digits.py │ ├── multi_class_classifier_MNIST.py │ ├── multi_class_classifier.py │ ├── multi_class_drug_fairness.py │ ├── regression_acs_fairness.py │ ├── regression_crime_fairness.py │ ├── regression_housing_fairness.py │ ├── regression_law_school_fairness.py │ ├── regression.py │ └── tabular_mar_2021.py ├── LICENSE ├── MANIFEST.in ├── pytest.ini ├── README.md ├── requirements_dev.txt ├── requirements.txt ├── setup.py ├── supervised │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── baseline.py │ │ ├── catboost.py │ │ ├── decision_tree.py │ │ ├── extra_trees.py │ │ ├── factory.py │ │ ├── knn.py │ │ ├── lightgbm.py │ │ ├── linear.py │ │ ├── nn.py │ │ ├── random_forest.py │ │ ├── registry.py │ │ ├── sklearn.py │ │ └── xgboost.py │ ├── automl.py │ ├── base_automl.py │ ├── callbacks │ │ ├── __init__.py │ │ ├── callback_list.py │ │ ├── callback.py │ │ ├── early_stopping.py │ │ ├── learner_time_constraint.py │ │ ├── max_iters_constraint.py │ │ ├── metric_logger.py │ │ ├── terminate_on_nan.py │ │ └── total_time_constraint.py │ ├── ensemble.py │ ├── exceptions.py │ ├── fairness │ │ ├── __init__.py │ │ ├── metrics.py │ │ ├── optimization.py │ │ ├── plots.py │ │ ├── report.py │ │ └── utils.py │ ├── model_framework.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── datetime_transformer.py │ │ ├── encoding_selector.py │ │ ├── exclude_missing_target.py │ │ ├── goldenfeatures_transformer.py │ │ ├── kmeans_transformer.py │ │ ├── label_binarizer.py │ │ ├── label_encoder.py │ │ ├── preprocessing_categorical.py │ │ ├── preprocessing_missing.py │ │ ├── preprocessing_utils.py │ │ ├── preprocessing.py │ │ ├── scale.py │ │ └── text_transformer.py │ ├── tuner │ │ ├── __init__.py │ │ ├── data_info.py │ │ ├── hill_climbing.py │ │ ├── mljar_tuner.py │ │ ├── optuna │ │ │ ├── __init__.py │ │ │ ├── catboost.py │ │ │ ├── extra_trees.py │ │ │ ├── knn.py │ │ │ ├── lightgbm.py │ │ │ ├── nn.py │ │ │ ├── random_forest.py │ │ │ ├── tuner.py │ │ │ └── xgboost.py │ │ ├── preprocessing_tuner.py │ │ ├── random_parameters.py │ │ └── time_controller.py │ ├── utils │ │ ├── __init__.py │ │ ├── additional_metrics.py │ │ ├── additional_plots.py │ │ ├── automl_plots.py │ │ ├── common.py │ │ ├── config.py │ │ ├── constants.py │ │ ├── data_validation.py │ │ ├── importance.py │ │ ├── jsonencoder.py │ │ ├── leaderboard_plots.py │ │ ├── learning_curves.py │ │ ├── metric.py │ │ ├── shap.py │ │ ├── subsample.py │ │ └── utils.py │ └── validation │ ├── __init__.py │ ├── validation_step.py │ ├── validator_base.py │ ├── validator_custom.py │ ├── validator_kfold.py │ └── validator_split.py └── tests ├── __init__.py ├── checks │ ├── __init__.py │ ├── check_automl_with_regression.py │ ├── run_ml_tests.py │ └── run_performance_tests.py ├── conftest.py ├── data │ ├── 179.csv │ ├── 24.csv │ ├── 3.csv │ ├── 31.csv │ ├── 38.csv │ ├── 44.csv │ ├── 720.csv │ ├── 737.csv │ ├── acs_income_1k.csv │ ├── 
adult_missing_values_missing_target_500rows.csv │ ├── boston_housing.csv │ ├── CrimeData │ │ ├── cities.json │ │ ├── crimedata.csv │ │ └── README.md │ ├── Drug │ │ ├── Drug_Consumption.csv │ │ └── README.md │ ├── housing_regression_missing_values_missing_target.csv │ ├── iris_classes_missing_values_missing_target.csv │ ├── iris_missing_values_missing_target.csv │ ├── LawSchool │ │ ├── bar_pass_prediction.csv │ │ └── README.md │ ├── PortugeseBankMarketing │ │ └── Data_FinalProject.csv │ └── Titanic │ ├── test_with_Survived.csv │ └── train.csv ├── README.md ├── tests_algorithms │ ├── __init__.py │ ├── test_baseline.py │ ├── test_catboost.py │ ├── test_decision_tree.py │ ├── test_extra_trees.py │ ├── test_factory.py │ ├── test_knn.py │ ├── test_lightgbm.py │ ├── test_linear.py │ ├── test_nn.py │ ├── test_random_forest.py │ ├── test_registry.py │ └── test_xgboost.py ├── tests_automl │ ├── __init__.py │ ├── test_adjust_validation.py │ ├── test_automl_init.py │ ├── test_automl_report.py │ ├── test_automl_sample_weight.py │ ├── test_automl_time_constraints.py │ ├── test_automl.py │ ├── test_data_types.py │ ├── test_dir_change.py │ ├── test_explain_levels.py │ ├── test_golden_features.py │ ├── test_handle_imbalance.py │ ├── test_integration.py │ ├── test_joblib_version.py │ ├── test_models_needed_for_predict.py │ ├── test_prediction_after_load.py │ ├── test_repeated_validation.py │ ├── test_restore.py │ ├── test_stack_models_constraints.py │ ├── test_targets.py │ └── test_update_errors_report.py ├── tests_callbacks │ ├── __init__.py │ └── test_total_time_constraint.py ├── tests_ensemble │ ├── __init__.py │ └── test_save_load.py ├── tests_fairness │ ├── __init__.py │ ├── test_binary_classification.py │ ├── test_multi_class_classification.py │ └── test_regression.py ├── tests_preprocessing │ ├── __init__.py │ ├── disable_eda.py │ ├── test_categorical_integers.py │ ├── test_datetime_transformer.py │ ├── test_encoding_selector.py │ ├── test_exclude_missing.py │ ├── test_goldenfeatures_transformer.py │ ├── test_label_binarizer.py │ ├── test_label_encoder.py │ ├── test_preprocessing_missing.py │ ├── test_preprocessing_utils.py │ ├── test_preprocessing.py │ ├── test_scale.py │ └── test_text_transformer.py ├── tests_tuner │ ├── __init__.py │ ├── test_hill_climbing.py │ ├── test_time_controller.py │ └── test_tuner.py ├── tests_utils │ ├── __init__.py │ ├── test_compute_additional_metrics.py │ ├── test_importance.py │ ├── test_learning_curves.py │ ├── test_metric.py │ ├── test_shap.py │ └── test_subsample.py └── tests_validation ├── __init__.py ├── test_validator_kfold.py └── test_validator_split.py ``` # Files -------------------------------------------------------------------------------- /supervised/preprocessing/label_binarizer.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | 4 | class LabelBinarizer(object): 5 | def __init__(self): 6 | self._new_columns = [] 7 | self._uniq_values = None 8 | self._old_column = None 9 | self._old_column_dtype = None 10 | 11 | def fit(self, X, column): 12 | self._old_column = column 13 | self._old_column_dtype = str(X[column].dtype) 14 | self._uniq_values = np.unique(X[column].values) 15 | # self._uniq_values = [str(u) for u in self._uniq_values] 16 | 17 | if len(self._uniq_values) == 2: 18 | self._new_columns.append(column + "_" + str(self._uniq_values[1])) 19 | else: 20 | for v in self._uniq_values: 21 | self._new_columns.append(column + "_" + str(v)) 22 | 23 | def transform(self, X, column): 
24 | if len(self._uniq_values) == 2: 25 | X[column + "_" + str(self._uniq_values[1])] = ( 26 | X[column] == self._uniq_values[1] 27 | ).astype(int) 28 | else: 29 | for v in self._uniq_values: 30 | X[column + "_" + str(v)] = (X[column] == v).astype(int) 31 | 32 | X.drop(column, axis=1, inplace=True) 33 | return X 34 | 35 | def inverse_transform(self, X): 36 | if self._old_column is None: 37 | return X 38 | 39 | old_col = (X[self._new_columns[0]] * 0).astype(self._old_column_dtype) 40 | 41 | for unique_value in self._uniq_values: 42 | new_col = f"{self._old_column}_{unique_value}" 43 | if new_col not in self._new_columns: 44 | old_col[:] = unique_value 45 | else: 46 | old_col[X[new_col] == 1] = unique_value 47 | 48 | X[self._old_column] = old_col 49 | X.drop(self._new_columns, axis=1, inplace=True) 50 | return X 51 | 52 | def to_json(self): 53 | self._uniq_values = [str(i) for i in list(self._uniq_values)] 54 | data_json = { 55 | "new_columns": list(self._new_columns), 56 | "unique_values": self._uniq_values, 57 | "old_column": self._old_column, 58 | "old_column_dtype": self._old_column_dtype, 59 | } 60 | 61 | if ( 62 | "True" in self._uniq_values 63 | and "False" in self._uniq_values 64 | and len(self._uniq_values) == 2 65 | ): 66 | self._uniq_values = [False, True] 67 | 68 | return data_json 69 | 70 | def from_json(self, data_json): 71 | self._new_columns = data_json.get("new_columns", None) 72 | self._uniq_values = data_json.get("unique_values", None) 73 | self._old_column = data_json.get("old_column", None) 74 | self._old_column_dtype = data_json.get("old_column_dtype", None) 75 | 76 | if ( 77 | "True" in self._uniq_values 78 | and "False" in self._uniq_values 79 | and len(self._uniq_values) == 2 80 | ): 81 | self._uniq_values = [False, True] 82 | ``` -------------------------------------------------------------------------------- /tests/data/iris_classes_missing_values_missing_target.csv: -------------------------------------------------------------------------------- ``` 1 | feature_1,feature_2,feature_3,feature_4,class 2 | 5.1,3.5,1.4,0.2,1 3 | 4.9,3.0,1.4,0.2,1 4 | 4.7,3.2,1.3,,1 5 | 4.6,3.1,1.5,,1 6 | 5.0,3.6,1.4,0.2,1 7 | ,3.9,1.7,0.4,1 8 | 4.6,3.4,1.4,0.3,1 9 | 5.0,3.4,1.5,0.2,1 10 | 4.4,,1.4,0.2,1 11 | 4.9,3.1,1.5,0.1,1 12 | 5.4,3.7,1.5,0.2,1 13 | 4.8,3.4,,0.2,1 14 | 4.8,3.0,1.4,0.1,1 15 | 4.3,3.0,1.1,0.1,1 16 | 5.8,4.0,1.2,0.2,1 17 | 5.7,4.4,1.5,0.4,1 18 | 5.4,3.9,1.3,0.4,1 19 | 5.1,3.5,1.4,0.3, 20 | 5.7,3.8,1.7,0.3,1 21 | 5.1,3.8,1.5,0.3,1 22 | 5.4,3.4,1.7,0.2,1 23 | 5.1,3.7,1.5,0.4,1 24 | 4.6,3.6,1.0,0.2,1 25 | 5.1,3.3,1.7,0.5,1 26 | 4.8,3.4,1.9,0.2,1 27 | 5.0,3.0,1.6,0.2,1 28 | 5.0,3.4,1.6,0.4,1 29 | 5.2,3.5,1.5,0.2,1 30 | 5.2,3.4,1.4,0.2,1 31 | 4.7,3.2,1.6,0.2,1 32 | 4.8,3.1,1.6,0.2,1 33 | 5.4,3.4,1.5,0.4,1 34 | 5.2,4.1,1.5,0.1,1 35 | 5.5,4.2,1.4,0.2,1 36 | 4.9,3.1,1.5,0.1,1 37 | 5.0,3.2,1.2,0.2,1 38 | 5.5,3.5,1.3,0.2,1 39 | 4.9,3.1,1.5,0.1,1 40 | 4.4,3.0,1.3,0.2,1 41 | 5.1,3.4,1.5,0.2,1 42 | 5.0,3.5,1.3,0.3,1 43 | 4.5,2.3,1.3,0.3,1 44 | 4.4,3.2,1.3,0.2,1 45 | 5.0,3.5,1.6,0.6,1 46 | 5.1,3.8,1.9,0.4,1 47 | 4.8,3.0,1.4,0.3,1 48 | 5.1,3.8,1.6,0.2,1 49 | 4.6,3.2,1.4,0.2,1 50 | 5.3,3.7,1.5,0.2,1 51 | 5.0,3.3,1.4,0.2,1 52 | 7.0,3.2,4.7,1.4,2 53 | 6.4,3.2,4.5,1.5,2 54 | 6.9,3.1,4.9,1.5, 55 | 5.5,2.3,4.0,1.3,2 56 | 6.5,2.8,4.6,1.5,2 57 | 5.7,2.8,4.5,1.3,2 58 | 6.3,3.3,4.7,1.6,2 59 | 4.9,2.4,3.3,1.0,2 60 | 6.6,2.9,4.6,1.3,2 61 | 5.2,2.7,3.9,1.4,2 62 | 5.0,2.0,3.5,1.0,2 63 | 5.9,3.0,4.2,1.5,2 64 | 6.0,2.2,4.0,1.0,2 65 | 6.1,2.9,4.7,1.4,2 66 | 5.6,2.9,3.6,1.3,2 67 | 6.7,3.1,4.4,1.4,2 
68 | 5.6,3.0,4.5,1.5,2 69 | 5.8,2.7,4.1,1.0,2 70 | 6.2,2.2,4.5,1.5,2 71 | 5.6,2.5,3.9,1.1,2 72 | 5.9,3.2,4.8,1.8,2 73 | 6.1,2.8,4.0,1.3,2 74 | 6.3,2.5,4.9,1.5,2 75 | 6.1,2.8,4.7,1.2,2 76 | 6.4,2.9,4.3,1.3,2 77 | 6.6,3.0,4.4,1.4,2 78 | 6.8,2.8,4.8,1.4,2 79 | 6.7,3.0,5.0,1.7,2 80 | 6.0,2.9,4.5,1.5,2 81 | 5.7,2.6,3.5,1.0,2 82 | 5.5,2.4,3.8,1.1,2 83 | 5.5,2.4,3.7,1.0,2 84 | 5.8,2.7,3.9,1.2,2 85 | 6.0,2.7,5.1,1.6,2 86 | 5.4,3.0,4.5,1.5,2 87 | 6.0,3.4,4.5,1.6,2 88 | 6.7,3.1,4.7,1.5,2 89 | 6.3,2.3,4.4,1.3,2 90 | 5.6,3.0,4.1,1.3,2 91 | 5.5,2.5,4.0,1.3,2 92 | 5.5,2.6,4.4,1.2,2 93 | 6.1,3.0,4.6,1.4,2 94 | 5.8,2.6,4.0,1.2,2 95 | 5.0,2.3,3.3,1.0,2 96 | 5.6,2.7,4.2,1.3,2 97 | 5.7,3.0,4.2,1.2,2 98 | 5.7,2.9,4.2,1.3,2 99 | 6.2,2.9,4.3,1.3,2 100 | 5.1,2.5,3.0,1.1,2 101 | 5.7,2.8,4.1,1.3,2 102 | 6.3,3.3,6.0,2.5,121 103 | 5.8,2.7,5.1,1.9,121 104 | 7.1,3.0,5.9,2.1,121 105 | 6.3,2.9,5.6,1.8,121 106 | 6.5,3.0,5.8,2.2,121 107 | 7.6,3.0,6.6,2.1,121 108 | 4.9,2.5,4.5,1.7,121 109 | 7.3,2.9,6.3,1.8,121 110 | 6.7,2.5,5.8,1.8,121 111 | 7.2,3.6,6.1,2.5,121 112 | 6.5,3.2,5.1,2.0,121 113 | 6.4,2.7,5.3,1.9,121 114 | 6.8,3.0,5.5,2.1,121 115 | 5.7,2.5,5.0,2.0,121 116 | 5.8,2.8,5.1,2.4,121 117 | 6.4,3.2,5.3,2.3,121 118 | 6.5,3.0,5.5,1.8,121 119 | 7.7,3.8,6.7,2.2,121 120 | 7.7,2.6,6.9,2.3,121 121 | 6.0,2.2,5.0,1.5,121 122 | 6.9,3.2,5.7,2.3,121 123 | 5.6,2.8,4.9,2.0,121 124 | 7.7,2.8,6.7,2.0,121 125 | 6.3,2.7,4.9,1.8,121 126 | 6.7,3.3,5.7,2.1,121 127 | 7.2,3.2,6.0,1.8,121 128 | 6.2,2.8,4.8,1.8,121 129 | 6.1,3.0,4.9,1.8,121 130 | 6.4,2.8,5.6,2.1,121 131 | 7.2,3.0,5.8,1.6,121 132 | 7.4,2.8,6.1,1.9,121 133 | 7.9,3.8,6.4,2.0,121 134 | 6.4,2.8,5.6,2.2,121 135 | 6.3,2.8,5.1,1.5,121 136 | 6.1,2.6,5.6,1.4,121 137 | 7.7,3.0,6.1,2.3,121 138 | 6.3,3.4,5.6,2.4,121 139 | 6.4,3.1,5.5,1.8,121 140 | 6.0,3.0,4.8,1.8,121 141 | 6.9,3.1,5.4,2.1,121 142 | 6.7,3.1,5.6,2.4,121 143 | 6.9,3.1,5.1,2.3,121 144 | 5.8,2.7,5.1,1.9,121 145 | 6.8,3.2,5.9,2.3,121 146 | 6.7,3.3,5.7,2.5,121 147 | 6.7,3.0,5.2,2.3,121 148 | 6.3,2.5,5.0,1.9,121 149 | 6.5,3.0,5.2,2.0,121 150 | 6.2,3.4,5.4,2.3,121 151 | 5.9,3.0,5.1,1.8,121 152 | 153 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_knn.py: -------------------------------------------------------------------------------- ```python 1 | import unittest 2 | 3 | import numpy as np 4 | from numpy.testing import assert_almost_equal 5 | from sklearn import datasets 6 | 7 | from supervised.algorithms.knn import KNeighborsAlgorithm, KNeighborsRegressorAlgorithm 8 | from supervised.utils.metric import Metric 9 | 10 | 11 | class KNeighborsRegressorAlgorithmTest(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.X, cls.y = datasets.make_regression( 15 | n_samples=100, 16 | n_features=5, 17 | n_informative=4, 18 | shuffle=False, 19 | random_state=0 20 | ) 21 | 22 | def test_reproduce_fit(self): 23 | metric = Metric({"name": "mse"}) 24 | params = {"seed": 1, "ml_task": "regression"} 25 | prev_loss = None 26 | for _ in range(2): 27 | model = KNeighborsRegressorAlgorithm(params) 28 | model.fit(self.X, self.y) 29 | y_predicted = model.predict(self.X) 30 | loss = metric(self.y, y_predicted) 31 | if prev_loss is not None: 32 | assert_almost_equal(prev_loss, loss) 33 | prev_loss = loss 34 | 35 | 36 | class KNeighborsAlgorithmTest(unittest.TestCase): 37 | @classmethod 38 | def setUpClass(cls): 39 | cls.X, cls.y = datasets.make_classification( 40 | n_samples=100, 41 | n_features=5, 42 | n_informative=4, 43 | n_redundant=1, 44 | n_classes=2, 45 | 
n_clusters_per_class=3, 46 | n_repeated=0, 47 | shuffle=False, 48 | random_state=0, 49 | ) 50 | 51 | def test_reproduce_fit(self): 52 | metric = Metric({"name": "logloss"}) 53 | params = {"seed": 1, "ml_task": "binary_classification"} 54 | prev_loss = None 55 | for _ in range(2): 56 | model = KNeighborsAlgorithm(params) 57 | model.fit(self.X, self.y) 58 | y_predicted = model.predict(self.X) 59 | loss = metric(self.y, y_predicted) 60 | if prev_loss is not None: 61 | assert_almost_equal(prev_loss, loss) 62 | prev_loss = loss 63 | 64 | def test_fit_predict(self): 65 | metric = Metric({"name": "logloss"}) 66 | params = {"ml_task": "binary_classification"} 67 | la = KNeighborsAlgorithm(params) 68 | 69 | la.fit(self.X, self.y) 70 | y_predicted = la.predict(self.X) 71 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 72 | 73 | def test_is_fitted(self): 74 | params = {"ml_task": "binary_classification"} 75 | model = KNeighborsAlgorithm(params) 76 | self.assertFalse(model.is_fitted()) 77 | model.fit(self.X, self.y) 78 | self.assertTrue(model.is_fitted()) 79 | 80 | def test_classes_attribute(self): 81 | params = {"ml_task": "binary_classification"} 82 | model = KNeighborsAlgorithm(params) 83 | model.fit(self.X,self.y) 84 | 85 | try: 86 | classes = model._classes 87 | except AttributeError: 88 | classes = None 89 | 90 | self.assertTrue(np.array_equal(np.unique(self.y), classes)) 91 | ``` -------------------------------------------------------------------------------- /supervised/utils/importance.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | import warnings 4 | 5 | import pandas as pd 6 | from sklearn.inspection import permutation_importance 7 | 8 | from supervised.algorithms.registry import ( 9 | BINARY_CLASSIFICATION, 10 | MULTICLASS_CLASSIFICATION, 11 | ) 12 | from supervised.utils.subsample import subsample 13 | 14 | logger = logging.getLogger(__name__) 15 | from supervised.utils.config import LOG_LEVEL 16 | 17 | logger.setLevel(LOG_LEVEL) 18 | 19 | from sklearn.metrics import log_loss, make_scorer 20 | 21 | 22 | def log_loss_eps(y_true, y_pred): 23 | ll = log_loss(y_true, y_pred) 24 | return ll 25 | 26 | 27 | log_loss_scorer = make_scorer(log_loss_eps, greater_is_better=False, response_method="predict_proba") 28 | 29 | 30 | class PermutationImportance: 31 | @staticmethod 32 | def compute_and_plot( 33 | model, 34 | X_validation, 35 | y_validation, 36 | model_file_path, 37 | learner_name, 38 | metric_name=None, 39 | ml_task=None, 40 | n_jobs=-1, 41 | ): 42 | # for scoring check https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter 43 | if ml_task == BINARY_CLASSIFICATION: 44 | scoring = log_loss_scorer 45 | elif ml_task == MULTICLASS_CLASSIFICATION: 46 | scoring = log_loss_scorer 47 | else: 48 | scoring = "neg_mean_squared_error" 49 | 50 | try: 51 | with warnings.catch_warnings(): 52 | warnings.simplefilter("ignore") 53 | # subsample validation data to speed-up importance computation 54 | # in the case of large number of columns, it can take a lot of time 55 | rows, cols = X_validation.shape 56 | if cols > 5000 and rows > 100: 57 | X_vald, _, y_vald, _ = subsample( 58 | X_validation, y_validation, train_size=100, ml_task=ml_task 59 | ) 60 | elif cols > 50 and rows * cols > 200000 and rows > 1000: 61 | X_vald, _, y_vald, _ = subsample( 62 | X_validation, y_validation, train_size=1000, ml_task=ml_task 63 | ) 64 | else: 65 | X_vald = X_validation 66 | y_vald = y_validation 67 | 68 | 
importance = permutation_importance( 69 | model, 70 | X_vald, 71 | y_vald, 72 | scoring=scoring, 73 | n_jobs=n_jobs, 74 | random_state=12, 75 | n_repeats=5, # default 76 | ) 77 | 78 | sorted_idx = importance["importances_mean"].argsort() 79 | 80 | # save detailed importance 81 | df_imp = pd.DataFrame( 82 | { 83 | "feature": X_vald.columns[sorted_idx], 84 | "mean_importance": importance["importances_mean"][sorted_idx], 85 | } 86 | ) 87 | df_imp.to_csv( 88 | os.path.join(model_file_path, f"{learner_name}_importance.csv"), 89 | index=False, 90 | ) 91 | except Exception as e: 92 | print(str(e)) 93 | print("Problem during computing permutation importance. Skipping ...") 94 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_models_needed_for_predict.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import tempfile 4 | import unittest 5 | 6 | from supervised import AutoML 7 | from supervised.exceptions import AutoMLException 8 | 9 | 10 | class AutoMLModelsNeededForPredictTest(unittest.TestCase): 11 | # models_needed_on_predict 12 | 13 | def test_models_needed_on_predict(self): 14 | with tempfile.TemporaryDirectory() as tmpdir: 15 | params = { 16 | "saved": [ 17 | "model_1", 18 | "model_2", 19 | "model_3", 20 | "unused_model", 21 | "Ensemble", 22 | "model_4_Stacked", 23 | "Stacked_Ensemble", 24 | ], 25 | "stacked": ["Ensemble", "model_1", "model_2"], 26 | } 27 | with open(os.path.join(tmpdir, "params.json"), "w") as fout: 28 | fout.write(json.dumps(params)) 29 | os.mkdir(os.path.join(tmpdir, "Ensemble")) 30 | with open(os.path.join(tmpdir, "Ensemble", "ensemble.json"), "w") as fout: 31 | params = { 32 | "selected_models": [ 33 | {"model": "model_2"}, 34 | {"model": "model_3"}, 35 | ] 36 | } 37 | fout.write(json.dumps(params)) 38 | os.mkdir(os.path.join(tmpdir, "Stacked_Ensemble")) 39 | with open( 40 | os.path.join(tmpdir, "Stacked_Ensemble", "ensemble.json"), "w" 41 | ) as fout: 42 | params = { 43 | "selected_models": [ 44 | {"model": "Ensemble"}, 45 | {"model": "model_4_Stacked"}, 46 | ] 47 | } 48 | fout.write(json.dumps(params)) 49 | 50 | automl = AutoML(results_path=tmpdir) 51 | with self.assertRaises(AutoMLException) as context: 52 | l = automl.models_needed_on_predict("missing_model") 53 | l = automl.models_needed_on_predict("model_1") 54 | self.assertTrue("model_1" in l) 55 | self.assertTrue(len(l) == 1) 56 | l = automl.models_needed_on_predict("model_3") 57 | self.assertTrue("model_3" in l) 58 | self.assertTrue(len(l) == 1) 59 | l = automl.models_needed_on_predict("Ensemble") 60 | self.assertTrue("model_2" in l) 61 | self.assertTrue("model_3" in l) 62 | self.assertTrue("Ensemble" in l) 63 | self.assertTrue(len(l) == 3) 64 | l = automl.models_needed_on_predict("model_4_Stacked") 65 | self.assertTrue("model_1" in l) 66 | self.assertTrue("model_2" in l) 67 | self.assertTrue("model_3" in l) 68 | self.assertTrue("Ensemble" in l) 69 | self.assertTrue("model_4_Stacked" in l) 70 | self.assertTrue(len(l) == 5) 71 | l = automl.models_needed_on_predict("Stacked_Ensemble") 72 | self.assertTrue("model_1" in l) 73 | self.assertTrue("model_2" in l) 74 | self.assertTrue("model_3" in l) 75 | self.assertTrue("Ensemble" in l) 76 | self.assertTrue("model_4_Stacked" in l) 77 | self.assertTrue("Stacked_Ensemble" in l) 78 | self.assertTrue(len(l) == 6) 79 | ``` -------------------------------------------------------------------------------- 
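Editorial sketch (not part of the repository dump): the test above builds `params.json` and the ensemble descriptors by hand; on a real run the same dependency lookup is available from a fitted `AutoML` object. The results path, the 60-second budget, and the assumption that an `Ensemble` model gets built within that budget are illustrative only.

```python
from sklearn.datasets import make_classification

from supervised import AutoML

X, y = make_classification(n_samples=200, n_features=5, random_state=0)

automl = AutoML(
    results_path="automl_demo",  # illustrative output directory
    mode="Compete",              # mode that enables ensembling and stacking
    total_time_limit=60,
)
automl.fit(X, y)

# List every model directory needed to run predict() with the "Ensemble"
# model: its selected base models plus the ensemble itself (assuming an
# Ensemble was produced within the time limit).
print(automl.models_needed_on_predict("Ensemble"))
```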
/tests/tests_automl/test_golden_features.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import shutil 4 | import unittest 5 | 6 | import pandas as pd 7 | from sklearn import datasets 8 | 9 | from supervised import AutoML 10 | 11 | 12 | class AutoMLGoldenFeaturesTest(unittest.TestCase): 13 | automl_dir = "automl_tests" 14 | rows = 50 15 | 16 | def tearDown(self): 17 | shutil.rmtree(self.automl_dir, ignore_errors=True) 18 | 19 | def test_no_golden_features(self): 20 | N_COLS = 10 21 | X, y = datasets.make_classification( 22 | n_samples=100, 23 | n_features=N_COLS, 24 | n_informative=6, 25 | n_redundant=1, 26 | n_classes=2, 27 | n_clusters_per_class=3, 28 | n_repeated=0, 29 | shuffle=False, 30 | random_state=0, 31 | ) 32 | 33 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 34 | 35 | automl = AutoML( 36 | results_path=self.automl_dir, 37 | total_time_limit=50, 38 | algorithms=["Xgboost"], 39 | train_ensemble=False, 40 | golden_features=False, 41 | explain_level=0, 42 | start_random_models=1, 43 | ) 44 | automl.fit(X, y) 45 | 46 | self.assertEqual(len(automl._models), 1) 47 | 48 | def test_golden_features(self): 49 | N_COLS = 10 50 | X, y = datasets.make_classification( 51 | n_samples=100, 52 | n_features=N_COLS, 53 | n_informative=6, 54 | n_redundant=1, 55 | n_classes=2, 56 | n_clusters_per_class=3, 57 | n_repeated=0, 58 | shuffle=False, 59 | random_state=0, 60 | ) 61 | 62 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 63 | 64 | automl = AutoML( 65 | results_path=self.automl_dir, 66 | total_time_limit=50, 67 | algorithms=["Xgboost"], 68 | train_ensemble=False, 69 | golden_features=True, 70 | explain_level=0, 71 | start_random_models=1, 72 | ) 73 | automl.fit(X, y) 74 | 75 | self.assertEqual(len(automl._models), 2) 76 | 77 | # there should be 10 golden features 78 | with open(os.path.join(self.automl_dir, "golden_features.json")) as fin: 79 | d = json.loads(fin.read()) 80 | self.assertEqual(len(d["new_features"]), 10) 81 | 82 | def test_golden_features_count(self): 83 | N_COLS = 10 84 | X, y = datasets.make_classification( 85 | n_samples=100, 86 | n_features=N_COLS, 87 | n_informative=6, 88 | n_redundant=1, 89 | n_classes=2, 90 | n_clusters_per_class=3, 91 | n_repeated=0, 92 | shuffle=False, 93 | random_state=0, 94 | ) 95 | 96 | X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])]) 97 | 98 | automl = AutoML( 99 | results_path=self.automl_dir, 100 | total_time_limit=50, 101 | algorithms=["Xgboost"], 102 | train_ensemble=False, 103 | golden_features=50, 104 | explain_level=0, 105 | start_random_models=1, 106 | ) 107 | automl.fit(X, y) 108 | 109 | self.assertEqual(len(automl._models), 2) 110 | 111 | # there should be 50 golden features 112 | with open(os.path.join(self.automl_dir, "golden_features.json")) as fin: 113 | d = json.loads(fin.read()) 114 | self.assertEqual(len(d["new_features"]), 50) 115 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_sample_weight.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | 10 | iris = datasets.load_iris() 11 | housing = datasets.fetch_california_housing() 12 | # limit data size for faster tests 13 | housing.data = 
housing.data[:500] 14 | housing.target = housing.target[:500] 15 | breast_cancer = datasets.load_breast_cancer() 16 | 17 | 18 | class AutoMLSampleWeightTest(unittest.TestCase): 19 | automl_dir = "AutoMLSampleWeightTest" 20 | 21 | def tearDown(self): 22 | shutil.rmtree(self.automl_dir, ignore_errors=True) 23 | 24 | def test_iris_dataset_sample_weight(self): 25 | """Tests AutoML in the iris dataset (Multiclass classification) 26 | without and with sample weight""" 27 | model = AutoML( 28 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 29 | ) 30 | score_1 = model.fit(iris.data, iris.target).score(iris.data, iris.target) 31 | self.assertGreater(score_1, 0.5) 32 | 33 | shutil.rmtree(self.automl_dir, ignore_errors=True) 34 | model = AutoML( 35 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 36 | ) 37 | sample_weight = np.ones(iris.data.shape[0]) 38 | score_2 = model.fit(iris.data, iris.target, sample_weight=sample_weight).score( 39 | iris.data, iris.target, sample_weight=sample_weight 40 | ) 41 | assert_almost_equal(score_1, score_2) 42 | 43 | def test_housing_dataset(self): 44 | """Tests AutoML in the housing dataset (Regression) 45 | without and with sample weight""" 46 | model = AutoML( 47 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 48 | ) 49 | score_1 = model.fit(housing.data, housing.target).score( 50 | housing.data, housing.target 51 | ) 52 | self.assertGreater(score_1, 0.5) 53 | 54 | shutil.rmtree(self.automl_dir, ignore_errors=True) 55 | model = AutoML( 56 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 57 | ) 58 | sample_weight = np.ones(housing.data.shape[0]) 59 | score_2 = model.fit( 60 | housing.data, housing.target, sample_weight=sample_weight 61 | ).score(housing.data, housing.target, sample_weight=sample_weight) 62 | assert_almost_equal(score_1, score_2) 63 | 64 | def test_breast_cancer_dataset(self): 65 | """Tests AutoML in the breast cancer (binary classification) 66 | without and with sample weight""" 67 | model = AutoML( 68 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 69 | ) 70 | score_1 = model.fit(breast_cancer.data, breast_cancer.target).score( 71 | breast_cancer.data, breast_cancer.target 72 | ) 73 | self.assertGreater(score_1, 0.5) 74 | 75 | shutil.rmtree(self.automl_dir, ignore_errors=True) 76 | model = AutoML( 77 | explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir 78 | ) 79 | sample_weight = np.ones(breast_cancer.data.shape[0]) 80 | score_2 = model.fit( 81 | breast_cancer.data, breast_cancer.target, sample_weight=sample_weight 82 | ).score(breast_cancer.data, breast_cancer.target, sample_weight=sample_weight) 83 | assert_almost_equal(score_1, score_2) 84 | ``` -------------------------------------------------------------------------------- /supervised/callbacks/total_time_constraint.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import time 3 | 4 | import numpy as np 5 | 6 | from supervised.callbacks.callback import Callback 7 | from supervised.exceptions import NotTrainedException 8 | from supervised.utils.config import LOG_LEVEL 9 | 10 | log = logging.getLogger(__name__) 11 | log.setLevel(LOG_LEVEL) 12 | 13 | 14 | class TotalTimeConstraint(Callback): 15 | def __init__(self, params={}): 16 | super(TotalTimeConstraint, self).__init__(params) 17 | self.name = params.get("name", "total_time_constraint") 18 | self.total_time_limit = 
params.get("total_time_limit") 19 | self.total_time_start = params.get("total_time_start") 20 | self.expected_learners_cnt = params.get("expected_learners_cnt", 1) 21 | 22 | def on_learner_train_start(self, logs): 23 | self.train_start_time = time.time() 24 | 25 | def on_learner_train_end(self, logs): 26 | if ( 27 | self.total_time_limit is not None 28 | and len(self.learners) == 1 29 | and self.expected_learners_cnt > 1 30 | # just check for the first learner 31 | # need to have more than 1 learner 32 | # otherwise it is a finish of the training 33 | ): 34 | one_fold_time = time.time() - self.train_start_time 35 | estimate_all_folds = one_fold_time * self.expected_learners_cnt 36 | 37 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 38 | 39 | # we need to add time for the rest of learners (assuming that all folds training time is the same) 40 | estimate_elapsed_time = total_elapsed_time + one_fold_time * ( 41 | self.expected_learners_cnt - 1 42 | ) 43 | 44 | if estimate_elapsed_time >= self.total_time_limit: 45 | raise NotTrainedException( 46 | "Stop training after the first fold. " 47 | f"Time needed to train on the first fold {np.round(one_fold_time)} seconds. " 48 | "The time estimate for training on all folds is larger than total_time_limit." 49 | ) 50 | if ( 51 | self.total_time_limit is not None 52 | and len(self.learners) < self.expected_learners_cnt 53 | # dont stop for last learner, we are finishing anyway 54 | ): 55 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 56 | 57 | if total_elapsed_time > self.total_time_limit + 600: 58 | # add 10 minutes of margin 59 | # margin is added because of unexpected time changes 60 | # if training on each fold will be the same 61 | # then the training will be stopped after first fold (above condition) 62 | raise NotTrainedException( 63 | "Force to stop the training. " 64 | "Total time for AutoML training already exceeded." 65 | ) 66 | 67 | def on_iteration_end(self, logs, predictions): 68 | total_elapsed_time = np.round(time.time() - self.total_time_start, 2) 69 | 70 | if self.total_time_limit is not None: 71 | log.debug( 72 | f"Total elapsed time {total_elapsed_time} seconds. " 73 | + f"Time left {np.round(self.total_time_limit - total_elapsed_time, 2)} seconds." 
74 | ) 75 | # not time left, stop now 76 | if total_elapsed_time >= self.total_time_limit: 77 | self.learner.stop_training = True 78 | else: 79 | log.debug(f"Total elapsed time {total_elapsed_time} seconds") 80 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_repeated_validation.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import shutil 3 | import unittest 4 | 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | from supervised.algorithms.random_forest import additional 10 | from supervised.utils.common import construct_learner_name 11 | 12 | additional["max_steps"] = 1 13 | additional["trees_in_step"] = 1 14 | 15 | from supervised.algorithms.xgboost import additional 16 | 17 | additional["max_rounds"] = 1 18 | 19 | 20 | class AutoMLRepeatedValidationTest(unittest.TestCase): 21 | automl_dir = "AutoMLRepeatedValidationTest" 22 | 23 | def tearDown(self): 24 | shutil.rmtree(self.automl_dir, ignore_errors=True) 25 | 26 | def test_repeated_kfold(self): 27 | REPEATS = 3 28 | FOLDS = 2 29 | 30 | a = AutoML( 31 | results_path=self.automl_dir, 32 | total_time_limit=10, 33 | algorithms=["Random Forest"], 34 | train_ensemble=False, 35 | validation_strategy={ 36 | "validation_type": "kfold", 37 | "k_folds": FOLDS, 38 | "repeats": REPEATS, 39 | "shuffle": True, 40 | "stratify": True, 41 | }, 42 | start_random_models=1, 43 | ) 44 | 45 | X, y = datasets.make_classification( 46 | n_samples=100, 47 | n_features=5, 48 | n_informative=4, 49 | n_redundant=1, 50 | n_classes=2, 51 | n_clusters_per_class=3, 52 | n_repeated=0, 53 | shuffle=False, 54 | random_state=0, 55 | ) 56 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 57 | 58 | a.fit(X, y) 59 | 60 | result_files = os.listdir( 61 | os.path.join(self.automl_dir, "1_Default_RandomForest") 62 | ) 63 | 64 | cnt = 0 65 | for repeat in range(REPEATS): 66 | for fold in range(FOLDS): 67 | learner_name = construct_learner_name(fold, repeat, REPEATS) 68 | self.assertTrue(f"{learner_name}.random_forest" in result_files) 69 | self.assertTrue(f"{learner_name}_training.log" in result_files) 70 | cnt += 1 71 | self.assertTrue(cnt, 6) 72 | 73 | def test_repeated_split(self): 74 | REPEATS = 3 75 | FOLDS = 1 76 | 77 | a = AutoML( 78 | results_path=self.automl_dir, 79 | total_time_limit=10, 80 | algorithms=["Random Forest"], 81 | train_ensemble=False, 82 | validation_strategy={ 83 | "validation_type": "split", 84 | "repeats": REPEATS, 85 | "shuffle": True, 86 | "stratify": True, 87 | }, 88 | start_random_models=1, 89 | ) 90 | 91 | X, y = datasets.make_classification( 92 | n_samples=100, 93 | n_features=5, 94 | n_informative=4, 95 | n_redundant=1, 96 | n_classes=2, 97 | n_clusters_per_class=3, 98 | n_repeated=0, 99 | shuffle=False, 100 | random_state=0, 101 | ) 102 | X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])]) 103 | 104 | a.fit(X, y) 105 | 106 | result_files = os.listdir( 107 | os.path.join(self.automl_dir, "1_Default_RandomForest") 108 | ) 109 | cnt = 0 110 | for repeat in range(REPEATS): 111 | for fold in range(FOLDS): 112 | learner_name = construct_learner_name(fold, repeat, REPEATS) 113 | self.assertTrue(f"{learner_name}.random_forest" in result_files) 114 | self.assertTrue(f"{learner_name}_training.log" in result_files) 115 | cnt += 1 116 | self.assertTrue(cnt, 3) 117 | ``` -------------------------------------------------------------------------------- 
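Editorial sketch (not part of the repository dump): the repeated-validation setup exercised by the test above maps directly onto the public `AutoML` API. A minimal example with an illustrative dataset and time budget; the `validation_strategy` keys are the same ones used in the test:

```python
import pandas as pd
from sklearn import datasets

from supervised import AutoML

X, y = datasets.make_classification(n_samples=200, n_features=5, random_state=0)
X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

automl = AutoML(
    results_path="automl_repeated_cv",  # illustrative output directory
    total_time_limit=60,
    validation_strategy={
        "validation_type": "kfold",
        "k_folds": 5,
        "repeats": 3,   # every fold is re-trained 3 times on reshuffled splits
        "shuffle": True,
        "stratify": True,
    },
)
automl.fit(X, y)
```

Each model directory then contains one learner file and one training log per (fold, repeat) pair, which is exactly what the assertions above iterate over.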
/supervised/preprocessing/datetime_transformer.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class DateTimeTransformer(object): 6 | def __init__(self): 7 | self._new_columns = [] 8 | self._old_column = None 9 | self._min_datetime = None 10 | self._transforms = [] 11 | 12 | def fit(self, X, column): 13 | self._old_column = column 14 | self._min_datetime = np.min(X[column]) 15 | 16 | values = X[column].dt.year 17 | if len(np.unique(values)) > 1: 18 | self._transforms += ["year"] 19 | new_column = column + "_Year" 20 | self._new_columns += [new_column] 21 | 22 | values = X[column].dt.month 23 | if len(np.unique(values)) > 1: 24 | self._transforms += ["month"] 25 | new_column = column + "_Month" 26 | self._new_columns += [new_column] 27 | 28 | values = X[column].dt.day 29 | if len(np.unique(values)) > 1: 30 | self._transforms += ["day"] 31 | new_column = column + "_Day" 32 | self._new_columns += [new_column] 33 | 34 | values = X[column].dt.weekday 35 | if len(np.unique(values)) > 1: 36 | self._transforms += ["weekday"] 37 | new_column = column + "_WeekDay" 38 | self._new_columns += [new_column] 39 | 40 | values = X[column].dt.dayofyear 41 | if len(np.unique(values)) > 1: 42 | self._transforms += ["dayofyear"] 43 | new_column = column + "_DayOfYear" 44 | self._new_columns += [new_column] 45 | 46 | values = X[column].dt.hour 47 | if len(np.unique(values)) > 1: 48 | self._transforms += ["hour"] 49 | new_column = column + "_Hour" 50 | self._new_columns += [new_column] 51 | 52 | values = (X[column] - self._min_datetime).dt.days 53 | if len(np.unique(values)) > 1: 54 | self._transforms += ["days_diff"] 55 | new_column = column + "_Days_Diff_To_Min" 56 | self._new_columns += [new_column] 57 | 58 | def transform(self, X): 59 | column = self._old_column 60 | 61 | if "year" in self._transforms: 62 | new_column = column + "_Year" 63 | X[new_column] = X[column].dt.year 64 | 65 | if "month" in self._transforms: 66 | new_column = column + "_Month" 67 | X[new_column] = X[column].dt.month 68 | 69 | if "day" in self._transforms: 70 | new_column = column + "_Day" 71 | X[new_column] = X[column].dt.day 72 | 73 | if "weekday" in self._transforms: 74 | new_column = column + "_WeekDay" 75 | X[new_column] = X[column].dt.weekday 76 | 77 | if "dayofyear" in self._transforms: 78 | new_column = column + "_DayOfYear" 79 | X[new_column] = X[column].dt.dayofyear 80 | 81 | if "hour" in self._transforms: 82 | new_column = column + "_Hour" 83 | X[new_column] = X[column].dt.hour 84 | 85 | if "days_diff" in self._transforms: 86 | new_column = column + "_Days_Diff_To_Min" 87 | X[new_column] = (X[column] - self._min_datetime).dt.days 88 | 89 | X.drop(column, axis=1, inplace=True) 90 | return X 91 | 92 | def to_json(self): 93 | data_json = { 94 | "new_columns": list(self._new_columns), 95 | "old_column": self._old_column, 96 | "min_datetime": str(self._min_datetime), 97 | "transforms": list(self._transforms), 98 | } 99 | return data_json 100 | 101 | def from_json(self, data_json): 102 | self._new_columns = data_json.get("new_columns", None) 103 | self._old_column = data_json.get("old_column", None) 104 | d = data_json.get("min_datetime", None) 105 | self._min_datetime = None if d is None else pd.to_datetime(d) 106 | self._transforms = data_json.get("transforms", []) 107 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_linear.py: 
-------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.linear import LinearAlgorithm, LinearRegressorAlgorithm 9 | from supervised.utils.metric import Metric 10 | 11 | 12 | class LinearRegressorAlgorithmTest(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | cls.X, cls.y = datasets.make_regression( 16 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 17 | ) 18 | 19 | def test_reproduce_fit(self): 20 | metric = Metric({"name": "mse"}) 21 | params = {"seed": 1, "ml_task": "regression"} 22 | prev_loss = None 23 | for _ in range(3): 24 | model = LinearRegressorAlgorithm(params) 25 | model.fit(self.X, self.y) 26 | y_predicted = model.predict(self.X) 27 | loss = metric(self.y, y_predicted) 28 | if prev_loss is not None: 29 | assert_almost_equal(prev_loss, loss) 30 | prev_loss = loss 31 | 32 | 33 | class LinearAlgorithmTest(unittest.TestCase): 34 | @classmethod 35 | def setUpClass(cls): 36 | cls.X, cls.y = datasets.make_classification( 37 | n_samples=100, 38 | n_features=5, 39 | n_informative=4, 40 | n_redundant=1, 41 | n_classes=2, 42 | n_clusters_per_class=3, 43 | n_repeated=0, 44 | shuffle=False, 45 | random_state=0, 46 | ) 47 | 48 | def test_reproduce_fit(self): 49 | metric = Metric({"name": "logloss"}) 50 | params = {"seed": 1, "ml_task": "binary_classification"} 51 | prev_loss = None 52 | for _ in range(3): 53 | model = LinearAlgorithm(params) 54 | model.fit(self.X, self.y) 55 | y_predicted = model.predict(self.X) 56 | loss = metric(self.y, y_predicted) 57 | if prev_loss is not None: 58 | assert_almost_equal(prev_loss, loss) 59 | prev_loss = loss 60 | 61 | def test_fit_predict(self): 62 | metric = Metric({"name": "logloss"}) 63 | params = {"ml_task": "binary_classification"} 64 | la = LinearAlgorithm(params) 65 | 66 | la.fit(self.X, self.y) 67 | y_predicted = la.predict(self.X) 68 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 69 | 70 | def test_copy(self): 71 | metric = Metric({"name": "logloss"}) 72 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 73 | model.fit(self.X, self.y) 74 | y_predicted = model.predict(self.X) 75 | loss = metric(self.y, y_predicted) 76 | 77 | model2 = LinearAlgorithm({}) 78 | model2 = model.copy() 79 | self.assertEqual(type(model), type(model2)) 80 | y_predicted = model2.predict(self.X) 81 | loss2 = metric(self.y, y_predicted) 82 | assert_almost_equal(loss, loss2) 83 | 84 | def test_save_and_load(self): 85 | metric = Metric({"name": "logloss"}) 86 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 87 | model.fit(self.X, self.y) 88 | y_predicted = model.predict(self.X) 89 | loss = metric(self.y, y_predicted) 90 | 91 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 92 | 93 | model.save(filename) 94 | model2 = LinearAlgorithm({"ml_task": "binary_classification"}) 95 | model2.load(filename) 96 | # Finished with the file, delete it 97 | os.remove(filename) 98 | 99 | y_predicted = model2.predict(self.X) 100 | loss2 = metric(self.y, y_predicted) 101 | assert_almost_equal(loss, loss2) 102 | 103 | def test_is_fitted(self): 104 | model = LinearAlgorithm({"ml_task": "binary_classification"}) 105 | self.assertFalse(model.is_fitted()) 106 | model.fit(self.X, self.y) 107 | self.assertTrue(model.is_fitted()) 108 | ``` 
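Editorial sketch (not part of the repository dump): a quick usage illustration of the `DateTimeTransformer` listed a little earlier on this page. The column name and timestamps are invented for the example; the transformer keeps only the datetime components that actually vary in the fitted data:

```python
import pandas as pd

from supervised.preprocessing.datetime_transformer import DateTimeTransformer

df = pd.DataFrame(
    {"when": pd.to_datetime(["2021-03-01 10:00", "2021-06-15 18:30", "2022-01-05 07:45"])}
)

dt = DateTimeTransformer()
dt.fit(df, "when")      # records which components (year, month, hour, ...) vary
df = dt.transform(df)   # adds e.g. when_Year, when_Month, when_Hour and drops "when"

state = dt.to_json()    # plain dict with the fitted state, as persisted by the framework
restored = DateTimeTransformer()
restored.from_json(state)  # restored.transform() now reproduces the same columns
```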
-------------------------------------------------------------------------------- /supervised/algorithms/knn.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | 3 | import sklearn 4 | from sklearn.base import ClassifierMixin, RegressorMixin 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor 7 | 8 | from supervised.algorithms.registry import ( 9 | BINARY_CLASSIFICATION, 10 | MULTICLASS_CLASSIFICATION, 11 | REGRESSION, 12 | AlgorithmsRegistry, 13 | ) 14 | from supervised.algorithms.sklearn import SklearnAlgorithm 15 | from supervised.utils.config import LOG_LEVEL 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(LOG_LEVEL) 19 | 20 | 21 | KNN_ROWS_LIMIT = 1000 22 | 23 | 24 | class KNNFit(SklearnAlgorithm): 25 | def file_extension(self): 26 | return "k_neighbors" 27 | 28 | def is_fitted(self): 29 | return ( 30 | hasattr(self.model, "n_samples_fit_") 31 | and self.model.n_samples_fit_ is not None 32 | and self.model.n_samples_fit_ > 0 33 | ) 34 | 35 | def fit( 36 | self, 37 | X, 38 | y, 39 | sample_weight=None, 40 | X_validation=None, 41 | y_validation=None, 42 | sample_weight_validation=None, 43 | log_to_file=None, 44 | max_time=None, 45 | ): 46 | rows_limit = self.params.get("rows_limit", KNN_ROWS_LIMIT) 47 | if X.shape[0] > rows_limit: 48 | X1, _, y1, _ = train_test_split( 49 | X, y, train_size=rows_limit, stratify=y, random_state=1234 50 | ) 51 | self.model.fit(X1, y1) 52 | else: 53 | self.model.fit(X, y) 54 | 55 | @property 56 | def _classes(self): 57 | # Returns the unique classes based on the fitted model 58 | if hasattr(self.model, "classes_"): 59 | return self.model.classes_ 60 | else: 61 | return None 62 | 63 | 64 | class KNeighborsAlgorithm(ClassifierMixin, KNNFit): 65 | algorithm_name = "k-Nearest Neighbors" 66 | algorithm_short_name = "Nearest Neighbors" 67 | 68 | def __init__(self, params): 69 | super(KNeighborsAlgorithm, self).__init__(params) 70 | logger.debug("KNeighborsAlgorithm.__init__") 71 | self.library_version = sklearn.__version__ 72 | self.max_iters = 1 73 | self.model = KNeighborsClassifier( 74 | n_neighbors=params.get("n_neighbors", 3), 75 | weights=params.get("weights", "uniform"), 76 | algorithm="kd_tree", 77 | n_jobs=params.get("n_jobs", -1), 78 | ) 79 | 80 | 81 | class KNeighborsRegressorAlgorithm(RegressorMixin, KNNFit): 82 | algorithm_name = "k-Nearest Neighbors" 83 | algorithm_short_name = "Nearest Neighbors" 84 | 85 | def __init__(self, params): 86 | super(KNeighborsRegressorAlgorithm, self).__init__(params) 87 | logger.debug("KNeighborsRegressorAlgorithm.__init__") 88 | self.library_version = sklearn.__version__ 89 | self.max_iters = 1 90 | self.model = KNeighborsRegressor( 91 | n_neighbors=params.get("n_neighbors", 3), 92 | weights=params.get("weights", "uniform"), 93 | algorithm="ball_tree", 94 | n_jobs=params.get("n_jobs", -1), 95 | ) 96 | 97 | 98 | knn_params = {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]} 99 | 100 | default_params = {"n_neighbors": 5, "weights": "uniform"} 101 | 102 | additional = {"max_rows_limit": 100000, "max_cols_limit": 100} 103 | 104 | required_preprocessing = [ 105 | "missing_values_inputation", 106 | "convert_categorical", 107 | "datetime_transform", 108 | "text_transform", 109 | "scale", 110 | "target_as_integer", 111 | ] 112 | 113 | AlgorithmsRegistry.add( 114 | BINARY_CLASSIFICATION, 115 | KNeighborsAlgorithm, 116 | knn_params, 117 | 
required_preprocessing, 118 | additional, 119 | default_params, 120 | ) 121 | AlgorithmsRegistry.add( 122 | MULTICLASS_CLASSIFICATION, 123 | KNeighborsAlgorithm, 124 | knn_params, 125 | required_preprocessing, 126 | additional, 127 | default_params, 128 | ) 129 | 130 | AlgorithmsRegistry.add( 131 | REGRESSION, 132 | KNeighborsRegressorAlgorithm, 133 | knn_params, 134 | required_preprocessing, 135 | additional, 136 | default_params, 137 | ) 138 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_automl_time_constraints.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import time 3 | import unittest 4 | 5 | from supervised import AutoML 6 | from supervised.tuner.time_controller import TimeController 7 | 8 | 9 | class AutoMLTimeConstraintsTest(unittest.TestCase): 10 | automl_dir = "automl_tests" 11 | 12 | def tearDown(self): 13 | shutil.rmtree(self.automl_dir, ignore_errors=True) 14 | 15 | def test_set_total_time_limit(self): 16 | model_type = "Xgboost" 17 | automl = AutoML( 18 | results_path=self.automl_dir, total_time_limit=100, algorithms=[model_type] 19 | ) 20 | 21 | automl._time_ctrl = TimeController( 22 | time.time(), 100, None, ["simple_algorithms", "not_so_random"], "Xgboost" 23 | ) 24 | 25 | time_spend = 0 26 | for i in range(12): 27 | automl._start_time -= 10 28 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 29 | if automl._time_ctrl.enough_time(model_type, "not_so_random"): 30 | time_spend += 10 31 | 32 | self.assertTrue(time_spend < 100) 33 | 34 | def test_set_model_time_limit(self): 35 | model_type = "Xgboost" 36 | automl = AutoML( 37 | results_path=self.automl_dir, model_time_limit=10, algorithms=[model_type] 38 | ) 39 | automl._time_ctrl = TimeController( 40 | time.time(), None, 10, ["simple_algorithms", "not_so_random"], "Xgboost" 41 | ) 42 | 43 | for i in range(12): 44 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 45 | # should be always true 46 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 47 | 48 | def test_set_model_time_limit_omit_total_time(self): 49 | model_type = "Xgboost" 50 | automl = AutoML( 51 | results_path=self.automl_dir, 52 | total_time_limit=10, 53 | model_time_limit=10, 54 | algorithms=[model_type], 55 | ) 56 | automl._time_ctrl = TimeController( 57 | time.time(), 10, 10, ["simple_algorithms", "not_so_random"], "Xgboost" 58 | ) 59 | 60 | for i in range(12): 61 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 10) 62 | # should be always true 63 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 64 | 65 | def test_enough_time_to_train(self): 66 | model_type = "Xgboost" 67 | model_type_2 = "LightGBM" 68 | 69 | model_type = "Xgboost" 70 | automl = AutoML( 71 | results_path=self.automl_dir, 72 | total_time_limit=10, 73 | model_time_limit=10, 74 | algorithms=[model_type, model_type_2], 75 | ) 76 | automl._time_ctrl = TimeController( 77 | time.time(), 78 | 10, 79 | 10, 80 | ["simple_algorithms", "not_so_random"], 81 | [model_type, model_type_2], 82 | ) 83 | 84 | for i in range(5): 85 | automl._time_ctrl.log_time(f"Xgboost_{i}", model_type, "not_so_random", 1) 86 | # should be always true 87 | self.assertTrue(automl._time_ctrl.enough_time(model_type, "not_so_random")) 88 | 89 | for i in range(5): 90 | automl._time_ctrl.log_time( 91 | f"LightGBM_{i}", model_type_2, "not_so_random", 1 92 | ) 93 | # 
should be always true 94 | self.assertTrue( 95 | automl._time_ctrl.enough_time(model_type_2, "not_so_random") 96 | ) 97 | 98 | def test_expected_learners_cnt(self): 99 | automl = AutoML(results_path=self.automl_dir) 100 | automl._validation_strategy = {"k_folds": 7, "repeats": 6} 101 | self.assertEqual(automl._expected_learners_cnt(), 42) 102 | 103 | automl._validation_strategy = {"k_folds": 7} 104 | self.assertEqual(automl._expected_learners_cnt(), 7) 105 | automl._validation_strategy = {} 106 | self.assertEqual(automl._expected_learners_cnt(), 1) 107 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing_missing.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 5 | 6 | 7 | class PreprocessingMissingValues(object): 8 | FILL_NA_MIN = "na_fill_min_1" 9 | FILL_NA_MEAN = "na_fill_mean" 10 | FILL_NA_MEDIAN = "na_fill_median" 11 | FILL_DATETIME = "na_fill_datetime" 12 | 13 | NA_EXCLUDE = "na_exclude" 14 | MISSING_VALUE = "_missing_value_" 15 | REMOVE_COLUMN = "remove_column" 16 | 17 | def __init__(self, columns=[], na_fill_method=FILL_NA_MEDIAN): 18 | self._columns = columns 19 | # fill method 20 | self._na_fill_method = na_fill_method 21 | # fill parameters stored as a dict, feature -> fill value 22 | self._na_fill_params = {} 23 | self._datetime_columns = [] 24 | 25 | def fit(self, X): 26 | X = self._fit_na_fill(X) 27 | 28 | def _fit_na_fill(self, X): 29 | for column in self._columns: 30 | if np.sum(pd.isnull(X[column]) == True) == 0: 31 | continue 32 | self._na_fill_params[column] = self._get_fill_value(X[column]) 33 | if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME: 34 | self._datetime_columns += [column] 35 | 36 | def _get_fill_value(self, x): 37 | # categorical type 38 | if PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL: 39 | if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN: 40 | return ( 41 | PreprocessingMissingValues.MISSING_VALUE 42 | ) # add new categorical value 43 | return PreprocessingUtils.get_most_frequent(x) 44 | # datetime 45 | if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME: 46 | return PreprocessingUtils.get_most_frequent(x) 47 | # text 48 | if PreprocessingUtils.get_type(x) == PreprocessingUtils.TEXT: 49 | return PreprocessingMissingValues.MISSING_VALUE 50 | 51 | # numerical type 52 | if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN: 53 | return PreprocessingUtils.get_min(x) - 1.0 54 | if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MEAN: 55 | return PreprocessingUtils.get_mean(x) 56 | return PreprocessingUtils.get_median(x) 57 | 58 | def transform(self, X): 59 | X = self._transform_na_fill(X) 60 | # this is additional run through columns, 61 | # in case of transforming data with new columns with missing values 62 | # X = self._make_sure_na_filled(X) # disbaled for now 63 | return X 64 | 65 | def _transform_na_fill(self, X): 66 | for column, value in self._na_fill_params.items(): 67 | ind = pd.isnull(X.loc[:, column]) 68 | X.loc[ind, column] = value 69 | return X 70 | 71 | def _make_sure_na_filled(self, X): 72 | self._fit_na_fill(X) 73 | return self._transform_na_fill(X) 74 | 75 | def to_json(self): 76 | # prepare json with all parameters 77 | if len(self._na_fill_params) == 0: 78 | return {} 79 | params = { 80 | 
"fill_method": self._na_fill_method, 81 | "fill_params": self._na_fill_params, 82 | "datetime_columns": list(self._datetime_columns), 83 | } 84 | for col in self._datetime_columns: 85 | params["fill_params"][col] = str(params["fill_params"][col]) 86 | return params 87 | 88 | def from_json(self, params): 89 | if params is not None: 90 | self._na_fill_method = params.get("fill_method", None) 91 | self._na_fill_params = params.get("fill_params", {}) 92 | self._datetime_columns = params.get("datetime_columns", []) 93 | for col in self._datetime_columns: 94 | self._na_fill_params[col] = pd.to_datetime(self._na_fill_params[col]) 95 | else: 96 | self._na_fill_method, self._na_fill_params = None, None 97 | self._datetime_columns = [] 98 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/scale.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | from sklearn import preprocessing 3 | 4 | 5 | class Scale(object): 6 | SCALE_NORMAL = "scale_normal" 7 | SCALE_LOG_AND_NORMAL = "scale_log_and_normal" 8 | 9 | def __init__(self, columns=[], scale_method=SCALE_NORMAL): 10 | self.scale_method = scale_method 11 | self.columns = columns 12 | self.scale = preprocessing.StandardScaler( 13 | copy=True, with_mean=True, with_std=True 14 | ) 15 | self.X_min_values = None # it is used in SCALE_LOG_AND_NORMAL 16 | 17 | def fit(self, X): 18 | if len(self.columns): 19 | for c in self.columns: 20 | X[c] = X[c].astype(float) 21 | 22 | if self.scale_method == self.SCALE_NORMAL: 23 | self.scale.fit(X[self.columns]) 24 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 25 | self.X_min_values = np.min(X[self.columns], axis=0) 26 | self.scale.fit(np.log(X[self.columns] - self.X_min_values + 1)) 27 | 28 | def transform(self, X): 29 | if len(self.columns): 30 | for c in self.columns: 31 | X[c] = X[c].astype(float) 32 | if self.scale_method == self.SCALE_NORMAL: 33 | X.loc[:, self.columns] = self.scale.transform(X[self.columns]) 34 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 35 | X[self.columns] = np.log( 36 | np.clip( 37 | X[self.columns] - self.X_min_values + 1, a_min=1, a_max=None 38 | ) 39 | ) 40 | X.loc[:, self.columns] = self.scale.transform(X[self.columns]) 41 | return X 42 | 43 | def inverse_transform(self, X): 44 | if len(self.columns): 45 | if self.scale_method == self.SCALE_NORMAL: 46 | X.loc[:, self.columns] = self.scale.inverse_transform(X[self.columns]) 47 | elif self.scale_method == self.SCALE_LOG_AND_NORMAL: 48 | X[self.columns] = X[self.columns].astype("float64") 49 | 50 | X[self.columns] = self.scale.inverse_transform(X[self.columns]) 51 | X[self.columns] = np.exp(X[self.columns]) 52 | 53 | X.loc[:, self.columns] += self.X_min_values - 1 54 | return X 55 | 56 | def to_json(self): 57 | if len(self.columns) == 0: 58 | return None 59 | data_json = { 60 | "scale": list(self.scale.scale_), 61 | "mean": list(self.scale.mean_), 62 | "var": list(self.scale.var_), 63 | "n_samples_seen": int(self.scale.n_samples_seen_), 64 | "n_features_in": int(self.scale.n_features_in_), 65 | "columns": self.columns, 66 | "scale_method": self.scale_method, 67 | } 68 | if self.X_min_values is not None: 69 | data_json["X_min_values"] = list(self.X_min_values) 70 | return data_json 71 | 72 | def from_json(self, data_json): 73 | self.scale = preprocessing.StandardScaler( 74 | copy=True, with_mean=True, with_std=True 75 | ) 76 | self.scale.scale_ = data_json.get("scale") 77 | if self.scale.scale_ is 
not None: 78 | self.scale.scale_ = np.array(self.scale.scale_) 79 | self.scale.mean_ = data_json.get("mean") 80 | if self.scale.mean_ is not None: 81 | self.scale.mean_ = np.array(self.scale.mean_) 82 | self.scale.var_ = data_json.get("var") 83 | if self.scale.var_ is not None: 84 | self.scale.var_ = np.array(self.scale.var_) 85 | self.scale.n_samples_seen_ = int(data_json.get("n_samples_seen")) 86 | self.scale.n_features_in_ = int(data_json.get("n_features_in")) 87 | self.columns = data_json.get("columns", []) 88 | self.scale.feature_names_in_ = data_json.get("columns") 89 | self.scale_method = data_json.get("scale_method") 90 | self.X_min_values = data_json.get("X_min_values") 91 | if self.X_min_values is not None: 92 | self.X_min_values = np.array(self.X_min_values) 93 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/kmeans_transformer.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import time 3 | 4 | import joblib 5 | import numpy as np 6 | from sklearn.cluster import MiniBatchKMeans 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | from supervised.exceptions import AutoMLException 10 | 11 | 12 | class KMeansTransformer(object): 13 | def __init__(self, results_path=None, model_name=None, k_fold=None): 14 | self._new_features = [] 15 | self._input_columns = [] 16 | self._error = None 17 | self._kmeans = None 18 | self._scale = None 19 | self._model_name = model_name 20 | self._k_fold = k_fold 21 | 22 | if results_path is not None: 23 | self._result_file = os.path.join( 24 | self._model_name, f"kmeans_fold_{k_fold}.joblib" 25 | ) 26 | self._result_path = os.path.join(results_path, self._result_file) 27 | # self.try_load() 28 | 29 | def fit(self, X, y): 30 | if self._new_features: 31 | return 32 | if self._error is not None and self._error: 33 | raise AutoMLException( 34 | "KMeans Features not created due to error (please check errors.md). " 35 | + self._error 36 | ) 37 | return 38 | if X.shape[1] == 0: 39 | self._error = f"KMeans not created. No continous features. Input data shape: {X.shape}, {y.shape}" 40 | raise AutoMLException("KMeans Features not created. No continous features.") 41 | 42 | start_time = time.time() 43 | 44 | n_clusters = int(np.log10(X.shape[0]) * 8) 45 | n_clusters = max(8, n_clusters) 46 | n_clusters = min(n_clusters, X.shape[1]) 47 | 48 | self._input_columns = X.columns.tolist() 49 | # scale data 50 | self._scale = StandardScaler(copy=True, with_mean=True, with_std=True) 51 | X = self._scale.fit_transform(X) 52 | 53 | # Kmeans 54 | self._kmeans = kmeans = MiniBatchKMeans(n_clusters=n_clusters, init="k-means++") 55 | self._kmeans.fit(X) 56 | self._create_new_features_names() 57 | 58 | # print( 59 | # f"Created {len(self._new_features)} KMeans Features in {np.round(time.time() - start_time,2)} seconds." 
60 | # ) 61 | 62 | def _create_new_features_names(self): 63 | n_clusters = self._kmeans.cluster_centers_.shape[0] 64 | self._new_features = [f"Dist_Cluster_{i}" for i in range(n_clusters)] 65 | self._new_features += ["Cluster"] 66 | 67 | def transform(self, X): 68 | if self._kmeans is None: 69 | raise AutoMLException("KMeans not fitted") 70 | 71 | # scale 72 | X_scaled = self._scale.transform(X[self._input_columns]) 73 | 74 | # kmeans 75 | distances = self._kmeans.transform(X_scaled) 76 | clusters = self._kmeans.predict(X_scaled) 77 | 78 | X[self._new_features[:-1]] = distances 79 | X[self._new_features[-1]] = clusters 80 | 81 | return X 82 | 83 | def to_json(self): 84 | self.save() 85 | data_json = { 86 | "new_features": self._new_features, 87 | "result_file": self._result_file, 88 | "input_columns": self._input_columns, 89 | } 90 | if self._error is not None and self._error: 91 | data_json["error"] = self._error 92 | return data_json 93 | 94 | def from_json(self, data_json, results_path): 95 | self._new_features = data_json.get("new_features", []) 96 | self._input_columns = data_json.get("input_columns", []) 97 | self._result_file = data_json.get("result_file") 98 | self._result_path = os.path.join(results_path, self._result_file) 99 | self._error = data_json.get("error") 100 | self.try_load() 101 | 102 | def save(self): 103 | joblib.dump( 104 | {"kmeans": self._kmeans, "scale": self._scale}, 105 | self._result_path, 106 | compress=True, 107 | ) 108 | 109 | def try_load(self): 110 | if os.path.exists(self._result_path): 111 | data = joblib.load(self._result_path) 112 | self._kmeans = data["kmeans"] 113 | self._scale = data["scale"] 114 | 115 | self._create_new_features_names() 116 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_handle_imbalance.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | from supervised.algorithms.random_forest import additional 9 | from supervised.algorithms.registry import MULTICLASS_CLASSIFICATION 10 | 11 | additional["max_steps"] = 1 12 | additional["trees_in_step"] = 1 13 | 14 | from supervised.algorithms.xgboost import additional 15 | 16 | additional["max_rounds"] = 1 17 | 18 | 19 | class AutoMLHandleImbalanceTest(unittest.TestCase): 20 | automl_dir = "AutoMLHandleImbalanceTest" 21 | 22 | def tearDown(self): 23 | shutil.rmtree(self.automl_dir, ignore_errors=True) 24 | 25 | def test_handle_drastic_imbalance(self): 26 | a = AutoML( 27 | results_path=self.automl_dir, 28 | total_time_limit=10, 29 | algorithms=["Random Forest"], 30 | train_ensemble=False, 31 | validation_strategy={ 32 | "validation_type": "kfold", 33 | "k_folds": 10, 34 | "shuffle": True, 35 | "stratify": True, 36 | }, 37 | start_random_models=1, 38 | ) 39 | 40 | rows = 100 41 | X = pd.DataFrame( 42 | { 43 | "f1": np.random.rand(rows), 44 | "f2": np.random.rand(rows), 45 | "f3": np.random.rand(rows), 46 | } 47 | ) 48 | y = np.ones(rows) 49 | 50 | y[:8] = 0 51 | y[10:12] = 2 52 | y = pd.Series(np.array(y), name="target") 53 | a._ml_task = MULTICLASS_CLASSIFICATION 54 | a._handle_drastic_imbalance(X, y) 55 | 56 | self.assertEqual(X.shape[0], 130) 57 | self.assertEqual(X.shape[1], 3) 58 | self.assertEqual(y.shape[0], 130) 59 | 60 | def test_handle_drastic_imbalance_sample_weight(self): 61 | a = AutoML( 62 | results_path=self.automl_dir, 63 | total_time_limit=10, 64 
| algorithms=["Random Forest"], 65 | train_ensemble=False, 66 | validation_strategy={ 67 | "validation_type": "kfold", 68 | "k_folds": 10, 69 | "shuffle": True, 70 | "stratify": True, 71 | }, 72 | start_random_models=1, 73 | ) 74 | 75 | rows = 100 76 | X = pd.DataFrame( 77 | { 78 | "f1": np.random.rand(rows), 79 | "f2": np.random.rand(rows), 80 | "f3": np.random.rand(rows), 81 | } 82 | ) 83 | y = np.ones(rows) 84 | sample_weight = pd.Series(np.array(range(rows)), name="sample_weight") 85 | 86 | y[:1] = 0 87 | y[10:11] = 2 88 | 89 | y = pd.Series(np.array(y), name="target") 90 | a._ml_task = MULTICLASS_CLASSIFICATION 91 | a._handle_drastic_imbalance(X, y, sample_weight) 92 | 93 | self.assertEqual(X.shape[0], 138) 94 | self.assertEqual(X.shape[1], 3) 95 | self.assertEqual(y.shape[0], 138) 96 | 97 | self.assertEqual(np.sum(sample_weight[100:119]), 0) 98 | self.assertEqual(np.sum(sample_weight[119:138]), 19 * 10) 99 | 100 | def test_imbalance_dont_change_data_after_fit(self): 101 | a = AutoML( 102 | results_path=self.automl_dir, 103 | total_time_limit=5, 104 | train_ensemble=False, 105 | validation_strategy={ 106 | "validation_type": "kfold", 107 | "k_folds": 10, 108 | "shuffle": True, 109 | "stratify": True, 110 | }, 111 | start_random_models=1, 112 | explain_level=0, 113 | ) 114 | 115 | rows = 100 116 | X = pd.DataFrame( 117 | { 118 | "f1": np.random.rand(rows), 119 | "f2": np.random.rand(rows), 120 | "f3": np.random.rand(rows), 121 | } 122 | ) 123 | y = np.ones(rows) 124 | 125 | y[:8] = 0 126 | y[10:12] = 2 127 | sample_weight = np.ones(rows) 128 | 129 | a.fit(X, y, sample_weight=sample_weight) 130 | 131 | # original data **without** inserted samples to handle imbalance 132 | self.assertEqual(X.shape[0], rows) 133 | self.assertEqual(y.shape[0], rows) 134 | self.assertEqual(sample_weight.shape[0], rows) 135 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_random_forest.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.random_forest import ( 9 | RandomForestAlgorithm, 10 | RandomForestRegressorAlgorithm, 11 | additional, 12 | regression_additional, 13 | ) 14 | from supervised.utils.metric import Metric 15 | 16 | additional["trees_in_step"] = 1 17 | regression_additional["trees_in_step"] = 1 18 | additional["max_steps"] = 1 19 | regression_additional["max_steps"] = 1 20 | 21 | 22 | class RandomForestRegressorAlgorithmTest(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.X, cls.y = datasets.make_regression( 26 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 27 | ) 28 | 29 | def test_reproduce_fit(self): 30 | metric = Metric({"name": "mse"}) 31 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"} 32 | prev_loss = None 33 | for _ in range(3): 34 | model = RandomForestRegressorAlgorithm(params) 35 | model.fit(self.X, self.y) 36 | y_predicted = model.predict(self.X) 37 | loss = metric(self.y, y_predicted) 38 | if prev_loss is not None: 39 | assert_almost_equal(prev_loss, loss) 40 | prev_loss = loss 41 | 42 | 43 | class RandomForestAlgorithmTest(unittest.TestCase): 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.X, cls.y = datasets.make_classification( 47 | n_samples=100, 48 | n_features=5, 49 | n_informative=4, 50 | n_redundant=1, 51 | 
n_classes=2, 52 | n_clusters_per_class=3, 53 | n_repeated=0, 54 | shuffle=False, 55 | random_state=0, 56 | ) 57 | 58 | def test_reproduce_fit(self): 59 | metric = Metric({"name": "logloss"}) 60 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"} 61 | prev_loss = None 62 | for _ in range(3): 63 | model = RandomForestAlgorithm(params) 64 | model.fit(self.X, self.y) 65 | y_predicted = model.predict(self.X) 66 | loss = metric(self.y, y_predicted) 67 | if prev_loss is not None: 68 | assert_almost_equal(prev_loss, loss) 69 | prev_loss = loss 70 | 71 | def test_fit_predict(self): 72 | metric = Metric({"name": "logloss"}) 73 | params = {"ml_task": "binary_classification"} 74 | rf = RandomForestAlgorithm(params) 75 | 76 | rf.fit(self.X, self.y) 77 | y_predicted = rf.predict(self.X) 78 | self.assertTrue(metric(self.y, y_predicted) < 1.5) 79 | 80 | def test_copy(self): 81 | metric = Metric({"name": "logloss"}) 82 | rf = RandomForestAlgorithm({"ml_task": "binary_classification"}) 83 | rf.fit(self.X, self.y) 84 | y_predicted = rf.predict(self.X) 85 | loss = metric(self.y, y_predicted) 86 | 87 | rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"}) 88 | rf2 = rf.copy() 89 | self.assertEqual(type(rf), type(rf2)) 90 | y_predicted = rf2.predict(self.X) 91 | loss2 = metric(self.y, y_predicted) 92 | assert_almost_equal(loss, loss2) 93 | 94 | def test_save_and_load(self): 95 | metric = Metric({"name": "logloss"}) 96 | rf = RandomForestAlgorithm({"ml_task": "binary_classification"}) 97 | rf.fit(self.X, self.y) 98 | y_predicted = rf.predict(self.X) 99 | loss = metric(self.y, y_predicted) 100 | 101 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 102 | 103 | rf.save(filename) 104 | rf2 = RandomForestAlgorithm({"ml_task": "binary_classification"}) 105 | rf2.load(filename) 106 | # Finished with the file, delete it 107 | os.remove(filename) 108 | 109 | y_predicted = rf2.predict(self.X) 110 | loss2 = metric(self.y, y_predicted) 111 | assert_almost_equal(loss, loss2) 112 | 113 | def test_is_fitted(self): 114 | model = RandomForestAlgorithm({"ml_task": "binary_classification"}) 115 | self.assertFalse(model.is_fitted()) 116 | model.fit(self.X, self.y) 117 | self.assertTrue(model.is_fitted()) 118 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_extra_trees.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | 8 | from supervised.algorithms.extra_trees import ( 9 | ExtraTreesAlgorithm, 10 | ExtraTreesRegressorAlgorithm, 11 | additional, 12 | regression_additional, 13 | ) 14 | from supervised.utils.metric import Metric 15 | 16 | additional["trees_in_step"] = 1 17 | regression_additional["trees_in_step"] = 1 18 | additional["max_steps"] = 1 19 | regression_additional["max_steps"] = 1 20 | 21 | 22 | class ExtraTreesRegressorAlgorithmTest(unittest.TestCase): 23 | @classmethod 24 | def setUpClass(cls): 25 | cls.X, cls.y = datasets.make_regression( 26 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 27 | ) 28 | 29 | def test_reproduce_fit(self): 30 | metric = Metric({"name": "mse"}) 31 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "regression"} 32 | prev_loss = None 33 | for _ in range(3): 34 | model = ExtraTreesRegressorAlgorithm(params) 35 | model.fit(self.X, self.y) 36 | 
y_predicted = model.predict(self.X) 37 | loss = metric(self.y, y_predicted) 38 | if prev_loss is not None: 39 | assert_almost_equal(prev_loss, loss) 40 | prev_loss = loss 41 | 42 | 43 | class ExtraTreesAlgorithmTest(unittest.TestCase): 44 | @classmethod 45 | def setUpClass(cls): 46 | cls.X, cls.y = datasets.make_classification( 47 | n_samples=100, 48 | n_features=5, 49 | n_informative=4, 50 | n_redundant=1, 51 | n_classes=2, 52 | n_clusters_per_class=3, 53 | n_repeated=0, 54 | shuffle=False, 55 | random_state=0, 56 | ) 57 | 58 | def test_reproduce_fit(self): 59 | metric = Metric({"name": "logloss"}) 60 | params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"} 61 | prev_loss = None 62 | for _ in range(3): 63 | model = ExtraTreesAlgorithm(params) 64 | model.fit(self.X, self.y) 65 | y_predicted = model.predict(self.X) 66 | loss = metric(self.y, y_predicted) 67 | if prev_loss is not None: 68 | assert_almost_equal(prev_loss, loss) 69 | prev_loss = loss 70 | 71 | def test_fit_predict(self): 72 | metric = Metric({"name": "logloss"}) 73 | params = {"trees_in_step": 50, "ml_task": "binary_classification"} 74 | rf = ExtraTreesAlgorithm(params) 75 | 76 | rf.fit(self.X, self.y) 77 | y_predicted = rf.predict(self.X) 78 | self.assertTrue(metric(self.y, y_predicted) < 0.6) 79 | 80 | def test_copy(self): 81 | metric = Metric({"name": "logloss"}) 82 | rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 83 | rf.fit(self.X, self.y) 84 | y_predicted = rf.predict(self.X) 85 | loss = metric(self.y, y_predicted) 86 | 87 | rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 88 | rf2 = rf.copy() 89 | self.assertEqual(type(rf), type(rf2)) 90 | y_predicted = rf2.predict(self.X) 91 | loss2 = metric(self.y, y_predicted) 92 | assert_almost_equal(loss, loss2) 93 | 94 | def test_save_and_load(self): 95 | metric = Metric({"name": "logloss"}) 96 | rf = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 97 | rf.fit(self.X, self.y) 98 | y_predicted = rf.predict(self.X) 99 | loss = metric(self.y, y_predicted) 100 | 101 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 102 | 103 | rf.save(filename) 104 | rf2 = ExtraTreesAlgorithm({"ml_task": "binary_classification"}) 105 | rf2.load(filename) 106 | # Finished with the file, delete it 107 | os.remove(filename) 108 | 109 | y_predicted = rf2.predict(self.X) 110 | loss2 = metric(self.y, y_predicted) 111 | assert_almost_equal(loss, loss2) 112 | 113 | def test_is_fitted(self): 114 | params = {"trees_in_step": 50, "ml_task": "binary_classification"} 115 | model = ExtraTreesAlgorithm(params) 116 | self.assertFalse(model.is_fitted()) 117 | model.fit(self.X, self.y) 118 | self.assertTrue(model.is_fitted()) 119 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_lightgbm.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from numpy.testing import assert_almost_equal 8 | from sklearn import datasets 9 | 10 | from supervised.algorithms.lightgbm import LightgbmAlgorithm, additional 11 | from supervised.utils.metric import Metric 12 | 13 | additional["max_rounds"] = 1 14 | 15 | 16 | class LightgbmAlgorithmTest(unittest.TestCase): 17 | @classmethod 18 | def setUpClass(cls): 19 | cls.X, cls.y = datasets.make_classification( 20 | n_samples=100, 21 | n_features=5, 22 | n_informative=4, 23 | n_redundant=1, 24 | 
n_classes=2, 25 | n_clusters_per_class=3, 26 | n_repeated=0, 27 | shuffle=False, 28 | random_state=0, 29 | ) 30 | cls.params = { 31 | "metric": "binary_logloss", 32 | "num_leaves": "2", 33 | "learning_rate": 0.1, 34 | "feature_fraction": 0.8, 35 | "bagging_fraction": 0.8, 36 | "bagging_freq": 1, 37 | "seed": 1, 38 | "early_stopping_rounds": 0, 39 | } 40 | 41 | def test_reproduce_fit(self): 42 | metric = Metric({"name": "logloss"}) 43 | prev_loss = None 44 | for i in range(3): 45 | model = LightgbmAlgorithm(self.params) 46 | model.fit(self.X, self.y) 47 | y_predicted = model.predict(self.X) 48 | loss = metric(self.y, y_predicted) 49 | if prev_loss is not None: 50 | assert_almost_equal(prev_loss, loss) 51 | prev_loss = loss 52 | 53 | def test_fit_predict(self): 54 | metric = Metric({"name": "logloss"}) 55 | lgb = LightgbmAlgorithm(self.params) 56 | lgb.fit(self.X, self.y) 57 | y_predicted = lgb.predict(self.X) 58 | loss = metric(self.y, y_predicted) 59 | self.assertTrue(loss < 0.7) 60 | 61 | def test_copy(self): 62 | # train model #1 63 | metric = Metric({"name": "logloss"}) 64 | lgb = LightgbmAlgorithm(self.params) 65 | lgb.fit(self.X, self.y) 66 | y_predicted = lgb.predict(self.X) 67 | loss = metric(self.y, y_predicted) 68 | # create model #2 69 | lgb2 = LightgbmAlgorithm(self.params) 70 | # model #2 is set to None, while initialized 71 | self.assertTrue(lgb2.model is None) 72 | # do a copy and use it for predictions 73 | lgb2 = lgb.copy() 74 | self.assertEqual(type(lgb), type(lgb2)) 75 | y_predicted = lgb2.predict(self.X) 76 | loss2 = metric(self.y, y_predicted) 77 | self.assertEqual(loss, loss2) 78 | 79 | def test_save_and_load(self): 80 | metric = Metric({"name": "logloss"}) 81 | lgb = LightgbmAlgorithm(self.params) 82 | lgb.fit(self.X, self.y) 83 | y_predicted = lgb.predict(self.X) 84 | loss = metric(self.y, y_predicted) 85 | 86 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 87 | lgb.save(filename) 88 | lgb2 = LightgbmAlgorithm({}) 89 | self.assertTrue(lgb.uid != lgb2.uid) 90 | self.assertTrue(lgb2.model is None) 91 | lgb2.load(filename) 92 | # Finished with the file, delete it 93 | os.remove(filename) 94 | 95 | y_predicted = lgb2.predict(self.X) 96 | loss2 = metric(self.y, y_predicted) 97 | assert_almost_equal(loss, loss2) 98 | 99 | def test_get_metric_name(self): 100 | model = LightgbmAlgorithm(self.params) 101 | self.assertEqual(model.get_metric_name(), "logloss") 102 | 103 | def test_restricted_characters_in_feature_name(self): 104 | df = pd.DataFrame( 105 | { 106 | "y": np.random.randint(0, 2, size=100), 107 | "[test1]": np.random.uniform(0, 1, size=100), 108 | "test2 < 1": np.random.uniform(0, 1, size=100), 109 | } 110 | ) 111 | 112 | y = df.iloc[:, 0] 113 | X = df.iloc[:, 1:] 114 | 115 | metric = Metric({"name": "logloss"}) 116 | params = {"objective": "binary:logistic", "eval_metric": "logloss"} 117 | lgb = LightgbmAlgorithm(self.params) 118 | lgb.fit(X, y) 119 | lgb.predict(X) 120 | 121 | def test_is_fitted(self): 122 | model = LightgbmAlgorithm(self.params) 123 | self.assertFalse(model.is_fitted()) 124 | model.fit(self.X, self.y) 125 | self.assertTrue(model.is_fitted()) 126 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing_utils.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | from scipy import stats 4 | from sklearn import preprocessing 5 | 6 | 7 | class 
PreprocessingUtilsException(Exception): 8 | pass 9 | 10 | 11 | class PreprocessingUtils(object): 12 | CATEGORICAL = "categorical" 13 | CONTINOUS = "continous" 14 | DISCRETE = "discrete" 15 | DATETIME = "datetime" 16 | TEXT = "text" 17 | 18 | @staticmethod 19 | def get_type(x): 20 | if len(x.shape) > 1: 21 | if x.shape[1] != 1: 22 | raise PreprocessingUtilsException( 23 | "Please select one column to get its type" 24 | ) 25 | col_type = str(x.dtype) 26 | 27 | data_type = PreprocessingUtils.CATEGORICAL 28 | if col_type.startswith("float"): 29 | data_type = PreprocessingUtils.CONTINOUS 30 | elif col_type.startswith("int") or col_type.startswith("uint"): 31 | data_type = PreprocessingUtils.DISCRETE 32 | elif col_type.startswith("datetime"): 33 | data_type = PreprocessingUtils.DATETIME 34 | elif col_type.startswith("category"): 35 | # do not check the additional condition for text feature 36 | # treat it as categorical 37 | return PreprocessingUtils.CATEGORICAL 38 | 39 | if data_type == PreprocessingUtils.CATEGORICAL: 40 | # check maybe this categorical is a text 41 | # it is a text, if: 42 | # has more than 200 unique values 43 | # more than half of rows is unique 44 | unique_cnt = len(np.unique(x[~pd.isnull(x)])) 45 | if unique_cnt > 200 and unique_cnt > int(0.5 * x.shape[0]): 46 | data_type = PreprocessingUtils.TEXT 47 | 48 | return data_type 49 | 50 | @staticmethod 51 | def is_categorical(x_org): 52 | x = x_org[~pd.isnull(x_org)] 53 | return PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL 54 | 55 | @staticmethod 56 | def is_datetime(x_org): 57 | x = x_org[~pd.isnull(x_org)] 58 | return PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME 59 | 60 | @staticmethod 61 | def is_text(x_org): 62 | x = x_org[~pd.isnull(x_org)] 63 | return PreprocessingUtils.get_type(x) == PreprocessingUtils.TEXT 64 | 65 | @staticmethod 66 | def is_0_1(x_org): 67 | x = x_org[~pd.isnull(x_org)] 68 | u = np.unique(x) 69 | if len(u) != 2: 70 | return False 71 | return 0 in u and 1 in u 72 | 73 | @staticmethod 74 | def num_class(x_org): 75 | x = x_org[~pd.isnull(x_org)] 76 | u = np.unique(x) 77 | return len(u) 78 | 79 | @staticmethod 80 | def is_scale_needed(x_org): 81 | x = x_org[~pd.isnull(x_org)] 82 | abs_avg = np.abs(np.mean(x)) 83 | stddev = np.std(x) 84 | if abs_avg > 0.5 or stddev > 1.5: 85 | return True 86 | return False 87 | 88 | @staticmethod 89 | def is_log_scale_needed(x_org): 90 | x_full = np.array(x_org[~pd.isnull(x_org)]) 91 | # first scale on raw data 92 | x = preprocessing.scale(x_full) 93 | # second scale on log data 94 | x_log = preprocessing.scale(np.log(x_full - np.min(x_full) + 1)) 95 | 96 | # the old approach, let's check how new approach will work 97 | # original_skew = np.abs(stats.skew(x)) 98 | # log_skew = np.abs(stats.skew(x_log)) 99 | # return log_skew < original_skew 100 | ######################################################################## 101 | # p is probability of being normal distributions 102 | k2, p1 = stats.normaltest(x) 103 | k2, p2 = stats.normaltest(x_log) 104 | 105 | return p2 > p1 106 | 107 | @staticmethod 108 | def is_na(x): 109 | return np.sum(pd.isnull(x) == True) > 0 110 | 111 | @staticmethod 112 | def get_most_frequent(x): 113 | a = x.value_counts() 114 | first = sorted(dict(a).items(), key=lambda x: -x[1])[0] 115 | return first[0] 116 | 117 | @staticmethod 118 | def get_min(x): 119 | v = np.amin(np.nanmin(x)) 120 | if pd.isnull(v): 121 | return 0 122 | return float(v) 123 | 124 | @staticmethod 125 | def get_mean(x): 126 | v = np.nanmean(x) 
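        # Editor's note: np.nanmean ignores NaN entries; if the column is entirely
        # NaN the result itself is NaN, which the check below maps to a 0.0 fill value.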
127 | if pd.isnull(v): 128 | return 0 129 | return float(v) 130 | 131 | @staticmethod 132 | def get_median(x): 133 | v = np.nanmedian(x) 134 | if pd.isnull(v): 135 | return 0 136 | return float(v) 137 | ``` -------------------------------------------------------------------------------- /tests/tests_fairness/test_binary_classification.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised import AutoML 8 | 9 | 10 | class FairnessInBinaryClassificationTest(unittest.TestCase): 11 | automl_dir = "automl_fairness_testing" 12 | 13 | def tearDown(self): 14 | shutil.rmtree(self.automl_dir, ignore_errors=True) 15 | 16 | def test_init(self): 17 | X = np.random.uniform(size=(30, 2)) 18 | y = np.random.randint(0, 2, size=(30,)) 19 | S = pd.DataFrame({"sensitive": ["A", "B"] * 15}) 20 | 21 | automl = AutoML( 22 | results_path=self.automl_dir, 23 | model_time_limit=10, 24 | algorithms=["Xgboost"], 25 | explain_level=0, 26 | train_ensemble=False, 27 | stack_models=False, 28 | validation_strategy={"validation_type": "split"}, 29 | start_random_models=1, 30 | ) 31 | 32 | automl.fit(X, y, sensitive_features=S) 33 | 34 | self.assertGreater(len(automl._models), 0) 35 | 36 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 37 | self.assertEqual(len(sensitive_features_names), 1) 38 | self.assertTrue("sensitive" in sensitive_features_names) 39 | 40 | self.assertTrue(automl._models[0].get_fairness_metric("sensitive") is not None) 41 | self.assertTrue(len(automl._models[0].get_fairness_optimization()) > 1) 42 | self.assertTrue(automl._models[0].get_worst_fairness() is not None) 43 | self.assertTrue(automl._models[0].get_best_fairness() is not None) 44 | 45 | def test_arguments(self): 46 | X = np.random.uniform(size=(30, 2)) 47 | y = np.random.randint(0, 2, size=(30,)) 48 | S = pd.DataFrame({"sensitive": ["A", "B"] * 15}) 49 | 50 | automl = AutoML( 51 | results_path=self.automl_dir, 52 | model_time_limit=10, 53 | algorithms=["Xgboost"], 54 | privileged_groups=[{"sensitive": "A"}], 55 | underprivileged_groups=[{"sensitive": "B"}], 56 | fairness_metric="demographic_parity_ratio", 57 | fairness_threshold=0.2, 58 | explain_level=0, 59 | train_ensemble=False, 60 | stack_models=False, 61 | validation_strategy={"validation_type": "split"}, 62 | start_random_models=1, 63 | ) 64 | 65 | automl.fit(X, y, sensitive_features=S) 66 | 67 | self.assertGreater(len(automl._models), 0) 68 | 69 | def test_wrong_metric_name(self): 70 | X = np.random.uniform(size=(30, 2)) 71 | y = np.random.randint(0, 2, size=(30,)) 72 | S = pd.DataFrame({"sensitive": ["A", "B"] * 15}) 73 | 74 | with self.assertRaises(ValueError) as context: 75 | automl = AutoML( 76 | results_path=self.automl_dir, 77 | model_time_limit=10, 78 | algorithms=["Xgboost"], 79 | privileged_groups=[{"sensitive": "A"}], 80 | underprivileged_groups=[{"sensitive": "B"}], 81 | fairness_metric="wrong_metric_name", 82 | fairness_threshold=0.2, 83 | explain_level=0, 84 | train_ensemble=False, 85 | stack_models=False, 86 | validation_strategy={"validation_type": "split"}, 87 | start_random_models=1, 88 | ) 89 | automl.fit(X, y, sensitive_features=S) 90 | self.assertTrue("is not allowed" in str(context.exception)) 91 | 92 | def test_two_sensitive_features(self): 93 | X = np.random.uniform(size=(30, 2)) 94 | y = np.random.randint(0, 2, size=(30,)) 95 | S = pd.DataFrame( 96 | { 97 | "sensitive_1": ["White", 
"Black"] * 15, 98 | "sensitive_2": ["Male", "Female"] * 15, 99 | } 100 | ) 101 | 102 | automl = AutoML( 103 | results_path=self.automl_dir, 104 | model_time_limit=10, 105 | algorithms=["Xgboost"], 106 | explain_level=0, 107 | train_ensemble=False, 108 | stack_models=False, 109 | start_random_models=1, 110 | ) 111 | 112 | automl.fit(X, y, sensitive_features=S) 113 | 114 | self.assertGreater(len(automl._models), 0) 115 | 116 | sensitive_features_names = automl._models[0].get_sensitive_features_names() 117 | self.assertEqual(len(sensitive_features_names), 2) 118 | ``` -------------------------------------------------------------------------------- /supervised/fairness/plots.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | 4 | 5 | class FairnessPlots: 6 | @staticmethod 7 | def binary_classification( 8 | fairness_metric, 9 | col_name, 10 | metrics, 11 | selection_rates, 12 | max_selection_rate, 13 | fairness_threshold, 14 | ): 15 | figures = [] 16 | # selection rate figure 17 | fair_selection_rate = max_selection_rate * fairness_threshold 18 | 19 | fig = plt.figure(figsize=(10, 7)) 20 | ax1 = fig.add_subplot(1, 1, 1) 21 | bars = ax1.bar(metrics.index[1:], metrics["Selection Rate"][1:]) 22 | 23 | ax1.spines[["right", "top", "left"]].set_visible(False) 24 | ax1.yaxis.set_visible(False) 25 | _ = ax1.bar_label(bars, padding=5) 26 | 27 | if fairness_metric == "demographic_parity_ratio": 28 | ax1.axhline(y=fair_selection_rate, zorder=0, color="grey", ls="--", lw=1.5) 29 | _ = ax1.text( 30 | y=fair_selection_rate, 31 | x=-0.6, 32 | s="Fairness threshold", 33 | ha="center", 34 | fontsize=12, 35 | bbox=dict(facecolor="white", edgecolor="grey", ls="--"), 36 | ) 37 | _ = ax1.text( 38 | y=1.2 * fair_selection_rate, 39 | x=-0.6, 40 | s="Fair", 41 | ha="center", 42 | fontsize=12, 43 | ) 44 | _ = ax1.text( 45 | y=0.8 * fair_selection_rate, 46 | x=-0.6, 47 | s="Unfair", 48 | ha="center", 49 | fontsize=12, 50 | ) 51 | 52 | ax1.axhspan( 53 | fairness_threshold * max_selection_rate, 54 | 1.25 * np.max(selection_rates[1:]), 55 | color="green", 56 | alpha=0.05, 57 | ) 58 | ax1.axhspan( 59 | 0, fairness_threshold * max_selection_rate, color="red", alpha=0.05 60 | ) 61 | 62 | figures += [ 63 | { 64 | "title": f"Selection Rate for {col_name}", 65 | "fname": f"selection_rate_{col_name}.png", 66 | "figure": fig, 67 | } 68 | ] 69 | 70 | fig, axes = plt.subplots(figsize=(10, 5), ncols=2, sharey=True) 71 | fig.tight_layout() 72 | bars = axes[0].barh( 73 | metrics.index[1:], 74 | metrics["False Negative Rate"][1:], 75 | zorder=10, 76 | color="tab:orange", 77 | ) 78 | xmax = 1.2 * max( 79 | metrics["False Negative Rate"][1:].max(), 80 | metrics["False Positive Rate"][1:].max(), 81 | ) 82 | axes[0].set_xlim(0, xmax) 83 | axes[0].invert_xaxis() 84 | axes[0].set_title("False Negative Rate") 85 | _ = axes[0].bar_label(bars, padding=5) 86 | 87 | bars = axes[1].barh( 88 | metrics.index[1:], 89 | metrics["False Positive Rate"][1:], 90 | zorder=10, 91 | color="tab:blue", 92 | ) 93 | axes[1].tick_params(axis="y", colors="tab:orange") # tick color 94 | axes[1].set_xlim(0, xmax) 95 | axes[1].set_title("False Positive Rate") 96 | _ = axes[1].bar_label(bars, padding=5) 97 | _ = plt.subplots_adjust(wspace=0, top=0.85, bottom=0.1, left=0.18, right=0.95) 98 | 99 | figures += [ 100 | { 101 | "title": f"False Rates for {col_name}", 102 | "fname": f"false_rates_{col_name}.png", 103 | "figure": fig, 104 | } 105 | ] 106 | 107 | 
return figures 108 | 109 | @staticmethod 110 | def regression(fairness_metric, col_name, metrics, fairness_metric_name): 111 | figures = [] 112 | metric_name = fairness_metric.split("@")[1].upper() 113 | 114 | fig = plt.figure(figsize=(10, 7)) 115 | ax1 = fig.add_subplot(1, 1, 1) 116 | bars = ax1.bar(metrics.index[1:], metrics[metric_name][1:]) 117 | 118 | ax1.spines[["right", "top"]].set_visible(False) 119 | # ax1.yaxis.set_visible(False) 120 | ax1.set_ylabel(metric_name) 121 | _ = ax1.bar_label(bars, padding=5) 122 | 123 | figures += [ 124 | { 125 | "title": f"{metric_name} for {col_name}", 126 | "fname": f"{metric_name}_{col_name}.png", 127 | "figure": fig, 128 | } 129 | ] 130 | 131 | return figures 132 | ``` -------------------------------------------------------------------------------- /supervised/validation/validator_custom.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | 4 | import joblib 5 | import numpy as np 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | from supervised.exceptions import AutoMLException 10 | from supervised.utils.utils import load_data 11 | from supervised.validation.validator_base import BaseValidator 12 | 13 | 14 | class CustomValidator(BaseValidator): 15 | def __init__(self, params): 16 | BaseValidator.__init__(self, params) 17 | 18 | cv_path = self.params.get("cv_path") 19 | 20 | if cv_path is None: 21 | raise AutoMLException("You need to specify `cv` as list or iterable") 22 | 23 | self.cv = joblib.load(cv_path) 24 | self.cv = list(self.cv) 25 | 26 | self._results_path = self.params.get("results_path") 27 | self._X_path = self.params.get("X_path") 28 | self._y_path = self.params.get("y_path") 29 | self._sample_weight_path = self.params.get("sample_weight_path") 30 | self._sensitive_features_path = self.params.get("sensitive_features_path") 31 | 32 | if self._X_path is None or self._y_path is None: 33 | raise AutoMLException("No data path set in CustomValidator params") 34 | 35 | folds_path = os.path.join(self._results_path, "folds") 36 | 37 | if not os.path.exists(folds_path): 38 | os.mkdir(folds_path) 39 | 40 | print("Custom validation strategy") 41 | for fold_cnt, (train_index, validation_index) in enumerate(self.cv): 42 | print(f"Split {fold_cnt}.") 43 | print(f"Train {train_index.shape[0]} samples.") 44 | print(f"Validation {validation_index.shape[0]} samples.") 45 | train_index_file = os.path.join( 46 | self._results_path, 47 | "folds", 48 | f"fold_{fold_cnt}_train_indices.npy", 49 | ) 50 | validation_index_file = os.path.join( 51 | self._results_path, 52 | "folds", 53 | f"fold_{fold_cnt}_validation_indices.npy", 54 | ) 55 | 56 | np.save(train_index_file, train_index) 57 | np.save(validation_index_file, validation_index) 58 | 59 | else: 60 | log.debug("Folds split already done, reuse it") 61 | 62 | def get_split(self, k, repeat=0): 63 | try: 64 | train_index_file = os.path.join( 65 | self._results_path, "folds", f"fold_{k}_train_indices.npy" 66 | ) 67 | validation_index_file = os.path.join( 68 | self._results_path, "folds", f"fold_{k}_validation_indices.npy" 69 | ) 70 | 71 | train_index = np.load(train_index_file) 72 | validation_index = np.load(validation_index_file) 73 | 74 | X = load_data(self._X_path) 75 | y = load_data(self._y_path) 76 | y = y["target"] 77 | 78 | sample_weight = None 79 | if self._sample_weight_path is not None: 80 | sample_weight = load_data(self._sample_weight_path) 81 | sample_weight = sample_weight["sample_weight"] 82 | 83 | 
sensitive_features = None 84 | if self._sensitive_features_path is not None: 85 | sensitive_features = load_data(self._sensitive_features_path) 86 | 87 | train_data = {"X": X.iloc[train_index], "y": y.iloc[train_index]} 88 | validation_data = { 89 | "X": X.iloc[validation_index], 90 | "y": y.iloc[validation_index], 91 | } 92 | if sample_weight is not None: 93 | train_data["sample_weight"] = sample_weight.iloc[train_index] 94 | validation_data["sample_weight"] = sample_weight.iloc[validation_index] 95 | if sensitive_features is not None: 96 | train_data["sensitive_features"] = sensitive_features.iloc[train_index] 97 | validation_data["sensitive_features"] = sensitive_features.iloc[ 98 | validation_index 99 | ] 100 | 101 | except Exception as e: 102 | import traceback 103 | 104 | print(traceback.format_exc()) 105 | raise AutoMLException("Problem with custom validation. " + str(e)) 106 | return (train_data, validation_data) 107 | 108 | def get_n_splits(self): 109 | return len(self.cv) 110 | 111 | def get_repeats(self): 112 | return 1 113 | ``` -------------------------------------------------------------------------------- /tests/tests_automl/test_integration.py: -------------------------------------------------------------------------------- ```python 1 | import shutil 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn import datasets 7 | 8 | from supervised import AutoML 9 | 10 | 11 | class AutoMLIntegrationTest(unittest.TestCase): 12 | automl_dir = "AutoMLIntegrationTest" 13 | 14 | def tearDown(self): 15 | shutil.rmtree(self.automl_dir, ignore_errors=True) 16 | 17 | def test_integration(self): 18 | a = AutoML( 19 | results_path=self.automl_dir, 20 | total_time_limit=1, 21 | explain_level=0, 22 | start_random_models=1, 23 | ) 24 | 25 | X, y = datasets.make_classification( 26 | n_samples=100, 27 | n_features=5, 28 | n_informative=4, 29 | n_redundant=1, 30 | n_classes=2, 31 | n_clusters_per_class=3, 32 | n_repeated=0, 33 | shuffle=False, 34 | random_state=0, 35 | ) 36 | 37 | a.fit(X, y) 38 | p = a.predict(X) 39 | self.assertIsInstance(p, np.ndarray) 40 | self.assertEqual(len(p), X.shape[0]) 41 | 42 | def test_one_column_input_regression(self): 43 | a = AutoML( 44 | results_path=self.automl_dir, 45 | total_time_limit=5, 46 | explain_level=0, 47 | start_random_models=1, 48 | ) 49 | 50 | X, y = datasets.make_regression(n_features=1) 51 | 52 | a.fit(X, y) 53 | p = a.predict(X) 54 | 55 | self.assertIsInstance(p, np.ndarray) 56 | self.assertEqual(len(p), X.shape[0]) 57 | 58 | def test_one_column_input_bin_class(self): 59 | a = AutoML( 60 | results_path=self.automl_dir, 61 | total_time_limit=5, 62 | explain_level=0, 63 | start_random_models=1, 64 | ) 65 | 66 | X = pd.DataFrame({"feature_1": np.random.rand(100)}) 67 | y = (np.random.rand(X.shape[0]) > 0.5).astype(int) 68 | 69 | a.fit(X, y) 70 | p = a.predict(X) 71 | 72 | self.assertIsInstance(p, np.ndarray) 73 | self.assertEqual(len(p), X.shape[0]) 74 | 75 | def test_different_input_types(self): 76 | """Test the different data input types for AutoML""" 77 | model = AutoML( 78 | total_time_limit=10, 79 | explain_level=0, 80 | start_random_models=1, 81 | algorithms=["Linear"], 82 | verbose=0, 83 | ) 84 | X, y = datasets.make_regression() 85 | 86 | # First test - X and y as numpy arrays 87 | 88 | pred = model.fit(X, y).predict(X) 89 | 90 | self.assertIsInstance(pred, np.ndarray) 91 | self.assertEqual(len(pred), X.shape[0]) 92 | 93 | del model 94 | 95 | model = AutoML( 96 | total_time_limit=10, 97 | explain_level=0, 
98 | start_random_models=1, 99 | algorithms=["Linear"], 100 | verbose=0, 101 | ) 102 | # Second test - X and y as pandas dataframe 103 | X_pandas = pd.DataFrame(X) 104 | y_pandas = pd.DataFrame(y) 105 | pred_pandas = model.fit(X_pandas, y_pandas).predict(X_pandas) 106 | 107 | self.assertIsInstance(pred_pandas, np.ndarray) 108 | self.assertEqual(len(pred_pandas), X.shape[0]) 109 | 110 | del model 111 | 112 | model = AutoML( 113 | total_time_limit=10, 114 | explain_level=0, 115 | start_random_models=1, 116 | algorithms=["Linear"], 117 | verbose=0, 118 | ) 119 | # Third test - X and y as lists 120 | X_list = pd.DataFrame(X).values.tolist() 121 | y_list = pd.DataFrame(y).values.tolist() 122 | pred_list = model.fit(X_pandas, y_pandas).predict(X_pandas) 123 | 124 | self.assertIsInstance(pred_list, np.ndarray) 125 | self.assertEqual(len(pred_list), X.shape[0]) 126 | 127 | def test_integration_float16_data(self): 128 | a = AutoML( 129 | results_path=self.automl_dir, 130 | total_time_limit=1, 131 | explain_level=0, 132 | start_random_models=1, 133 | ) 134 | 135 | X, y = datasets.make_classification( 136 | n_samples=100, 137 | n_features=5, 138 | n_informative=4, 139 | n_redundant=1, 140 | n_classes=2, 141 | n_clusters_per_class=3, 142 | n_repeated=0, 143 | shuffle=False, 144 | random_state=0, 145 | ) 146 | X = pd.DataFrame(X) 147 | X = X.astype(float) 148 | a.fit(X, y) 149 | p = a.predict(X) 150 | self.assertIsInstance(p, np.ndarray) 151 | self.assertEqual(len(p), X.shape[0]) 152 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/algorithm.py: -------------------------------------------------------------------------------- ```python 1 | import uuid 2 | 3 | import numpy as np 4 | 5 | from sklearn.base import BaseEstimator 6 | 7 | from supervised.utils.common import construct_learner_name 8 | from supervised.utils.importance import PermutationImportance 9 | from supervised.utils.shap import PlotSHAP 10 | 11 | 12 | class BaseAlgorithm(BaseEstimator): 13 | """ 14 | This is an abstract class. 15 | All algorithms inherit from BaseAlgorithm. 
16 | """ 17 | 18 | algorithm_name = "Unknown" 19 | algorithm_short_name = "Unknown" 20 | 21 | def __init__(self, params): 22 | self.params = params 23 | self.stop_training = False 24 | self.library_version = None 25 | self.model = None 26 | self.uid = params.get("uid", str(uuid.uuid4())) 27 | self.ml_task = params.get("ml_task") 28 | self.model_file_path = None 29 | self.name = "amazing_learner" 30 | 31 | def set_learner_name(self, fold, repeat, repeats): 32 | self.name = construct_learner_name(fold, repeat, repeats) 33 | 34 | def is_fitted(self): 35 | # base class method 36 | return False 37 | 38 | def reload(self): 39 | if not self.is_fitted() and self.model_file_path is not None: 40 | self.load(self.model_file_path) 41 | 42 | def fit( 43 | self, 44 | X, 45 | y, 46 | sample_weight=None, 47 | X_validation=None, 48 | y_validation=None, 49 | sample_weight_validation=None, 50 | log_to_file=None, 51 | max_time=None, 52 | ): 53 | pass 54 | 55 | def predict(self, X): 56 | pass 57 | 58 | # needed for feature importance 59 | def predict_proba(self, X): 60 | y = self.predict(X) 61 | if "num_class" in self.params and self.params["num_class"] > 2: 62 | return y 63 | return np.column_stack((1 - y, y)) 64 | 65 | def update(self, update_params): 66 | pass 67 | 68 | def copy(self): 69 | pass 70 | 71 | def save(self, model_file_path): 72 | pass 73 | 74 | def load(self, model_file_path): 75 | pass 76 | 77 | def get_fname(self): 78 | return f"{self.name}.{self.file_extension()}" 79 | 80 | def interpret( 81 | self, 82 | X_train, 83 | y_train, 84 | X_validation, 85 | y_validation, 86 | model_file_path, 87 | learner_name, 88 | target_name=None, 89 | class_names=None, 90 | metric_name=None, 91 | ml_task=None, 92 | explain_level=2, 93 | ): 94 | # do not produce feature importance for Baseline 95 | if self.algorithm_short_name == "Baseline": 96 | return 97 | if explain_level > 0: 98 | PermutationImportance.compute_and_plot( 99 | self, 100 | X_validation, 101 | y_validation, 102 | model_file_path, 103 | learner_name, 104 | metric_name, 105 | ml_task, 106 | self.params.get("n_jobs", -1), 107 | ) 108 | if explain_level > 1: 109 | PlotSHAP.compute( 110 | self, 111 | X_train, 112 | y_train, 113 | X_validation, 114 | y_validation, 115 | model_file_path, 116 | learner_name, 117 | class_names, 118 | ml_task, 119 | ) 120 | 121 | def get_metric_name(self): 122 | return None 123 | 124 | def get_params(self): 125 | params = { 126 | "library_version": self.library_version, 127 | "algorithm_name": self.algorithm_name, 128 | "algorithm_short_name": self.algorithm_short_name, 129 | "uid": self.uid, 130 | "params": self.params, 131 | "name": self.name, 132 | } 133 | if hasattr(self, "best_ntree_limit") and self.best_ntree_limit is not None: 134 | params["best_ntree_limit"] = self.best_ntree_limit 135 | return params 136 | 137 | def set_params(self, json_desc, learner_path): 138 | self.library_version = json_desc.get("library_version", self.library_version) 139 | self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name) 140 | self.algorithm_short_name = json_desc.get( 141 | "algorithm_short_name", self.algorithm_short_name 142 | ) 143 | self.uid = json_desc.get("uid", self.uid) 144 | self.params = json_desc.get("params", self.params) 145 | self.name = json_desc.get("name", self.name) 146 | self.model_file_path = learner_path 147 | 148 | if hasattr(self, "best_ntree_limit"): 149 | self.best_ntree_limit = json_desc.get( 150 | "best_ntree_limit", self.best_ntree_limit 151 | ) 152 | ``` 
-------------------------------------------------------------------------------- /tests/data/iris_missing_values_missing_target.csv: -------------------------------------------------------------------------------- ``` 1 | feature_1,feature_2,feature_3,feature_4,class 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3.0,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,,Iris-setosa 5 | 4.6,3.1,1.5,,Iris-setosa 6 | 5.0,3.6,1.4,0.2,Iris-setosa 7 | ,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5.0,3.4,1.5,0.2,Iris-setosa 10 | 4.4,,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,,0.2,Iris-setosa 14 | 4.8,3.0,1.4,0.1,Iris-setosa 15 | 4.3,3.0,1.1,0.1,Iris-setosa 16 | 5.8,4.0,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3, 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1.0,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5.0,3.0,1.6,0.2,Iris-setosa 28 | 5.0,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5.0,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3.0,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5.0,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5.0,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3.0,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5.0,3.3,1.4,0.2,Iris-setosa 52 | 7.0,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5, 55 | 5.5,2.3,4.0,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1.0,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5.0,2.0,3.5,1.0,Iris-versicolor 63 | 5.9,3.0,4.2,1.5,Iris-versicolor 64 | 6.0,2.2,4.0,1.0,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3.0,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1.0,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4.0,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3.0,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3.0,5.0,1.7,Iris-versicolor 80 | 6.0,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1.0,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1.0,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6.0,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3.0,4.5,1.5,Iris-versicolor 87 | 6.0,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3.0,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4.0,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3.0,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4.0,1.2,Iris-versicolor 95 | 5.0,2.3,3.3,1.0,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 
5.7,3.0,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3.0,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6.0,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3.0,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3.0,5.8,2.2,Iris-virginica 107 | 7.6,3.0,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2.0,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3.0,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5.0,2.0,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3.0,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6.0,2.2,5.0,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2.0,Iris-virginica 124 | 7.7,2.8,6.7,2.0,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6.0,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3.0,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3.0,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2.0,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3.0,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6.0,3.0,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3.0,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5.0,1.9,Iris-virginica 149 | 6.5,3.0,5.2,2.0,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3.0,5.1,1.8,Iris-virginica 152 | 153 | ``` -------------------------------------------------------------------------------- /supervised/preprocessing/preprocessing_categorical.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from supervised.preprocessing.label_binarizer import LabelBinarizer 5 | from supervised.preprocessing.label_encoder import LabelEncoder 6 | from supervised.preprocessing.preprocessing_utils import PreprocessingUtils 7 | 8 | 9 | class PreprocessingCategorical(object): 10 | CONVERT_ONE_HOT = "categorical_to_onehot" 11 | CONVERT_INTEGER = "categorical_to_int" 12 | 13 | FEW_CATEGORIES = "few_categories" 14 | MANY_CATEGORIES = "many_categories" 15 | 16 | def __init__(self, columns=[], method=CONVERT_INTEGER): 17 | self._convert_method = method 18 | self._convert_params = {} 19 | self._columns = columns 20 | self._enc = None 21 | 22 | def fit(self, X, y=None): 23 | self._fit_categorical_convert(X) 24 | 25 | def _fit_categorical_convert(self, X): 26 | for column in self._columns: 27 | if PreprocessingUtils.get_type(X[column]) != PreprocessingUtils.CATEGORICAL: 28 | # no need to convert, already a number 29 | continue 30 | # limit categories - it is needed when doing one hot encoding 31 | # this code is also used in predict.py file 32 | # and transform_utils.py 33 | # TODO it needs refactoring !!! 
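            # Editor's note: columns with more than 200 distinct values are never
            # one-hot encoded; they fall back to integer encoding below, even when
            # CONVERT_ONE_HOT was requested.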
34 | too_much_categories = len(np.unique(list(X[column].values))) > 200 35 | lbl = None 36 | if ( 37 | self._convert_method == PreprocessingCategorical.CONVERT_ONE_HOT 38 | and not too_much_categories 39 | ): 40 | lbl = LabelBinarizer() 41 | lbl.fit(X, column) 42 | else: 43 | lbl = LabelEncoder() 44 | lbl.fit(X[column]) 45 | 46 | if lbl is not None: 47 | self._convert_params[column] = lbl.to_json() 48 | 49 | def transform(self, X): 50 | for column, lbl_params in self._convert_params.items(): 51 | if "unique_values" in lbl_params and "new_columns" in lbl_params: 52 | # convert to one hot 53 | lbl = LabelBinarizer() 54 | lbl.from_json(lbl_params) 55 | X = lbl.transform(X, column) 56 | else: 57 | # convert to integer 58 | lbl = LabelEncoder() 59 | lbl.from_json(lbl_params) 60 | transformed_values = lbl.transform(X.loc[:, column]) 61 | # check for pandas FutureWarning: Setting an item 62 | # of incompatible dtype is deprecated and will raise 63 | # in a future error of pandas. 64 | if transformed_values.dtype != X.loc[:, column].dtype and \ 65 | (X.loc[:, column].dtype == bool or X.loc[:, column].dtype == int): 66 | X = X.astype({column: transformed_values.dtype}) 67 | if isinstance(X[column].dtype, pd.CategoricalDtype): 68 | X[column] = X[column].astype('object') 69 | X.loc[:, column] = transformed_values 70 | 71 | return X 72 | 73 | def inverse_transform(self, X): 74 | for column, lbl_params in self._convert_params.items(): 75 | if "unique_values" in lbl_params and "new_columns" in lbl_params: 76 | # convert to one hot 77 | lbl = LabelBinarizer() 78 | lbl.from_json(lbl_params) 79 | X = lbl.inverse_transform(X, column) # should raise exception 80 | else: 81 | # convert to integer 82 | lbl = LabelEncoder() 83 | lbl.from_json(lbl_params) 84 | transformed_values = lbl.inverse_transform(X.loc[:, column]) 85 | # check for pandas FutureWarning: Setting an item 86 | # of incompatible dtype is deprecated and will raise 87 | # in a future error of pandas. 
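            # Editor's note: the decoded labels (e.g. the original strings) may not fit
            # the current int/bool column dtype, so the column is up-cast first to avoid
            # the pandas incompatible-dtype assignment warning described above.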
88 | if transformed_values.dtype != X.loc[:, column].dtype and \ 89 | (X.loc[:, column].dtype == bool or X.loc[:, column].dtype == int): 90 | X = X.astype({column: transformed_values.dtype}) 91 | X.loc[:, column] = transformed_values 92 | 93 | return X 94 | 95 | def to_json(self): 96 | params = {} 97 | 98 | if len(self._convert_params) == 0: 99 | return {} 100 | params = { 101 | "convert_method": self._convert_method, 102 | "convert_params": self._convert_params, 103 | "columns": self._columns, 104 | } 105 | return params 106 | 107 | def from_json(self, params): 108 | if params is not None: 109 | self._convert_method = params.get("convert_method", None) 110 | self._columns = params.get("columns", []) 111 | self._convert_params = params.get("convert_params", {}) 112 | 113 | else: 114 | self._convert_method, self._convert_params = None, None 115 | self._columns = [] 116 | ``` -------------------------------------------------------------------------------- /tests/tests_preprocessing/test_label_encoder.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from supervised.preprocessing.label_encoder import LabelEncoder 8 | 9 | 10 | class LabelEncoderTest(unittest.TestCase): 11 | def test_fit(self): 12 | # training data 13 | d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]} 14 | df = pd.DataFrame(data=d) 15 | le = LabelEncoder() 16 | # check first column 17 | le.fit(df["col1"]) 18 | data_json = le.to_json() 19 | # values from column should be in data json 20 | self.assertTrue("a" in data_json) 21 | self.assertTrue("c" in data_json) 22 | self.assertTrue("b" not in data_json) 23 | # there is alphabetical order for values 24 | self.assertEqual(0, data_json["a"]) 25 | self.assertEqual(1, data_json["c"]) 26 | 27 | # check next column 28 | le.fit(df["col2"]) 29 | data_json = le.to_json() 30 | self.assertEqual(0, data_json["d"]) 31 | self.assertEqual(1, data_json["e"]) 32 | self.assertEqual(2, data_json["w"]) 33 | 34 | def test_transform(self): 35 | # training data 36 | d = {"col1": ["a", "a", "c"]} 37 | df = pd.DataFrame(data=d) 38 | # fit encoder 39 | le = LabelEncoder() 40 | le.fit(df["col1"]) 41 | # test data 42 | d_test = {"col2": ["c", "c", "a"]} 43 | df_test = pd.DataFrame(data=d_test) 44 | # transform 45 | y = le.transform(df_test["col2"]) 46 | self.assertEqual(y[0], 1) 47 | self.assertEqual(y[1], 1) 48 | self.assertEqual(y[2], 0) 49 | 50 | def test_transform_with_new_values(self): 51 | # training data 52 | d = {"col1": ["a", "a", "c"]} 53 | df = pd.DataFrame(data=d) 54 | # fit encoder 55 | le = LabelEncoder() 56 | le.fit(df["col1"]) 57 | # test data 58 | d_test = {"col2": ["c", "a", "d", "f"]} 59 | df_test = pd.DataFrame(data=d_test) 60 | # transform 61 | y = le.transform(df_test["col2"]) 62 | self.assertEqual(y[0], 1) 63 | self.assertEqual(y[1], 0) 64 | self.assertEqual(y[2], 2) 65 | self.assertEqual(y[3], 3) 66 | 67 | def test_to_and_from_json(self): 68 | # training data 69 | d = {"col1": ["a", "a", "c"]} 70 | df = pd.DataFrame(data=d) 71 | # fit encoder 72 | le = LabelEncoder() 73 | le.fit(df["col1"]) 74 | 75 | # new encoder 76 | new_le = LabelEncoder() 77 | new_le.from_json(le.to_json()) 78 | 79 | # test data 80 | d_test = {"col2": ["c", "c", "a"]} 81 | df_test = pd.DataFrame(data=d_test) 82 | # transform 83 | y = new_le.transform(df_test["col2"]) 84 | self.assertEqual(y[0], 1) 85 | self.assertEqual(y[1], 1) 86 | self.assertEqual(y[2], 0) 87 | 88 | 
def test_to_and_from_json_booleans(self): 89 | # training data 90 | d = {"col1": [True, False, True]} 91 | df = pd.DataFrame(data=d) 92 | # fit encoder 93 | le = LabelEncoder() 94 | le.fit(df["col1"]) 95 | 96 | # new encoder 97 | new_le = LabelEncoder() 98 | new_le.from_json(json.loads(json.dumps(le.to_json(), indent=4))) 99 | 100 | # test data 101 | d_test = {"col2": [True, False, True]} 102 | df_test = pd.DataFrame(data=d_test) 103 | # transform 104 | y = new_le.transform(df_test["col2"]) 105 | 106 | self.assertEqual(y[0], 1) 107 | self.assertEqual(y[1], 0) 108 | self.assertEqual(y[2], 1) 109 | 110 | def test_fit_on_numeric_categories(self): 111 | # categories are as strings 112 | # but they represent numbers 113 | # we force encoder to sort them by numeric values 114 | # it is needed for computing predictions for many classes 115 | 116 | # training data 117 | d = {"col1": ["1", "10", "2"]} 118 | df = pd.DataFrame(data=d) 119 | le = LabelEncoder(try_to_fit_numeric=True) 120 | # check first column 121 | le.fit(df["col1"]) 122 | data_json = le.to_json() 123 | print(data_json) 124 | # values from column should be in data json 125 | self.assertTrue("1" in data_json) 126 | self.assertTrue("10" in data_json) 127 | self.assertTrue("2" in data_json) 128 | # there is numeric order for values 129 | self.assertEqual(0, data_json["1"]) 130 | self.assertEqual(1, data_json["2"]) 131 | self.assertEqual(2, data_json["10"]) 132 | p = le.transform(df["col1"]) 133 | p2 = le.transform(np.array(df["col1"].values)) 134 | self.assertEqual(p[0], 0) 135 | self.assertEqual(p[1], 2) 136 | self.assertEqual(p[2], 1) 137 | 138 | self.assertEqual(p[0], p2[0]) 139 | self.assertEqual(p[1], p2[1]) 140 | self.assertEqual(p[2], p2[2]) 141 | 142 | new_le = LabelEncoder() 143 | new_le.from_json(json.loads(json.dumps(le.to_json(), indent=4))) 144 | p2 = new_le.transform(df["col1"]) 145 | 146 | self.assertEqual(p[0], p2[0]) 147 | self.assertEqual(p[1], p2[1]) 148 | self.assertEqual(p[2], p2[2]) 149 | ``` -------------------------------------------------------------------------------- /tests/tests_algorithms/test_nn.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import tempfile 3 | import unittest 4 | 5 | from numpy.testing import assert_almost_equal 6 | from sklearn import datasets 7 | from sklearn import preprocessing 8 | 9 | from supervised.algorithms.nn import MLPAlgorithm, MLPRegressorAlgorithm 10 | from supervised.utils.metric import Metric 11 | 12 | 13 | class MLPAlgorithmTest(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.X, cls.y = datasets.make_classification( 17 | n_samples=100, 18 | n_features=5, 19 | n_informative=4, 20 | n_redundant=1, 21 | n_classes=2, 22 | n_clusters_per_class=3, 23 | n_repeated=0, 24 | shuffle=False, 25 | random_state=1, 26 | ) 27 | 28 | cls.params = { 29 | "dense_1_size": 8, 30 | "dense_2_size": 4, 31 | "learning_rate": 0.01, 32 | "ml_task": "binary_classification", 33 | } 34 | 35 | def test_fit_predict(self): 36 | metric = Metric({"name": "logloss"}) 37 | nn = MLPAlgorithm(self.params) 38 | nn.fit(self.X, self.y) 39 | y_predicted = nn.predict_proba(self.X) 40 | loss = metric(self.y, y_predicted) 41 | self.assertLess(loss, 2) 42 | 43 | def test_copy(self): 44 | # train model #1 45 | metric = Metric({"name": "logloss"}) 46 | nn = MLPAlgorithm(self.params) 47 | nn.fit(self.X, self.y) 48 | y_predicted = nn.predict(self.X) 49 | loss = metric(self.y, y_predicted) 50 | # create model #2 51 | nn2 = 
MLPAlgorithm(self.params) 52 | # do a copy and use it for predictions 53 | nn2 = nn.copy() 54 | self.assertEqual(type(nn), type(nn2)) 55 | y_predicted = nn2.predict(self.X) 56 | loss2 = metric(self.y, y_predicted) 57 | self.assertEqual(loss, loss2) 58 | 59 | # the loss of model #2 should not change 60 | y_predicted = nn2.predict(self.X) 61 | loss4 = metric(self.y, y_predicted) 62 | assert_almost_equal(loss2, loss4) 63 | 64 | def test_save_and_load(self): 65 | metric = Metric({"name": "logloss"}) 66 | nn = MLPAlgorithm(self.params) 67 | nn.fit(self.X, self.y) 68 | y_predicted = nn.predict(self.X) 69 | loss = metric(self.y, y_predicted) 70 | 71 | filename = os.path.join(tempfile.gettempdir(), os.urandom(12).hex()) 72 | 73 | nn.save(filename) 74 | json_desc = nn.get_params() 75 | nn2 = MLPAlgorithm(json_desc["params"]) 76 | nn2.load(filename) 77 | # Finished with the file, delete it 78 | os.remove(filename) 79 | 80 | y_predicted = nn2.predict(self.X) 81 | loss2 = metric(self.y, y_predicted) 82 | assert_almost_equal(loss, loss2) 83 | 84 | 85 | class MLPRegressorAlgorithmTest(unittest.TestCase): 86 | @classmethod 87 | def setUpClass(cls): 88 | cls.X, cls.y = datasets.make_regression( 89 | n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0 90 | ) 91 | 92 | cls.params = { 93 | "dense_layers": 2, 94 | "dense_1_size": 8, 95 | "dense_2_size": 4, 96 | "dropout": 0, 97 | "learning_rate": 0.01, 98 | "momentum": 0.9, 99 | "decay": 0.001, 100 | "ml_task": "regression", 101 | } 102 | 103 | cls.y = preprocessing.scale(cls.y) 104 | 105 | def test_fit_predict(self): 106 | metric = Metric({"name": "mse"}) 107 | nn = MLPRegressorAlgorithm(self.params) 108 | nn.fit(self.X, self.y) 109 | y_predicted = nn.predict(self.X) 110 | loss = metric(self.y, y_predicted) 111 | self.assertLess(loss, 2) 112 | 113 | 114 | class MultiClassNeuralNetworkAlgorithmTest(unittest.TestCase): 115 | @classmethod 116 | def setUpClass(cls): 117 | cls.X, cls.y = datasets.make_classification( 118 | n_samples=100, 119 | n_features=5, 120 | n_informative=4, 121 | n_redundant=1, 122 | n_classes=3, 123 | n_clusters_per_class=3, 124 | n_repeated=0, 125 | shuffle=False, 126 | random_state=0, 127 | ) 128 | 129 | cls.params = { 130 | "dense_layers": 2, 131 | "dense_1_size": 8, 132 | "dense_2_size": 4, 133 | "dropout": 0, 134 | "learning_rate": 0.01, 135 | "momentum": 0.9, 136 | "decay": 0.001, 137 | "ml_task": "multiclass_classification", 138 | "num_class": 3, 139 | } 140 | 141 | lb = preprocessing.LabelEncoder() 142 | lb.fit(cls.y) 143 | cls.y = lb.transform(cls.y) 144 | 145 | def test_fit_predict(self): 146 | metric = Metric({"name": "logloss"}) 147 | nn = MLPAlgorithm(self.params) 148 | nn.fit(self.X, self.y) 149 | y_predicted = nn.predict(self.X) 150 | loss = metric(self.y, y_predicted) 151 | self.assertLess(loss, 2) 152 | 153 | def test_is_fitted(self): 154 | model = MLPAlgorithm(self.params) 155 | self.assertFalse(model.is_fitted()) 156 | model.fit(self.X, self.y) 157 | self.assertTrue(model.is_fitted()) 158 | ``` -------------------------------------------------------------------------------- /supervised/validation/validator_split.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | import warnings 4 | 5 | import numpy as np 6 | 7 | log = logging.getLogger(__name__) 8 | 9 | from sklearn.model_selection import train_test_split 10 | 11 | from supervised.exceptions import AutoMLException 12 | from supervised.utils.utils import load_data 13 | 
from supervised.validation.validator_base import BaseValidator 14 | 15 | 16 | class SplitValidator(BaseValidator): 17 | def __init__(self, params): 18 | BaseValidator.__init__(self, params) 19 | 20 | self.train_ratio = self.params.get("train_ratio", 0.8) 21 | self.shuffle = self.params.get("shuffle", True) 22 | self.stratify = self.params.get("stratify", False) 23 | self.random_seed = self.params.get("random_seed", 1234) 24 | self.repeats = self.params.get("repeats", 1) 25 | 26 | if not self.shuffle and self.repeats > 1: 27 | warnings.warn( 28 | "Disable repeats in validation because shuffle is disabled", UserWarning 29 | ) 30 | self.repeats = 1 31 | 32 | self._results_path = self.params.get("results_path") 33 | self._X_path = self.params.get("X_path") 34 | self._y_path = self.params.get("y_path") 35 | self._sample_weight_path = self.params.get("sample_weight_path") 36 | self._sensitive_features_path = self.params.get("sensitive_features_path") 37 | 38 | if self._X_path is None or self._y_path is None: 39 | raise AutoMLException("No data path set in SplitValidator params") 40 | 41 | def get_split(self, k=0, repeat=0): 42 | X = load_data(self._X_path) 43 | y = load_data(self._y_path) 44 | y = y["target"] 45 | 46 | sample_weight = None 47 | if self._sample_weight_path is not None: 48 | sample_weight = load_data(self._sample_weight_path) 49 | sample_weight = sample_weight["sample_weight"] 50 | 51 | sensitive_features = None 52 | if self._sensitive_features_path is not None: 53 | sensitive_features = load_data(self._sensitive_features_path) 54 | 55 | stratify = None 56 | if self.stratify: 57 | stratify = y 58 | if self.shuffle == False: 59 | stratify = None 60 | 61 | input_data = [X, y] 62 | if sample_weight is not None: 63 | input_data += [sample_weight] 64 | if sensitive_features is not None: 65 | input_data += [sensitive_features] 66 | 67 | output_data = train_test_split( 68 | *input_data, 69 | train_size=self.train_ratio, 70 | test_size=1.0 - self.train_ratio, 71 | shuffle=self.shuffle, 72 | stratify=stratify, 73 | random_state=self.random_seed + repeat, 74 | ) 75 | 76 | X_train = output_data[0] 77 | X_validation = output_data[1] 78 | y_train = output_data[2] 79 | y_validation = output_data[3] 80 | if sample_weight is not None: 81 | sample_weight_train = output_data[4] 82 | sample_weight_validation = output_data[5] 83 | if sensitive_features is not None: 84 | sensitive_features_train = output_data[6] 85 | sensitive_features_validation = output_data[7] 86 | else: 87 | if sensitive_features is not None: 88 | sensitive_features_train = output_data[4] 89 | sensitive_features_validation = output_data[5] 90 | 91 | train_data = {"X": X_train, "y": y_train} 92 | validation_data = {"X": X_validation, "y": y_validation} 93 | if sample_weight is not None: 94 | train_data["sample_weight"] = sample_weight_train 95 | validation_data["sample_weight"] = sample_weight_validation 96 | if sensitive_features is not None: 97 | train_data["sensitive_features"] = sensitive_features_train 98 | validation_data["sensitive_features"] = sensitive_features_validation 99 | 100 | repeat_str = f"repeat_{repeat}_" if self.repeats > 1 else "" 101 | 102 | train_data_file = os.path.join( 103 | self._results_path, f"split_{repeat_str}train_indices.npy" 104 | ) 105 | validation_data_file = os.path.join( 106 | self._results_path, f"split_{repeat_str}validation_indices.npy" 107 | ) 108 | 109 | np.save(train_data_file, X_train.index) 110 | np.save(validation_data_file, X_validation.index) 111 | 112 | return train_data, 
validation_data 113 | 114 | def get_n_splits(self): 115 | return 1 116 | 117 | def get_repeats(self): 118 | return self.repeats 119 | 120 | 121 | """ 122 | import numpy as np 123 | import pandas as pd 124 | 125 | from sklearn.utils.fixes import bincount 126 | from sklearn.model_selection import train_test_split 127 | 128 | import logging 129 | logger = logging.getLogger('mljar') 130 | 131 | 132 | def validation_split(train, validation_train_split, stratify, shuffle, random_seed): 133 | 134 | if shuffle: 135 | else: 136 | if stratify is None: 137 | train, validation = data_split(validation_train_split, train) 138 | else: 139 | train, validation = data_split_stratified(validation_train_split, train, stratify) 140 | return train, validation 141 | 142 | 143 | """ 144 | ``` -------------------------------------------------------------------------------- /supervised/tuner/optuna/xgboost.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | import optuna 3 | import optuna_integration 4 | import xgboost as xgb 5 | 6 | from supervised.algorithms.registry import ( 7 | MULTICLASS_CLASSIFICATION, 8 | ) 9 | from supervised.algorithms.xgboost import xgboost_eval_metric, xgboost_objective 10 | from supervised.utils.metric import ( 11 | Metric, 12 | xgboost_eval_metric_accuracy, 13 | xgboost_eval_metric_average_precision, 14 | xgboost_eval_metric_f1, 15 | xgboost_eval_metric_mse, 16 | xgboost_eval_metric_pearson, 17 | xgboost_eval_metric_r2, 18 | xgboost_eval_metric_spearman, 19 | xgboost_eval_metric_user_defined, 20 | ) 21 | 22 | EPS = 1e-8 23 | 24 | 25 | class XgboostObjective: 26 | def __init__( 27 | self, 28 | ml_task, 29 | X_train, 30 | y_train, 31 | sample_weight, 32 | X_validation, 33 | y_validation, 34 | sample_weight_validation, 35 | eval_metric, 36 | n_jobs, 37 | random_state, 38 | ): 39 | self.dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weight) 40 | self.dvalidation = xgb.DMatrix( 41 | X_validation, label=y_validation, weight=sample_weight_validation 42 | ) 43 | self.X_validation = X_validation 44 | self.y_validation = y_validation 45 | self.eval_metric = eval_metric 46 | self.n_jobs = n_jobs 47 | 48 | self.learning_rate = 0.0125 49 | self.rounds = 1000 50 | self.early_stopping_rounds = 50 51 | self.seed = random_state 52 | 53 | self.objective = "" 54 | self.eval_metric_name = "" 55 | self.num_class = ( 56 | len(np.unique(y_train)) if ml_task == MULTICLASS_CLASSIFICATION else None 57 | ) 58 | 59 | self.objective = xgboost_objective(ml_task, eval_metric.name) 60 | self.eval_metric_name = xgboost_eval_metric(ml_task, eval_metric.name) 61 | 62 | self.custom_eval_metric = None 63 | if self.eval_metric_name == "r2": 64 | self.custom_eval_metric = xgboost_eval_metric_r2 65 | elif self.eval_metric_name == "spearman": 66 | self.custom_eval_metric = xgboost_eval_metric_spearman 67 | elif self.eval_metric_name == "pearson": 68 | self.custom_eval_metric = xgboost_eval_metric_pearson 69 | elif self.eval_metric_name == "f1": 70 | self.custom_eval_metric = xgboost_eval_metric_f1 71 | elif self.eval_metric_name == "average_precision": 72 | self.custom_eval_metric = xgboost_eval_metric_average_precision 73 | elif self.eval_metric_name == "accuracy": 74 | self.custom_eval_metric = xgboost_eval_metric_accuracy 75 | elif self.eval_metric_name == "mse": 76 | self.custom_eval_metric = xgboost_eval_metric_mse 77 | elif self.eval_metric_name == "user_defined_metric": 78 | self.custom_eval_metric = xgboost_eval_metric_user_defined 79 | 
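    # Descriptive note on the method below: __call__ makes this object a valid Optuna
    # objective, so an instance can be handed directly to study.optimize(). Each trial
    # samples XGBoost hyperparameters, trains with early stopping plus the
    # XGBoostPruningCallback, and scores the validation predictions with the requested
    # eval_metric; the score is sign-flipped for metrics that are maximized (presumably
    # so a minimizing study can handle both kinds of metric), and unexpected training
    # errors make the trial return None.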
80 | def __call__(self, trial): 81 | param = { 82 | "objective": self.objective, 83 | "eval_metric": self.eval_metric_name, 84 | "tree_method": "hist", 85 | "booster": "gbtree", 86 | "eta": trial.suggest_categorical("eta", [0.0125, 0.025, 0.05, 0.1]), 87 | "max_depth": trial.suggest_int("max_depth", 2, 12), 88 | "lambda": trial.suggest_float("lambda", EPS, 10.0, log=True), 89 | "alpha": trial.suggest_float("alpha", EPS, 10.0, log=True), 90 | "colsample_bytree": min( 91 | trial.suggest_float("colsample_bytree", 0.3, 1.0 + EPS), 1.0 92 | ), 93 | "subsample": min(trial.suggest_float("subsample", 0.3, 1.0 + EPS), 1.0), 94 | "min_child_weight": trial.suggest_int("min_child_weight", 1, 100), 95 | "n_jobs": self.n_jobs, 96 | "seed": self.seed, 97 | "verbosity": 0, 98 | } 99 | if self.custom_eval_metric is not None: 100 | del param["eval_metric"] 101 | 102 | if self.num_class is not None: 103 | param["num_class"] = self.num_class 104 | try: 105 | pruning_callback = optuna_integration.XGBoostPruningCallback( 106 | trial, f"validation-{self.eval_metric_name}" 107 | ) 108 | bst = xgb.train( 109 | param, 110 | self.dtrain, 111 | self.rounds, 112 | evals=[(self.dvalidation, "validation")], 113 | early_stopping_rounds=self.early_stopping_rounds, 114 | callbacks=[pruning_callback], 115 | verbose_eval=False, 116 | custom_metric=self.custom_eval_metric, 117 | ) 118 | preds = bst.predict( 119 | self.dvalidation, iteration_range=(0, bst.best_iteration) 120 | ) 121 | score = self.eval_metric(self.y_validation, preds) 122 | if Metric.optimize_negative(self.eval_metric.name): 123 | score *= -1.0 124 | except optuna.exceptions.TrialPruned as e: 125 | raise e 126 | except Exception as e: 127 | print("Exception in XgboostObjective", str(e)) 128 | return None 129 | 130 | return score 131 | ``` -------------------------------------------------------------------------------- /supervised/algorithms/nn.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import warnings 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import sklearn 7 | from sklearn.base import ClassifierMixin, RegressorMixin 8 | from sklearn.neural_network import MLPClassifier, MLPRegressor 9 | 10 | from supervised.algorithms.registry import ( 11 | BINARY_CLASSIFICATION, 12 | MULTICLASS_CLASSIFICATION, 13 | REGRESSION, 14 | AlgorithmsRegistry, 15 | ) 16 | from supervised.algorithms.sklearn import SklearnAlgorithm 17 | from supervised.utils.config import LOG_LEVEL 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(LOG_LEVEL) 21 | 22 | 23 | class NNFit(SklearnAlgorithm): 24 | def file_extension(self): 25 | return "neural_network" 26 | 27 | def is_fitted(self): 28 | return ( 29 | hasattr(self.model, "n_iter_") 30 | and self.model.n_iter_ is not None 31 | and self.model.n_iter_ > 0 32 | ) 33 | 34 | def fit( 35 | self, 36 | X, 37 | y, 38 | sample_weight=None, 39 | X_validation=None, 40 | y_validation=None, 41 | sample_weight_validation=None, 42 | log_to_file=None, 43 | max_time=None, 44 | ): 45 | with warnings.catch_warnings(): 46 | warnings.simplefilter(action="ignore") 47 | # filter 48 | # X does not have valid feature names, but MLPClassifier was fitted with feature names 49 | self.model.fit(X, y) 50 | 51 | if log_to_file is not None: 52 | loss_curve = self.model.loss_curve_ 53 | result = pd.DataFrame( 54 | { 55 | "iteration": range(len(loss_curve)), 56 | "train": loss_curve, 57 | "validation": None, 58 | } 59 | ) 60 | result.to_csv(log_to_file, index=False, 
header=False) 61 | 62 | if self.params["ml_task"] != REGRESSION: 63 | self.classes_ = np.unique(y) 64 | 65 | 66 | class MLPAlgorithm(ClassifierMixin, NNFit): 67 | algorithm_name = "Neural Network" 68 | algorithm_short_name = "Neural Network" 69 | 70 | def __init__(self, params): 71 | super(MLPAlgorithm, self).__init__(params) 72 | logger.debug("MLPAlgorithm.__init__") 73 | self.max_iters = 1 74 | self.library_version = sklearn.__version__ 75 | h1 = params.get("dense_1_size", 32) 76 | h2 = params.get("dense_2_size", 16) 77 | learning_rate = params.get("learning_rate", 0.05) 78 | 79 | max_iter = 500 80 | self.model = MLPClassifier( 81 | hidden_layer_sizes=(h1, h2), 82 | activation="relu", 83 | solver="adam", 84 | learning_rate=params.get("learning_rate_type", "constant"), 85 | learning_rate_init=learning_rate, 86 | alpha=params.get("alpha", 0.0001), 87 | early_stopping=True, 88 | n_iter_no_change=50, 89 | max_iter=max_iter, 90 | random_state=params.get("seed", 123), 91 | ) 92 | 93 | def get_metric_name(self): 94 | return "logloss" 95 | 96 | 97 | class MLPRegressorAlgorithm(RegressorMixin, NNFit): 98 | algorithm_name = "Neural Network" 99 | algorithm_short_name = "Neural Network" 100 | 101 | def __init__(self, params): 102 | super(MLPRegressorAlgorithm, self).__init__(params) 103 | logger.debug("MLPRegressorAlgorithm.__init__") 104 | self.max_iters = 1 105 | self.library_version = sklearn.__version__ 106 | h1 = params.get("dense_1_size", 32) 107 | h2 = params.get("dense_2_size", 16) 108 | learning_rate = params.get("learning_rate", 0.05) 109 | momentum = params.get("momentum", 0.9) 110 | early_stopping = True 111 | max_iter = 500 112 | self.model = MLPRegressor( 113 | hidden_layer_sizes=(h1, h2), 114 | activation="relu", 115 | solver="adam", 116 | learning_rate="constant", 117 | learning_rate_init=learning_rate, 118 | momentum=momentum, 119 | early_stopping=early_stopping, 120 | max_iter=max_iter, 121 | ) 122 | 123 | def get_metric_name(self): 124 | return "mse" 125 | 126 | 127 | nn_params = { 128 | "dense_1_size": [16, 32, 64], 129 | "dense_2_size": [4, 8, 16, 32], 130 | "learning_rate": [0.01, 0.05, 0.08, 0.1], 131 | } 132 | 133 | default_nn_params = {"dense_1_size": 32, "dense_2_size": 16, "learning_rate": 0.05} 134 | 135 | additional = {"max_rows_limit": None, "max_cols_limit": None} 136 | 137 | required_preprocessing = [ 138 | "missing_values_inputation", 139 | "convert_categorical", 140 | "datetime_transform", 141 | "text_transform", 142 | "scale", 143 | "target_as_integer", 144 | ] 145 | 146 | AlgorithmsRegistry.add( 147 | BINARY_CLASSIFICATION, 148 | MLPAlgorithm, 149 | nn_params, 150 | required_preprocessing, 151 | additional, 152 | default_nn_params, 153 | ) 154 | 155 | AlgorithmsRegistry.add( 156 | MULTICLASS_CLASSIFICATION, 157 | MLPAlgorithm, 158 | nn_params, 159 | required_preprocessing, 160 | additional, 161 | default_nn_params, 162 | ) 163 | 164 | required_preprocessing = [ 165 | "missing_values_inputation", 166 | "convert_categorical", 167 | "datetime_transform", 168 | "text_transform", 169 | "scale", 170 | "target_scale", 171 | ] 172 | 173 | AlgorithmsRegistry.add( 174 | REGRESSION, 175 | MLPRegressorAlgorithm, 176 | nn_params, 177 | required_preprocessing, 178 | additional, 179 | default_nn_params, 180 | ) 181 | ``` -------------------------------------------------------------------------------- /supervised/utils/leaderboard_plots.py: -------------------------------------------------------------------------------- ```python 1 | import logging 2 | import os 3 | 4 | import 
numpy as np 5 | import pandas as pd 6 | 7 | logger = logging.getLogger(__name__) 8 | from supervised.utils.config import LOG_LEVEL 9 | from supervised.utils.metric import Metric 10 | 11 | logger.setLevel(LOG_LEVEL) 12 | 13 | import warnings 14 | 15 | import matplotlib.pyplot as plt 16 | 17 | warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 18 | 19 | 20 | markers = { 21 | "Baseline": {"color": "tab:cyan", "marker": "8"}, 22 | "Linear": {"color": "tab:pink", "marker": "s"}, 23 | "Decision Tree": {"color": "tab:gray", "marker": "^"}, 24 | "Random Forest": {"color": "tab:green", "marker": "o"}, 25 | "Extra Trees": {"color": "tab:brown", "marker": "v"}, 26 | "LightGBM": {"color": "tab:purple", "marker": "P"}, 27 | "Xgboost": {"color": "tab:blue", "marker": "*"}, 28 | "CatBoost": {"color": "tab:orange", "marker": "D"}, 29 | "Neural Network": {"color": "tab:red", "marker": "x"}, 30 | "Nearest Neighbors": {"color": "tab:olive", "marker": "+"}, 31 | "Ensemble": {"color": "black", "marker": "p"}, 32 | } 33 | 34 | 35 | class LeaderboardPlots: 36 | performance_fname = "ldb_performance.png" 37 | performance_boxplot_fname = "ldb_performance_boxplot.png" 38 | 39 | @staticmethod 40 | def compute(ldb, model_path, fout, fairness_threshold=None): 41 | if ldb.shape[0] < 2: 42 | return 43 | # Scatter plot 44 | plt.figure(figsize=(10, 7)) 45 | for model_type in ldb.model_type.unique(): 46 | ii = ldb.model_type == model_type 47 | plt.plot( 48 | ldb.metric_value[ii], 49 | markers[model_type]["marker"], 50 | markersize=12, 51 | alpha=0.75, 52 | color=markers[model_type]["color"], 53 | label=model_type, 54 | ) 55 | # plt.plot(ldb.metric_value, "*", markersize=12, alpha=0.75) 56 | 57 | plt.xlabel("#Iteration") 58 | plt.ylabel(ldb.metric_type.iloc[0]) 59 | plt.legend() 60 | plt.title("AutoML Performance") 61 | plt.tight_layout(pad=2.0) 62 | plot_path = os.path.join(model_path, LeaderboardPlots.performance_fname) 63 | plt.savefig(plot_path) 64 | plt.close("all") 65 | 66 | fout.write("\n\n### AutoML Performance\n") 67 | fout.write(f"") 68 | 69 | # Boxplot 70 | by = "model_type" 71 | column = "metric_value" 72 | df2 = pd.DataFrame({col: vals[column] for col, vals in ldb.groupby(by)}) 73 | 74 | ascending_sort = Metric.optimize_negative(ldb.metric_type.iloc[0]) 75 | mins = df2.min().sort_values(ascending=ascending_sort) 76 | 77 | plt.figure(figsize=(10, 7)) 78 | # plt.title("") 79 | plt.ylabel(ldb.metric_type.iloc[0]) 80 | df2[mins.index].boxplot(rot=90, fontsize=12) 81 | 82 | plt.tight_layout(pad=2.0) 83 | plot_path = os.path.join(model_path, LeaderboardPlots.performance_boxplot_fname) 84 | plt.savefig(plot_path) 85 | plt.close("all") 86 | 87 | fout.write("\n\n### AutoML Performance Boxplot\n") 88 | fout.write( 89 | f"" 90 | ) 91 | 92 | if fairness_threshold is not None: 93 | fairness_metrics = [ 94 | f for f in ldb.columns if "fairness_" in f and f != "fairness_metric" 95 | ] 96 | for fm in fairness_metrics: 97 | x_axis_name = ldb.metric_type.iloc[0] 98 | y_axis_name = ldb["fairness_metric"].iloc[0] 99 | 100 | # Scatter plot 101 | plt.figure(figsize=(10, 7)) 102 | for model_type in ldb.model_type.unique(): 103 | ii = ldb.model_type == model_type 104 | plt.plot( 105 | ldb.metric_value[ii], 106 | ldb[fm][ii], 107 | markers[model_type]["marker"], 108 | markersize=12, 109 | alpha=0.75, 110 | color=markers[model_type]["color"], 111 | label=model_type, 112 | ) 113 | 114 | plt.xlabel(x_axis_name) 115 | plt.ylabel(y_axis_name) 116 | plt.legend() 117 | plt.title(f"Performance vs {fm}") 118 | 
plt.tight_layout(pad=2.0) 119 | 120 | ymin = 0 121 | ymax = max(1, ldb[fm].max() * 1.1) 122 | plt.ylim(0, ymax) 123 | if "ratio" in y_axis_name: 124 | plt.axhspan(fairness_threshold, ymax, color="green", alpha=0.05) 125 | plt.axhspan(ymin, fairness_threshold, color="red", alpha=0.05) 126 | else: 127 | # difference metric 128 | plt.axhspan(ymin, fairness_threshold, color="green", alpha=0.05) 129 | plt.axhspan(fairness_threshold, ymax, color="red", alpha=0.05) 130 | 131 | fname = f"performance_vs_{fm}.png" 132 | plot_path = os.path.join(model_path, fname) 133 | plt.savefig(plot_path) 134 | plt.close("all") 135 | 136 | fout.write(f"\n\n### Performance vs {fm}\n") 137 | fout.write(f"") 138 | ```
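A minimal usage sketch for the `LeaderboardPlots` helper above. The leaderboard values and the report filename below are made up for illustration; `compute` only needs a frame with at least two rows and the `model_type`, `metric_type` and `metric_value` columns it reads, plus an open text handle to append the markdown sections to.

```python
import pandas as pd

from supervised.utils.leaderboard_plots import LeaderboardPlots

# Hypothetical leaderboard frame with the columns compute() accesses.
ldb = pd.DataFrame(
    {
        "model_type": ["Baseline", "Xgboost", "LightGBM"],
        "metric_type": ["logloss", "logloss", "logloss"],
        "metric_value": [0.69, 0.41, 0.39],
    }
)

# Saves ldb_performance.png and ldb_performance_boxplot.png into model_path
# and appends the matching markdown sections to the report file.
with open("leaderboard_report.md", "a") as fout:
    LeaderboardPlots.compute(ldb, model_path=".", fout=fout)
```

With `fairness_threshold` left at `None`, only the scatter and box plots are produced; the per-metric fairness plots additionally expect `fairness_*` columns and a `fairness_metric` column in the leaderboard.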