From 5aa8eeba9d8a627e84a7a436a7bcdf332d0afc6f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 18:50:48 +0200 Subject: [PATCH 1/3] ENH add categorical_encoder param to SMOTENC --- doc/whats_new/v0.11.rst | 17 +++++++- imblearn/over_sampling/_smote/base.py | 42 +++++++++++++++---- .../_smote/tests/test_smote_nc.py | 39 ++++++++++++++++- 3 files changed, 87 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index d5f28ab9a..a12bde941 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -9,5 +9,20 @@ Changelog Compatibility ............. -- Maintenance release for be compatible with scikit-learn >= 1.3.0. +- Maintenance release for being compatible with scikit-learn >= 1.3.0. :pr:`999` by :user:`Guillaume Lemaitre <glemaitre>`. + +Enhancements +............ + +- :class:`~imblearn.over_sampling.SMOTENC` now accepts a parameter `categorical_encoder` + allowing users to specify a :class:`~sklearn.preprocessing.OneHotEncoder` with custom + parameters. + :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. + +Deprecation +........... + +- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated + and will be removed in version 0.13. Use `categorical_encoder_` instead. + :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 967f59d6f..8075cb988 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -13,6 +13,7 @@ import numpy as np from scipy import sparse +from sklearn.base import clone from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.utils import _safe_indexing, check_array, check_random_state from sklearn.utils.sparsefuncs_fast import ( @@ -393,6 +394,11 @@ class SMOTENC(SMOTE): - mask array of shape (n_features, ) and ``bool`` dtype for which ``True`` indicates the categorical features. 
+ categorical_encoder : estimator, default=None + One-hot encoder used to encode the categorical features. If `None`, a + :class:`~sklearn.preprocessing.OneHotEncoder` is used with default parameters + apart from `handle_unknown` which is set to 'ignore'. + {sampling_strategy} {random_state} @@ -431,6 +437,13 @@ class SMOTENC(SMOTE): ohe_ : :class:`~sklearn.preprocessing.OneHotEncoder` The one-hot encoder used to encode the categorical features. + .. deprecated:: 0.11 + `ohe_` is deprecated in 0.11 and will be removed in 0.13. Use + `categorical_encoder_` instead. + + categorical_encoder_ : estimator + The encoder used to encode the categorical features. + categorical_features_ : ndarray of shape (n_cat_features,), dtype=np.int64 Indices of the categorical features. @@ -520,6 +533,7 @@ def __init__( self, categorical_features, *, + categorical_encoder=None, sampling_strategy="auto", random_state=None, k_neighbors=5, @@ -532,6 +546,7 @@ def __init__( n_jobs=n_jobs, ) self.categorical_features = categorical_features + self.categorical_encoder = categorical_encoder def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical @@ -603,17 +618,19 @@ def _fit_resample(self, X, y): else: dtype_ohe = np.float64 - self.ohe_ = OneHotEncoder(handle_unknown="ignore", dtype=dtype_ohe) - if hasattr(self.ohe_, "sparse_output"): - # scikit-learn >= 1.2 - self.ohe_.set_params(sparse_output=True) + if self.categorical_encoder is None: + self.categorical_encoder_ = OneHotEncoder( + handle_unknown="ignore", dtype=dtype_ohe + ) else: - self.ohe_.set_params(sparse=True) + self.categorical_encoder_ = clone(self.categorical_encoder) # the input of the OneHotEncoder needs to be dense - X_ohe = self.ohe_.fit_transform( + X_ohe = self.categorical_encoder_.fit_transform( X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical ) + if not sparse.issparse(X_ohe): + X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe) # we can replace the 1 
entries of the categorical features with the # median of the standard deviation. It will ensure that whenever @@ -636,7 +653,7 @@ def _fit_resample(self, X, y): # reverse the encoding of the categorical features X_res_cat = X_resampled[:, self.continuous_features_.size :] X_res_cat.data = np.ones_like(X_res_cat.data) - X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) + X_res_cat_dec = self.categorical_encoder_.inverse_transform(X_res_cat) if sparse.issparse(X): X_resampled = sparse.hstack( @@ -695,7 +712,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): all_neighbors = nn_data[nn_num[rows]] categories_size = [self.continuous_features_.size] + [ - cat.size for cat in self.ohe_.categories_ + cat.size for cat in self.categorical_encoder_.categories_ ] for start_idx, end_idx in zip( @@ -714,6 +731,15 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): return X_new + @property + def ohe_(self): + warnings.warn( + "'ohe_' attribute has been deprecated in 0.11 and will be removed " + "in 0.13. 
Use 'categorical_encoder_' instead.", + FutureWarning, + ) + return self.categorical_encoder_ + @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index 63f36b62c..2a06209cd 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py +++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -8,12 +8,17 @@ import numpy as np import pytest +import sklearn from scipy import sparse from sklearn.datasets import make_classification +from sklearn.preprocessing import OneHotEncoder from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import parse_version from imblearn.over_sampling import SMOTENC +sklearn_version = parse_version(sklearn.__version__) + def data_heterogneous_ordered(): rng = np.random.RandomState(42) @@ -182,8 +187,7 @@ def test_smotenc_pandas(): smote = SMOTENC(categorical_features=categorical_features, random_state=0) X_res_pd, y_res_pd = smote.fit_resample(X_pd, y) X_res, y_res = smote.fit_resample(X, y) - # FIXME: we should use to_numpy with pandas >= 0.25 - assert_array_equal(X_res_pd.values, X_res) + assert_array_equal(X_res_pd.to_numpy(), X_res) assert_allclose(y_res_pd, y_res) @@ -240,3 +244,34 @@ def test_smote_nc_with_null_median_std(): # check that the categorical feature is not random but correspond to the # categories seen in the minority class samples assert X_res[-1, -1] == "C" + + +def test_smotenc_categorical_encoder(): + """Check that we can pass our own categorical encoder.""" + + # TODO: only use `sparse_output` when sklearn >= 1.2 + param = "sparse" if sklearn_version < parse_version("1.2") else "sparse_output" + + X, y, categorical_features = data_heterogneous_unordered() + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + smote.fit_resample(X, y) + + assert getattr(smote.categorical_encoder_, param) is True + 
+ encoder = OneHotEncoder() + encoder.set_params(**{param: False}) + smote.set_params(categorical_encoder=encoder).fit_resample(X, y) + assert smote.categorical_encoder is encoder + assert smote.categorical_encoder_ is not encoder + assert getattr(smote.categorical_encoder_, param) is False + + +# TODO(0.13): remove this test +def test_smotenc_deprecation_ohe_(): + """Check that we raise a deprecation warning when using `ohe_`.""" + X, y, categorical_features = data_heterogneous_unordered() + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + smote.fit_resample(X, y) + + with pytest.warns(FutureWarning, match="'ohe_' attribute has been deprecated"): + smote.ohe_ From e3ea964f39aa32ce6978583ce00157b1d61acf71 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 19:04:21 +0200 Subject: [PATCH 2/3] iter --- imblearn/over_sampling/_smote/base.py | 4 ++++ .../over_sampling/_smote/tests/test_smote_nc.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 8075cb988..a35e04d00 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -527,6 +527,10 @@ class SMOTENC(SMOTE): _parameter_constraints: dict = { **SMOTE._parameter_constraints, "categorical_features": ["array-like"], + "categorical_encoder": [ + HasMethods(["fit_transform", "inverse_transform"]), + None, + ], } def __init__( diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index 2a06209cd..f2c6f4aed 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py +++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -16,6 +16,10 @@ from sklearn.utils.fixes import parse_version from imblearn.over_sampling import SMOTENC +from imblearn.utils.estimator_checks import ( + _set_checking_parameters, + check_param_validation, +) sklearn_version = 
parse_version(sklearn.__version__) @@ -275,3 +279,14 @@ def test_smotenc_deprecation_ohe_(): with pytest.warns(FutureWarning, match="'ohe_' attribute has been deprecated"): smote.ohe_ + + +def test_smotenc_param_validation(): + """Check that we validate the parameters correctly since this estimator requires + a specific parameter. + """ + categorical_features = [0] + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + name = smote.__class__.__name__ + _set_checking_parameters(smote) + check_param_validation(name, smote) From 4acdd1ae37808fca695fbdfb6e78274b85e60eaf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 7 Jul 2023 19:56:15 +0200 Subject: [PATCH 3/3] iter --- imblearn/over_sampling/_smote/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index a35e04d00..bd0823ed0 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -737,6 +737,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): @property def ohe_(self): + """One-hot encoder used to encode the categorical features.""" warnings.warn( "'ohe_' attribute has been deprecated in 0.11 and will be removed " "in 0.13. Use 'categorical_encoder_' instead.",