diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index d5f28ab9a..a12bde941 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -9,5 +9,20 @@ Changelog Compatibility ............. -- Maintenance release for be compatible with scikit-learn >= 1.3.0. +- Maintenance release for compatibility with scikit-learn >= 1.3.0. :pr:`999` by :user:`Guillaume Lemaitre <glemaitre>`. + +Enhancements +............ + +- :class:`~imblearn.over_sampling.SMOTENC` now accepts a parameter `categorical_encoder` + allowing one to specify a :class:`~sklearn.preprocessing.OneHotEncoder` with custom + parameters. + :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. + +Deprecation +........... + +- The fitted attribute `ohe_` in :class:`~imblearn.over_sampling.SMOTENC` is deprecated + and will be removed in version 0.13. Use `categorical_encoder_` instead. + :pr:`1000` by :user:`Guillaume Lemaitre <glemaitre>`. diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 967f59d6f..bd0823ed0 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -13,6 +13,7 @@ import numpy as np from scipy import sparse +from sklearn.base import clone from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.utils import _safe_indexing, check_array, check_random_state from sklearn.utils.sparsefuncs_fast import ( @@ -393,6 +394,11 @@ class SMOTENC(SMOTE): - mask array of shape (n_features, ) and ``bool`` dtype for which ``True`` indicates the categorical features. + categorical_encoder : estimator, default=None + One-hot encoder used to encode the categorical features. If `None`, a + :class:`~sklearn.preprocessing.OneHotEncoder` is used with default parameters + apart from `handle_unknown` which is set to 'ignore'. + {sampling_strategy} {random_state} @@ -431,6 +437,13 @@ class SMOTENC(SMOTE): ohe_ : :class:`~sklearn.preprocessing.OneHotEncoder` The one-hot encoder used to encode the categorical features. + .. 
deprecated:: 0.11 + `ohe_` is deprecated in 0.11 and will be removed in 0.13. Use + `categorical_encoder_` instead. + + categorical_encoder_ : estimator + The encoder used to encode the categorical features. + categorical_features_ : ndarray of shape (n_cat_features,), dtype=np.int64 Indices of the categorical features. @@ -514,12 +527,17 @@ class SMOTENC(SMOTE): _parameter_constraints: dict = { **SMOTE._parameter_constraints, "categorical_features": ["array-like"], + "categorical_encoder": [ + HasMethods(["fit_transform", "inverse_transform"]), + None, + ], } def __init__( self, categorical_features, *, + categorical_encoder=None, sampling_strategy="auto", random_state=None, k_neighbors=5, @@ -532,6 +550,7 @@ def __init__( n_jobs=n_jobs, ) self.categorical_features = categorical_features + self.categorical_encoder = categorical_encoder def _check_X_y(self, X, y): """Overwrite the checking to let pass some string for categorical @@ -603,17 +622,19 @@ def _fit_resample(self, X, y): else: dtype_ohe = np.float64 - self.ohe_ = OneHotEncoder(handle_unknown="ignore", dtype=dtype_ohe) - if hasattr(self.ohe_, "sparse_output"): - # scikit-learn >= 1.2 - self.ohe_.set_params(sparse_output=True) + if self.categorical_encoder is None: + self.categorical_encoder_ = OneHotEncoder( + handle_unknown="ignore", dtype=dtype_ohe + ) else: - self.ohe_.set_params(sparse=True) + self.categorical_encoder_ = clone(self.categorical_encoder) # the input of the OneHotEncoder needs to be dense - X_ohe = self.ohe_.fit_transform( + X_ohe = self.categorical_encoder_.fit_transform( X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical ) + if not sparse.issparse(X_ohe): + X_ohe = sparse.csr_matrix(X_ohe, dtype=dtype_ohe) # we can replace the 1 entries of the categorical features with the # median of the standard deviation. 
It will ensure that whenever @@ -636,7 +657,7 @@ def _fit_resample(self, X, y): # reverse the encoding of the categorical features X_res_cat = X_resampled[:, self.continuous_features_.size :] X_res_cat.data = np.ones_like(X_res_cat.data) - X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat) + X_res_cat_dec = self.categorical_encoder_.inverse_transform(X_res_cat) if sparse.issparse(X): X_resampled = sparse.hstack( @@ -695,7 +716,7 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): all_neighbors = nn_data[nn_num[rows]] categories_size = [self.continuous_features_.size] + [ - cat.size for cat in self.ohe_.categories_ + cat.size for cat in self.categorical_encoder_.categories_ ] for start_idx, end_idx in zip( @@ -714,6 +735,16 @@ def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps): return X_new + @property + def ohe_(self): + """One-hot encoder used to encode the categorical features.""" + warnings.warn( + "'ohe_' attribute has been deprecated in 0.11 and will be removed " + "in 0.13. 
Use 'categorical_encoder_' instead.", + FutureWarning, + ) + return self.categorical_encoder_ + @Substitution( sampling_strategy=BaseOverSampler._sampling_strategy_docstring, diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index 63f36b62c..f2c6f4aed 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py +++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -8,11 +8,20 @@ import numpy as np import pytest +import sklearn from scipy import sparse from sklearn.datasets import make_classification +from sklearn.preprocessing import OneHotEncoder from sklearn.utils._testing import assert_allclose, assert_array_equal +from sklearn.utils.fixes import parse_version from imblearn.over_sampling import SMOTENC +from imblearn.utils.estimator_checks import ( + _set_checking_parameters, + check_param_validation, +) + +sklearn_version = parse_version(sklearn.__version__) def data_heterogneous_ordered(): @@ -182,8 +191,7 @@ def test_smotenc_pandas(): smote = SMOTENC(categorical_features=categorical_features, random_state=0) X_res_pd, y_res_pd = smote.fit_resample(X_pd, y) X_res, y_res = smote.fit_resample(X, y) - # FIXME: we should use to_numpy with pandas >= 0.25 - assert_array_equal(X_res_pd.values, X_res) + assert_array_equal(X_res_pd.to_numpy(), X_res) assert_allclose(y_res_pd, y_res) @@ -240,3 +248,45 @@ def test_smote_nc_with_null_median_std(): # check that the categorical feature is not random but correspond to the # categories seen in the minority class samples assert X_res[-1, -1] == "C" + + +def test_smotenc_categorical_encoder(): + """Check that we can pass our own categorical encoder.""" + + # TODO: only use `sparse_output` when sklearn >= 1.2 + param = "sparse" if sklearn_version < parse_version("1.2") else "sparse_output" + + X, y, categorical_features = data_heterogneous_unordered() + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + 
smote.fit_resample(X, y) + + assert getattr(smote.categorical_encoder_, param) is True + + encoder = OneHotEncoder() + encoder.set_params(**{param: False}) + smote.set_params(categorical_encoder=encoder).fit_resample(X, y) + assert smote.categorical_encoder is encoder + assert smote.categorical_encoder_ is not encoder + assert getattr(smote.categorical_encoder_, param) is False + + +# TODO(0.13): remove this test +def test_smotenc_deprecation_ohe_(): + """Check that we raise a deprecation warning when using `ohe_`.""" + X, y, categorical_features = data_heterogneous_unordered() + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + smote.fit_resample(X, y) + + with pytest.warns(FutureWarning, match="'ohe_' attribute has been deprecated"): + smote.ohe_ + + +def test_smotenc_param_validation(): + """Check that we validate the parameters correctly since this estimator requires + a specific parameter. + """ + categorical_features = [0] + smote = SMOTENC(categorical_features=categorical_features, random_state=0) + name = smote.__class__.__name__ + _set_checking_parameters(smote) + check_param_validation(name, smote)