From a0ebd68077404a5269e726cbfdc0fe545b6fb7fa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Jul 2023 20:10:55 +0200 Subject: [PATCH 1/5] ENH support array-like of str for categorical_features in SMOTENC --- doc/over_sampling.rst | 4 +-- doc/whats_new/v0.11.rst | 4 +++ imblearn/over_sampling/_smote/base.py | 36 ++++++++++--------- .../_smote/tests/test_smote_nc.py | 29 +++++++++++++-- 4 files changed, 52 insertions(+), 21 deletions(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 581f395d7..2e6969f3f 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -192,8 +192,8 @@ which categorical data are treated differently:: In this data set, the first and last features are considered as categorical features. One needs to provide this information to :class:`SMOTENC` via the -parameters ``categorical_features`` either by passing the indices of these -features or a boolean mask marking these features:: +parameters ``categorical_features`` either by passing the indices, the feature +names when `X` is a pandas DataFrame, or a boolean mask marking these features:: >>> from imblearn.over_sampling import SMOTENC >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index aa49204f1..ac169418c 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -53,3 +53,7 @@ Enhancements :class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not None`) now accept any data types and will not attempt any data conversion. :pr:`1004` by :user:`Guillaume Lemaitre `. + +- :class:`~imblearn.over_sampling.SMOTENC` now supports passing array-like of `str` + when passing the `categorical_features` parameter. + :pr:`1007` by :user:`Guillaume Lemaitre <glemaitre>`. 
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index f26b91b19..74c4db5a8 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -16,7 +16,12 @@ from sklearn.base import clone from sklearn.exceptions import DataConversionWarning from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -from sklearn.utils import _safe_indexing, check_array, check_random_state +from sklearn.utils import ( + _get_column_indices, + _safe_indexing, + check_array, + check_random_state, +) from sklearn.utils.sparsefuncs_fast import ( csc_mean_variance_axis0, csr_mean_variance_axis0, @@ -390,10 +395,14 @@ class SMOTENC(SMOTE): Parameters ---------- - categorical_features : array-like of shape (n_cat_features,) or (n_features,) + categorical_features : array-like of shape (n_cat_features,) or (n_features,), \ + dtype={{bool, int, str}} Specified which features are categorical. Can either be: - - array of indices specifying the categorical features; + - array of `int` corresponding to the indices specifying the categorical + features; + - array of `str` corresponding to the feature names. `X` should be a pandas + :class:`pandas.DataFrame` in this case. - mask array of shape (n_features, ) and ``bool`` dtype for which ``True`` indicates the categorical features. @@ -565,24 +574,16 @@ def _check_X_y(self, X, y): self._check_feature_names(X, reset=True) return X, y, binarize_y - def _validate_estimator(self): - super()._validate_estimator() - categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.name == "bool": - self.categorical_features_ = np.flatnonzero(categorical_features) - else: - if any( - [cat not in np.arange(self.n_features_) for cat in categorical_features] - ): - raise ValueError( - f"Some of the categorical indices are out of range. 
Indices" - f" should be between 0 and {self.n_features_ - 1}" - ) - self.categorical_features_ = categorical_features + def _validate_column_types(self, X): + self.categorical_features_ = np.array( + _get_column_indices(X, self.categorical_features) + ) self.continuous_features_ = np.setdiff1d( np.arange(self.n_features_), self.categorical_features_ ) + def _validate_estimator(self): + super()._validate_estimator() if self.categorical_features_.size == self.n_features_in_: raise ValueError( "SMOTE-NC is not designed to work only with categorical " @@ -600,6 +601,7 @@ def _fit_resample(self, X, y): ) self.n_features_ = _num_features(X) + self._validate_column_types(X) self._validate_estimator() # compute the median of the standard deviation of the minority class diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index fa82abeef..3d23cef64 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py +++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -63,7 +63,7 @@ def data_heterogneous_masked(): X[:, 3] = rng.randint(3, size=30) y = np.array([0] * 10 + [1] * 20) # return the categories - return X, y, [True, False, True] + return X, y, [True, False, False, True] def data_heterogneous_unordered_multiclass(): @@ -98,7 +98,7 @@ def test_smotenc_error(): X, y, _ = data_heterogneous_unordered() categorical_features = [0, 10] smote = SMOTENC(random_state=0, categorical_features=categorical_features) - with pytest.raises(ValueError, match="indices are out of range"): + with pytest.raises(ValueError, match="all features must be in"): smote.fit_resample(X, y) @@ -324,3 +324,28 @@ def test_smotenc_bool_categorical(): X_res, y_res = smote.fit_resample(X, y) pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) assert len(X_res) == len(y_res) + + +def test_smotenc_categorical_features_str(): + """Check that we support array-like of strings for `categorical_features` using + pandas dataframe. 
+ """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "B": ["a", "b"] * 5, + "C": ["a", "b", "c"] * 3 + ["a"], + } + ) + X = pd.concat([X] * 10, ignore_index=True) + y = np.array([0] * 70 + [1] * 30) + smote = SMOTENC(categorical_features=["B", "C"], random_state=0) + X_res, y_res = smote.fit_resample(X, y) + assert X_res["B"].isin(["a", "b"]).all() + assert X_res["C"].isin(["a", "b", "c"]).all() + counter = Counter(y_res) + assert counter[0] == counter[1] == 70 + assert_array_equal(smote.categorical_features_, [1, 2]) + assert_array_equal(smote.continuous_features_, [0]) From 8a0b7600b0a53389e95dee96fca564625546eeb1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Jul 2023 20:13:32 +0200 Subject: [PATCH 2/5] iter --- doc/whats_new/v0.11.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index ac169418c..67d89d27d 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -56,4 +56,4 @@ Enhancements - :class:`~imblearn.over_sampling.SMOTENC` now support passing array-like of `str` when passing the `categorical_features` parameter. - :pr:`1007` by :user`Guillaume Lemaitre `. + :pr:`1008` by :user:`Guillaume Lemaitre <glemaitre>`. 
From 3c131ce0dc91bea0bef081875d641ff4b44755d5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Jul 2023 20:55:04 +0200 Subject: [PATCH 3/5] ENH add auto inference based on pd.CategoricalDtype in SMOTENC --- doc/whats_new/v0.11.rst | 4 ++ imblearn/over_sampling/_smote/base.py | 45 ++++++++++++----- .../_smote/tests/test_smote_nc.py | 49 +++++++++++++++++++ 3 files changed, 87 insertions(+), 11 deletions(-) diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst index 9c7feefea..dffe65c24 100644 --- a/doc/whats_new/v0.11.rst +++ b/doc/whats_new/v0.11.rst @@ -61,3 +61,7 @@ Enhancements - :class:`~imblearn.over_sampling.SMOTENC` now support passing array-like of `str` when passing the `categorical_features` parameter. :pr:`1008` by :user`Guillaume Lemaitre `. + +- :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical inference + when `categorical_features` is set to `"auto"`. + :pr:`1009` by :user:`Guillaume Lemaitre <glemaitre>`. diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 74c4db5a8..3bbe5f3b0 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -31,9 +31,9 @@ from ...metrics.pairwise import ValueDifferenceMetric from ...utils import Substitution, check_neighbors_object, check_target_type from ...utils._docstring import _n_jobs_docstring, _random_state_docstring -from ...utils._param_validation import HasMethods, Interval +from ...utils._param_validation import HasMethods, Interval, StrOptions from ...utils._validation import _check_X -from ...utils.fixes import _mode +from ...utils.fixes import _is_pandas_df, _mode from ..base import BaseOverSampler @@ -395,10 +395,13 @@ class SMOTENC(SMOTE): Parameters ---------- - categorical_features : array-like of shape (n_cat_features,) or (n_features,), \ - dtype={{bool, int, str}} + categorical_features : "auto" or array-like of shape (n_cat_features,) or \ + (n_features,), dtype={{bool, int, str}} 
Specified which features are categorical. Can either be: + - "auto" (default) to automatically detect categorical features. Only + supported when `X` is a :class:`pandas.DataFrame` and it corresponds + to columns that have a :class:`pandas.CategoricalDtype`; - array of `int` corresponding to the indices specifying the categorical features; - array of `str` corresponding to the feature names. `X` should be a pandas @@ -538,7 +541,7 @@ class SMOTENC(SMOTE): _parameter_constraints: dict = { **SMOTE._parameter_constraints, - "categorical_features": ["array-like"], + "categorical_features": ["array-like", StrOptions({"auto"})], "categorical_encoder": [ HasMethods(["fit_transform", "inverse_transform"]), None, @@ -575,12 +578,27 @@ def _check_X_y(self, X, y): return X, y, binarize_y def _validate_column_types(self, X): - self.categorical_features_ = np.array( - _get_column_indices(X, self.categorical_features) - ) - self.continuous_features_ = np.setdiff1d( - np.arange(self.n_features_), self.categorical_features_ - ) + """Compute the indices of the categorical and continuous features.""" + if self.categorical_features == "auto": + if not _is_pandas_df(X): + raise ValueError( + "When `categorical_features='auto'`, the input data " + f"should be a pandas.DataFrame. Got {type(X)} instead." 
+ ) + import pandas as pd # safely import pandas now + + are_columns_categorical = np.array( + [isinstance(col_dtype, pd.CategoricalDtype) for col_dtype in X.dtypes] + ) + self.categorical_features_ = np.flatnonzero(are_columns_categorical) + self.continuous_features_ = np.flatnonzero(~are_columns_categorical) + else: + self.categorical_features_ = np.array( + _get_column_indices(X, self.categorical_features) + ) + self.continuous_features_ = np.setdiff1d( + np.arange(self.n_features_), self.categorical_features_ + ) def _validate_estimator(self): super()._validate_estimator() @@ -589,6 +607,11 @@ def _validate_estimator(self): "SMOTE-NC is not designed to work only with categorical " "features. It requires some numerical features." ) + elif self.categorical_features_.size == 0: + raise ValueError( + "SMOTE-NC is not designed to work only with numerical " + "features. It requires some categorical features." + ) def _fit_resample(self, X, y): # FIXME: to be removed in 0.12 diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py index 3d23cef64..84dd6c252 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py +++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py @@ -349,3 +349,52 @@ def test_smotenc_categorical_features_str(): assert counter[0] == counter[1] == 70 assert_array_equal(smote.categorical_features_, [1, 2]) assert_array_equal(smote.continuous_features_, [0]) + + +def test_smotenc_categorical_features_auto(): + """Check that we can automatically detect categorical features based on pandas + dataframe. 
+ """ + pd = pytest.importorskip("pandas") + + X = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "B": ["a", "b"] * 5, + "C": ["a", "b", "c"] * 3 + ["a"], + } + ) + X = pd.concat([X] * 10, ignore_index=True) + X["B"] = X["B"].astype("category") + X["C"] = X["C"].astype("category") + y = np.array([0] * 70 + [1] * 30) + smote = SMOTENC(categorical_features="auto", random_state=0) + X_res, y_res = smote.fit_resample(X, y) + assert X_res["B"].isin(["a", "b"]).all() + assert X_res["C"].isin(["a", "b", "c"]).all() + counter = Counter(y_res) + assert counter[0] == counter[1] == 70 + assert_array_equal(smote.categorical_features_, [1, 2]) + assert_array_equal(smote.continuous_features_, [0]) + + +def test_smote_nc_categorical_features_auto_error(): + """Check that we raise a proper error when we cannot use the `'auto'` mode.""" + pd = pytest.importorskip("pandas") + + X = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "B": ["a", "b"] * 5, + "C": ["a", "b", "c"] * 3 + ["a"], + } + ) + y = np.array([0] * 70 + [1] * 30) + smote = SMOTENC(categorical_features="auto", random_state=0) + + with pytest.raises(ValueError, match="the input data should be a pandas.DataFrame"): + smote.fit_resample(X.to_numpy(), y) + + err_msg = "SMOTE-NC is not designed to work only with numerical features" + with pytest.raises(ValueError, match=err_msg): + smote.fit_resample(X, y) From daf8c6b1bf8bbd7e83d4673688d9c964ed99325e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Jul 2023 20:57:57 +0200 Subject: [PATCH 4/5] update documentation --- doc/over_sampling.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 2e6969f3f..dcb5af980 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -193,7 +193,9 @@ which categorical data are treated differently:: In this data set, the first and last features are considered as categorical features. 
One needs to provide this information to :class:`SMOTENC` via the parameters ``categorical_features`` either by passing the indices, the feature -names when `X` is a pandas DataFrame, or a boolean mask marking these features:: +names when `X` is a pandas DataFrame, a boolean mask marking these features, +or relying on `dtype` inference if the columns are using the +:class:`pandas.CategoricalDtype`:: >>> from imblearn.over_sampling import SMOTENC >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0) From 6c0a7284d6dd6bdfc65afcf2cf135027d2c11876 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 8 Jul 2023 21:01:22 +0200 Subject: [PATCH 5/5] iter --- imblearn/over_sampling/_smote/base.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index bf2155028..3bbe5f3b0 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -395,7 +395,6 @@ class SMOTENC(SMOTE): Parameters ---------- -<<<<<<< HEAD categorical_features : "infer" or array-like of shape (n_cat_features,) or \ (n_features,), dtype={{bool, int, str}} Specified which features are categorical. Can either be: @@ -403,12 +402,6 @@ class SMOTENC(SMOTE): - "auto" (default) to automatically detect categorical features. Only supported when `X` is a :class:`pandas.DataFrame` and it corresponds to columns that have a :class:`pandas.CategoricalDtype`; -======= - categorical_features : array-like of shape (n_cat_features,) or (n_features,), \ - dtype={{bool, int, str}} - Specified which features are categorical. Can either be: - ->>>>>>> origin/master - array of `int` corresponding to the indices specifying the categorical features; - array of `str` corresponding to the feature names. `X` should be a pandas