From a0ebd68077404a5269e726cbfdc0fe545b6fb7fa Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 8 Jul 2023 20:10:55 +0200
Subject: [PATCH 1/5] ENH support array-like of str for categorical_features in
 SMOTENC

---
 doc/over_sampling.rst                         |  4 +--
 doc/whats_new/v0.11.rst                       |  4 +++
 imblearn/over_sampling/_smote/base.py         | 36 ++++++++++---------
 .../_smote/tests/test_smote_nc.py             | 29 +++++++++++++--
 4 files changed, 52 insertions(+), 21 deletions(-)
diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
index 581f395d7..2e6969f3f 100644
--- a/doc/over_sampling.rst
+++ b/doc/over_sampling.rst
@@ -192,8 +192,8 @@ which categorical data are treated differently::
 
 In this data set, the first and last features are considered as categorical
 features. One needs to provide this information to :class:`SMOTENC` via the
-parameters ``categorical_features`` either by passing the indices of these
-features or a boolean mask marking these features::
+parameters ``categorical_features`` either by passing the indices, the feature
+names when `X` is a pandas DataFrame, or a boolean mask marking these features::
 
   >>> from imblearn.over_sampling import SMOTENC
   >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
index aa49204f1..ac169418c 100644
--- a/doc/whats_new/v0.11.rst
+++ b/doc/whats_new/v0.11.rst
@@ -53,3 +53,7 @@ Enhancements
   :class:`~imblearn.over_sampling.RandomOverSampler` (when `shrinkage is not
   None`) now accept any data types and will not attempt any data conversion.
   :pr:`1004` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+- :class:`~imblearn.over_sampling.SMOTENC` now support passing array-like of `str`
+  when passing the `categorical_features` parameter.
+  :pr:`1007` by :user`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
index f26b91b19..74c4db5a8 100644
--- a/imblearn/over_sampling/_smote/base.py
+++ b/imblearn/over_sampling/_smote/base.py
@@ -16,7 +16,12 @@
 from sklearn.base import clone
 from sklearn.exceptions import DataConversionWarning
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
-from sklearn.utils import _safe_indexing, check_array, check_random_state
+from sklearn.utils import (
+    _get_column_indices,
+    _safe_indexing,
+    check_array,
+    check_random_state,
+)
 from sklearn.utils.sparsefuncs_fast import (
     csc_mean_variance_axis0,
     csr_mean_variance_axis0,
@@ -390,10 +395,14 @@ class SMOTENC(SMOTE):
 
     Parameters
     ----------
-    categorical_features : array-like of shape (n_cat_features,) or (n_features,)
+    categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
+            dtype={{bool, int, str}}
         Specified which features are categorical. Can either be:
 
-        - array of indices specifying the categorical features;
+        - array of `int` corresponding to the indices specifying the categorical
+          features;
+        - array of `str` corresponding to the feature names. `X` should be a pandas
+          :class:`pandas.DataFrame` in this case.
         - mask array of shape (n_features, ) and ``bool`` dtype for which
           ``True`` indicates the categorical features.
 
@@ -565,24 +574,16 @@ def _check_X_y(self, X, y):
         self._check_feature_names(X, reset=True)
         return X, y, binarize_y
 
-    def _validate_estimator(self):
-        super()._validate_estimator()
-        categorical_features = np.asarray(self.categorical_features)
-        if categorical_features.dtype.name == "bool":
-            self.categorical_features_ = np.flatnonzero(categorical_features)
-        else:
-            if any(
-                [cat not in np.arange(self.n_features_) for cat in categorical_features]
-            ):
-                raise ValueError(
-                    f"Some of the categorical indices are out of range. Indices"
-                    f" should be between 0 and {self.n_features_ - 1}"
-                )
-            self.categorical_features_ = categorical_features
+    def _validate_column_types(self, X):
+        self.categorical_features_ = np.array(
+            _get_column_indices(X, self.categorical_features)
+        )
         self.continuous_features_ = np.setdiff1d(
             np.arange(self.n_features_), self.categorical_features_
         )
 
+    def _validate_estimator(self):
+        super()._validate_estimator()
         if self.categorical_features_.size == self.n_features_in_:
             raise ValueError(
                 "SMOTE-NC is not designed to work only with categorical "
@@ -600,6 +601,7 @@ def _fit_resample(self, X, y):
             )
 
         self.n_features_ = _num_features(X)
+        self._validate_column_types(X)
         self._validate_estimator()
 
         # compute the median of the standard deviation of the minority class
diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
index fa82abeef..3d23cef64 100644
--- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py
+++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
@@ -63,7 +63,7 @@ def data_heterogneous_masked():
     X[:, 3] = rng.randint(3, size=30)
     y = np.array([0] * 10 + [1] * 20)
     # return the categories
-    return X, y, [True, False, True]
+    return X, y, [True, False, False, True]
 
 
 def data_heterogneous_unordered_multiclass():
@@ -98,7 +98,7 @@ def test_smotenc_error():
     X, y, _ = data_heterogneous_unordered()
     categorical_features = [0, 10]
     smote = SMOTENC(random_state=0, categorical_features=categorical_features)
-    with pytest.raises(ValueError, match="indices are out of range"):
+    with pytest.raises(ValueError, match="all features must be in"):
         smote.fit_resample(X, y)
 
 
@@ -324,3 +324,28 @@ def test_smotenc_bool_categorical():
     X_res, y_res = smote.fit_resample(X, y)
     pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
     assert len(X_res) == len(y_res)
+
+
+def test_smotenc_categorical_features_str():
+    """Check that `categorical_features` accepts an array-like of column names
+    (`str`) when `X` is a pandas DataFrame.
+    """
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame(
+        {
+            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+            "B": ["a", "b"] * 5,
+            "C": ["a", "b", "c"] * 3 + ["a"],
+        }
+    )
+    X = pd.concat([X] * 10, ignore_index=True)  # 100 rows, one per label in `y`
+    y = np.array([0] * 70 + [1] * 30)
+    smote = SMOTENC(categorical_features=["B", "C"], random_state=0)
+    X_res, y_res = smote.fit_resample(X, y)
+    assert X_res["B"].isin(["a", "b"]).all()  # no unseen category is generated
+    assert X_res["C"].isin(["a", "b", "c"]).all()
+    counter = Counter(y_res)
+    assert counter[0] == counter[1] == 70
+    assert_array_equal(smote.categorical_features_, [1, 2])
+    assert_array_equal(smote.continuous_features_, [0])

From 8a0b7600b0a53389e95dee96fca564625546eeb1 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 8 Jul 2023 20:13:32 +0200
Subject: [PATCH 2/5] iter

---
 doc/whats_new/v0.11.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
index ac169418c..67d89d27d 100644
--- a/doc/whats_new/v0.11.rst
+++ b/doc/whats_new/v0.11.rst
@@ -56,4 +56,4 @@ Enhancements
 
 - :class:`~imblearn.over_sampling.SMOTENC` now support passing array-like of `str`
   when passing the `categorical_features` parameter.
-  :pr:`1007` by :user`Guillaume Lemaitre <glemaitre>`.
+  :pr:`1008` by :user`Guillaume Lemaitre <glemaitre>`.

From 3c131ce0dc91bea0bef081875d641ff4b44755d5 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 8 Jul 2023 20:55:04 +0200
Subject: [PATCH 3/5] ENH add auto inference based on pd.CategoricalDtype in
 SMOTENC

---
 doc/whats_new/v0.11.rst                       |  4 ++
 imblearn/over_sampling/_smote/base.py         | 45 ++++++++++++-----
 .../_smote/tests/test_smote_nc.py             | 49 +++++++++++++++++++
 3 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/doc/whats_new/v0.11.rst b/doc/whats_new/v0.11.rst
index 9c7feefea..dffe65c24 100644
--- a/doc/whats_new/v0.11.rst
+++ b/doc/whats_new/v0.11.rst
@@ -61,3 +61,7 @@ Enhancements
 - :class:`~imblearn.over_sampling.SMOTENC` now support passing array-like of `str`
   when passing the `categorical_features` parameter.
   :pr:`1008` by :user`Guillaume Lemaitre <glemaitre>`.
+
+- :class:`~imblearn.over_sampling.SMOTENC` now supports automatic categorical
+  inference when `categorical_features` is set to `"auto"`.
+  :pr:`1009` by :user:`Guillaume Lemaitre <glemaitre>`.
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
index 74c4db5a8..3bbe5f3b0 100644
--- a/imblearn/over_sampling/_smote/base.py
+++ b/imblearn/over_sampling/_smote/base.py
@@ -31,9 +31,9 @@
 from ...metrics.pairwise import ValueDifferenceMetric
 from ...utils import Substitution, check_neighbors_object, check_target_type
 from ...utils._docstring import _n_jobs_docstring, _random_state_docstring
-from ...utils._param_validation import HasMethods, Interval
+from ...utils._param_validation import HasMethods, Interval, StrOptions
 from ...utils._validation import _check_X
-from ...utils.fixes import _mode
+from ...utils.fixes import _is_pandas_df, _mode
 from ..base import BaseOverSampler
 
 
@@ -395,10 +395,13 @@ class SMOTENC(SMOTE):
 
     Parameters
     ----------
-    categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
-            dtype={{bool, int, str}}
+    categorical_features : "auto" or array-like of shape (n_cat_features,) or \
+            (n_features,), dtype={{bool, int, str}}
         Specified which features are categorical. Can either be:
 
+        - "auto" (default) to automatically detect categorical features. Only
+          supported when `X` is a :class:`pandas.DataFrame` and it corresponds
+          to columns that have a :class:`pandas.CategoricalDtype`;
         - array of `int` corresponding to the indices specifying the categorical
           features;
         - array of `str` corresponding to the feature names. `X` should be a pandas
@@ -538,7 +541,7 @@ class SMOTENC(SMOTE):
 
     _parameter_constraints: dict = {
         **SMOTE._parameter_constraints,
-        "categorical_features": ["array-like"],
+        "categorical_features": ["array-like", StrOptions({"auto"})],
         "categorical_encoder": [
             HasMethods(["fit_transform", "inverse_transform"]),
             None,
@@ -575,12 +578,27 @@ def _check_X_y(self, X, y):
         return X, y, binarize_y
 
     def _validate_column_types(self, X):
-        self.categorical_features_ = np.array(
-            _get_column_indices(X, self.categorical_features)
-        )
-        self.continuous_features_ = np.setdiff1d(
-            np.arange(self.n_features_), self.categorical_features_
-        )
+        """Compute the indices of the categorical and continuous features."""
+        cat_features = self.categorical_features
+        # check the type first: `ndarray == "auto"` is element-wise, not a bool
+        if isinstance(cat_features, str) and cat_features == "auto":
+            if not _is_pandas_df(X):
+                raise ValueError(
+                    "When `categorical_features='auto'`, the input data "
+                    f"should be a pandas.DataFrame. Got {type(X)} instead."
+                )
+            import pandas as pd  # safely import pandas now
+
+            are_columns_categorical = np.array(
+                [isinstance(col_dtype, pd.CategoricalDtype) for col_dtype in X.dtypes]
+            )
+            self.categorical_features_ = np.flatnonzero(are_columns_categorical)
+            self.continuous_features_ = np.flatnonzero(~are_columns_categorical)
+        else:
+            self.categorical_features_ = np.array(_get_column_indices(X, cat_features))
+            self.continuous_features_ = np.setdiff1d(
+                np.arange(self.n_features_), self.categorical_features_
+            )
 
     def _validate_estimator(self):
         super()._validate_estimator()
@@ -589,6 +607,11 @@ def _validate_estimator(self):
                 "SMOTE-NC is not designed to work only with categorical "
                 "features. It requires some numerical features."
             )
+        elif self.categorical_features_.size == 0:
+            raise ValueError(
+                "SMOTE-NC is not designed to work only with numerical "
+                "features. It requires some categorical features."
+            )
 
     def _fit_resample(self, X, y):
         # FIXME: to be removed in 0.12
diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
index 3d23cef64..84dd6c252 100644
--- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py
+++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
@@ -349,3 +349,52 @@ def test_smotenc_categorical_features_str():
     assert counter[0] == counter[1] == 70
     assert_array_equal(smote.categorical_features_, [1, 2])
     assert_array_equal(smote.continuous_features_, [0])
+
+
+def test_smotenc_categorical_features_auto():
+    """Check that `categorical_features="auto"` selects the columns of a pandas
+    DataFrame that have a categorical dtype.
+    """
+    pd = pytest.importorskip("pandas")
+
+    columns = {
+        "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        "B": ["a", "b"] * 5,
+        "C": ["a", "b", "c"] * 3 + ["a"],
+    }
+    X = pd.concat([pd.DataFrame(columns)] * 10, ignore_index=True)
+    # only "B" and "C" get a categorical dtype; "A" stays numerical
+    X = X.astype({"B": "category", "C": "category"})
+    y = np.array([0] * 70 + [1] * 30)
+
+    sampler = SMOTENC(categorical_features="auto", random_state=0)
+    X_res, y_res = sampler.fit_resample(X, y)
+
+    assert_array_equal(sampler.categorical_features_, [1, 2])
+    assert_array_equal(sampler.continuous_features_, [0])
+    assert X_res["B"].isin(["a", "b"]).all()
+    assert X_res["C"].isin(["a", "b", "c"]).all()
+    class_counts = Counter(y_res)
+    assert class_counts[0] == class_counts[1] == 70
+
+
+def test_smote_nc_categorical_features_auto_error():
+    """Check that we raise a proper error when we cannot use the `'auto'` mode."""
+    pd = pytest.importorskip("pandas")
+
+    X = pd.DataFrame(
+        {
+            "A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+            "B": ["a", "b"] * 5,
+            "C": ["a", "b", "c"] * 3 + ["a"],
+        }
+    )
+    y = np.array([0] * 7 + [1] * 3)  # one label per row of `X`
+    smote = SMOTENC(categorical_features="auto", random_state=0)
+
+    with pytest.raises(ValueError, match="the input data should be a pandas.DataFrame"):
+        smote.fit_resample(X.to_numpy(), y)  # a NumPy array is not a DataFrame
+
+    err_msg = "SMOTE-NC is not designed to work only with numerical features"
+    with pytest.raises(ValueError, match=err_msg):
+        smote.fit_resample(X, y)  # no column of `X` has a categorical dtype

From daf8c6b1bf8bbd7e83d4673688d9c964ed99325e Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 8 Jul 2023 20:57:57 +0200
Subject: [PATCH 4/5] update documentation

---
 doc/over_sampling.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst
index 2e6969f3f..dcb5af980 100644
--- a/doc/over_sampling.rst
+++ b/doc/over_sampling.rst
@@ -193,7 +193,9 @@ which categorical data are treated differently::
 In this data set, the first and last features are considered as categorical
 features. One needs to provide this information to :class:`SMOTENC` via the
 parameters ``categorical_features`` either by passing the indices, the feature
-names when `X` is a pandas DataFrame, or a boolean mask marking these features::
+names when `X` is a pandas DataFrame, a boolean mask marking these features,
+or relying on `dtype` inference if the columns are using the
+:class:`pandas.CategoricalDtype`::
 
   >>> from imblearn.over_sampling import SMOTENC
   >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)

From 6c0a7284d6dd6bdfc65afcf2cf135027d2c11876 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Sat, 8 Jul 2023 21:01:22 +0200
Subject: [PATCH 5/5] iter

---
 imblearn/over_sampling/_smote/base.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
index bf2155028..3bbe5f3b0 100644
--- a/imblearn/over_sampling/_smote/base.py
+++ b/imblearn/over_sampling/_smote/base.py
@@ -395,7 +395,6 @@ class SMOTENC(SMOTE):
 
     Parameters
     ----------
-<<<<<<< HEAD
     categorical_features : "auto" or array-like of shape (n_cat_features,) or \
             (n_features,), dtype={{bool, int, str}}
         Specified which features are categorical. Can either be:
@@ -403,12 +402,6 @@ class SMOTENC(SMOTE):
         - "auto" (default) to automatically detect categorical features. Only
           supported when `X` is a :class:`pandas.DataFrame` and it corresponds
           to columns that have a :class:`pandas.CategoricalDtype`;
-=======
-    categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
-            dtype={{bool, int, str}}
-        Specified which features are categorical. Can either be:
-
->>>>>>> origin/master
         - array of `int` corresponding to the indices specifying the categorical
           features;
         - array of `str` corresponding to the feature names. `X` should be a pandas