3131from ...metrics .pairwise import ValueDifferenceMetric
3232from ...utils import Substitution , check_neighbors_object , check_target_type
3333from ...utils ._docstring import _n_jobs_docstring , _random_state_docstring
34- from ...utils ._param_validation import HasMethods , Interval
34+ from ...utils ._param_validation import HasMethods , Interval , StrOptions
3535from ...utils ._validation import _check_X
36- from ...utils .fixes import _mode
36+ from ...utils .fixes import _is_pandas_df , _mode
3737from ..base import BaseOverSampler
3838
3939
@@ -395,10 +395,13 @@ class SMOTENC(SMOTE):
395395
396396 Parameters
397397 ----------
398- categorical_features : array-like of shape (n_cat_features,) or (n_features,), \
399- dtype={{bool, int, str}}
398+ categorical_features : "auto" or array-like of shape (n_cat_features,) or \
399+ (n_features,), dtype={{bool, int, str}}
400400 Specifies which features are categorical. Can either be:
401401
402+ - "auto" (default) to automatically detect categorical features. Only
403+ supported when `X` is a :class:`pandas.DataFrame` and it corresponds
404+ to columns that have a :class:`pandas.CategoricalDtype`;
402405 - array of `int` corresponding to the indices specifying the categorical
403406 features;
404407 - array of `str` corresponding to the feature names. `X` should be a pandas
@@ -538,7 +541,7 @@ class SMOTENC(SMOTE):
538541
539542 _parameter_constraints : dict = {
540543 ** SMOTE ._parameter_constraints ,
541- "categorical_features" : ["array-like" ],
544+ "categorical_features" : ["array-like" , StrOptions ({ "auto" }) ],
542545 "categorical_encoder" : [
543546 HasMethods (["fit_transform" , "inverse_transform" ]),
544547 None ,
@@ -575,12 +578,27 @@ def _check_X_y(self, X, y):
575578 return X , y , binarize_y
576579
577580 def _validate_column_types (self , X ):
578- self .categorical_features_ = np .array (
579- _get_column_indices (X , self .categorical_features )
580- )
581- self .continuous_features_ = np .setdiff1d (
582- np .arange (self .n_features_ ), self .categorical_features_
583- )
581+ """Compute the indices of the categorical and continuous features."""
582+ if self .categorical_features == "auto" :
583+ if not _is_pandas_df (X ):
584+ raise ValueError (
585+ "When `categorical_features='auto'`, the input data "
586+ f"should be a pandas.DataFrame. Got { type (X )} instead."
587+ )
588+ import pandas as pd # safely import pandas now
589+
590+ are_columns_categorical = np .array (
591+ [isinstance (col_dtype , pd .CategoricalDtype ) for col_dtype in X .dtypes ]
592+ )
593+ self .categorical_features_ = np .flatnonzero (are_columns_categorical )
594+ self .continuous_features_ = np .flatnonzero (~ are_columns_categorical )
595+ else :
596+ self .categorical_features_ = np .array (
597+ _get_column_indices (X , self .categorical_features )
598+ )
599+ self .continuous_features_ = np .setdiff1d (
600+ np .arange (self .n_features_ ), self .categorical_features_
601+ )
584602
585603 def _validate_estimator (self ):
586604 super ()._validate_estimator ()
@@ -589,6 +607,11 @@ def _validate_estimator(self):
589607 "SMOTE-NC is not designed to work only with categorical "
590608 "features. It requires some numerical features."
591609 )
610+ elif self .categorical_features_ .size == 0 :
611+ raise ValueError (
612+ "SMOTE-NC is not designed to work only with numerical "
613+ "features. It requires some categorical features."
614+ )
592615
593616 def _fit_resample (self , X , y ):
594617 # FIXME: to be removed in 0.12
0 commit comments