Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/over_sampling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,9 @@ something specific for the categorical features. In fact, the categories of a
new generated sample are decided by picking the most frequent category of the
nearest neighbors present during the generation.

.. warning::
   Be aware that SMOTE-NC is not designed to work with only categorical data.

The other SMOTE variants and ADASYN differ from each other by selecting the
samples :math:`x_i` ahead of generating the new samples.

Expand Down
4 changes: 4 additions & 0 deletions doc/whats_new/v0.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ Bug fixes
unusable.
:pr:`710` by :user:`Guillaume Lemaitre <glemaitre>`.

- Raise a proper error message when only numerical or categorical features
are given in :class:`imblearn.over_sampling.SMOTENC`.
:pr:`720` by :user:`Guillaume Lemaitre <glemaitre>`.

Enhancements
............

Expand Down
12 changes: 10 additions & 2 deletions imblearn/over_sampling/_smote.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from sklearn.utils import check_random_state
from sklearn.utils import _safe_indexing
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0
from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0

Expand Down Expand Up @@ -747,6 +746,7 @@ class SMOTENC(SMOTE):
"""Synthetic Minority Over-sampling Technique for Nominal and Continuous.

Unlike :class:`SMOTE`, SMOTE-NC is designed for datasets containing both
continuous and categorical features. However, it is not designed to work
with only categorical features.

Read more in the :ref:`User Guide <smote_adasyn>`.
Expand Down Expand Up @@ -893,7 +893,9 @@ def _check_X_y(self, X, y):
features.
"""
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
X, y = self._validate_data(
X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
)
return X, y, binarize_y

def _validate_estimator(self):
Expand All @@ -917,6 +919,12 @@ def _validate_estimator(self):
np.arange(self.n_features_), self.categorical_features_
)

if self.categorical_features_.size == self.n_features_in_:
raise ValueError(
"SMOTE-NC is not designed to work only with categorical "
"features. It requires some numerical features."
)

def _fit_resample(self, X, y):
self.n_features_ = X.shape[1]
self._validate_estimator()
Expand Down
14 changes: 14 additions & 0 deletions imblearn/over_sampling/tests/test_smote_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,17 @@ def test_smotenc_preserve_dtype():
X_res, y_res = smote.fit_resample(X, y)
assert X.dtype == X_res.dtype, "X dtype is not preserved"
assert y.dtype == y_res.dtype, "y dtype is not preserved"


@pytest.mark.parametrize(
    "categorical_features", [[True, True, True], [0, 1, 2]]
)
def test_smotenc_raising_error_all_categorical(categorical_features):
    """Check that SMOTENC rejects input where every feature is categorical.

    Both ways of flagging features are exercised: a boolean mask and a list
    of column indices, each marking all three columns as categorical.
    """
    # Seed the generator so the dataset is identical on every run; an
    # unseeded draw makes any failure of this test hard to reproduce.
    X, y = make_classification(
        n_features=3,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_clusters_per_class=1,
        random_state=0,
    )
    smote = SMOTENC(categorical_features=categorical_features)
    err_msg = "SMOTE-NC is not designed to work only with categorical features"
    with pytest.raises(ValueError, match=err_msg):
        smote.fit_resample(X, y)