Skip to content

Commit 4961a58

Browse files
committed
FIX raise proper error message when only categorical passed to SMOTE-NC
1 parent 91b99ce commit 4961a58

File tree

4 files changed

+31
-1
lines changed

4 files changed

+31
-1
lines changed

doc/over_sampling.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ something specific for the categorical features. In fact, the categories of a
230230
new generated sample are decided by picking the most frequent category of the
231231
nearest neighbors present during the generation.
232232

233+
.. warning::
234+
Be aware that SMOTE-NC is not designed to work with only categorical data.
235+
233236
The other SMOTE variants and ADASYN differ from each other by selecting the
234237
samples :math:`x_i` ahead of generating the new samples.
235238

doc/whats_new/v0.7.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ Bug fixes
3838
unusable.
3939
:pr:`710` by :user:`Guillaume Lemaitre <glemaitre>`.
4040

41+
- Raise a proper error message when only categorical features
42+
are given in :class:`imblearn.over_sampling.SMOTENC`.
43+
:pr:`xxx` by :user:`Guillaume Lemaitre <glemaitre>`.
44+
4145
Enhancements
4246
............
4347

imblearn/over_sampling/_smote.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,7 @@ class SMOTENC(SMOTE):
747747
"""Synthetic Minority Over-sampling Technique for Nominal and Continuous.
748748
749749
Unlike :class:`SMOTE`, SMOTE-NC is designed for datasets containing continuous and
750+
categorical features. However, it is not designed to work with only
750751
categorical features.
751752
752753
Read more in the :ref:`User Guide <smote_adasyn>`.
@@ -893,7 +894,9 @@ def _check_X_y(self, X, y):
893894
features.
894895
"""
895896
y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
896-
X, y = check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
897+
X, y = self._validate_data(
898+
X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
899+
)
897900
return X, y, binarize_y
898901

899902
def _validate_estimator(self):
@@ -917,6 +920,12 @@ def _validate_estimator(self):
917920
np.arange(self.n_features_), self.categorical_features_
918921
)
919922

923+
if self.categorical_features_.size == self.n_features_in_:
924+
raise ValueError(
925+
"SMOTE-NC is not designed to work only with categorical "
926+
"features. It requires some numerical features."
927+
)
928+
920929
def _fit_resample(self, X, y):
921930
self.n_features_ = X.shape[1]
922931
self._validate_estimator()

imblearn/over_sampling/tests/test_smote_nc.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,17 @@ def test_smotenc_preserve_dtype():
204204
X_res, y_res = smote.fit_resample(X, y)
205205
assert X.dtype == X_res.dtype, "X dtype is not preserved"
206206
assert y.dtype == y_res.dtype, "y dtype is not preserved"
207+
208+
209+
@pytest.mark.parametrize(
210+
"categorical_features", [[True, True, True], [0, 1, 2]]
211+
)
212+
def test_smotenc_raising_error_all_categorical(categorical_features):
213+
X, y = make_classification(
214+
n_features=3, n_informative=1, n_redundant=1, n_repeated=0,
215+
n_clusters_per_class=1,
216+
)
217+
smote = SMOTENC(categorical_features=categorical_features)
218+
err_msg = "SMOTE-NC is not designed to work only with categorical features"
219+
with pytest.raises(ValueError, match=err_msg):
220+
smote.fit_resample(X, y)

0 commit comments

Comments
 (0)