@@ -1297,7 +1297,75 @@ def _fit_resample(self, X, y):
         return X_resampled, y_resampled
 
 
+@Substitution(
+    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
+    n_jobs=_n_jobs_docstring,
+    random_state=_random_state_docstring,
+)
 class SMOTEN(SMOTE):
1306+ """Perform SMOTE over-sampling for nominal categorical features only.
1307+
1308+ This method is refered as SMOTEN in [1]_.
1309+
1310+ Read more in the :ref:`User Guide <smote_adasyn>`.
1311+
1312+ Parameters
1313+ ----------
1314+ {sampling_strategy}
1315+
1316+ {random_state}
1317+
1318+ k_neighbors : int or object, default=5
1319+ If ``int``, number of nearest neighbours to used to construct synthetic
1320+ samples. If object, an estimator that inherits from
1321+ :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
1322+ find the k_neighbors.
1323+
1324+ {n_jobs}
1325+
1326+ See Also
1327+ --------
1328+ SMOTE : Over-sample using SMOTE.
1329+
1330+ SMOTENC : Over-sample using SMOTE for continuous and categorical features.
1331+
1332+ BorderlineSMOTE : Over-sample using the borderline-SMOTE variant.
1333+
1334+ SVMSMOTE : Over-sample using the SVM-SMOTE variant.
1335+
1336+ ADASYN : Over-sample using ADASYN.
1337+
1338+ KMeansSMOTE : Over-sample applying a clustering before to oversample using
1339+ SMOTE.
1340+
1341+ Notes
1342+ -----
1343+ See the original papers: [1]_ for more details.
1344+
1345+ Supports multi-class resampling. A one-vs.-rest scheme is used as
1346+ originally proposed in [1]_.
1347+
1348+ References
1349+ ----------
1350+ .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
1351+ synthetic minority over-sampling technique," Journal of artificial
1352+ intelligence research, 321-357, 2002.
1353+
1354+ Examples
1355+ --------
1356+ >>> import numpy as np
1357+ >>> X = np.array(["A"] * 10 + ["B"] * 20 + ["C"] * 30, dtype=object).reshape(-1, 1)
1358+ >>> y = np.array([0] * 20 + [1] * 40, dtype=np.int32)
1359+ >>> from collections import Counter
1360+ >>> print(f"Original class counts: {{Counter(y)}}")
1361+ Original class counts: Counter({{1: 40, 0: 20}})
1362+ >>> from imblearn.over_sampling import SMOTEN
1363+ >>> sampler = SMOTEN(random_state=0)
1364+ >>> X_res, y_res = sampler.fit_resample(X, y)
1365+ >>> print(f"Class counts after resampling {{Counter(y_res)}}")
1366+ Class counts after resampling Counter({{0: 40, 1: 40}})
1367+ """
+
     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
         X, y = self._validate_data(
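The ``@Substitution(...)`` decorator applied above is what fills the ``{sampling_strategy}``, ``{random_state}`` and ``{n_jobs}`` placeholders in the class docstring with shared parameter descriptions; it is also why literal braces in the doctest are doubled (``{{Counter(y)}}``). A minimal sketch of that mechanism, using a simplified stand-in rather than imblearn's actual helper:

# Hedged sketch: a simplified stand-in for the Substitution docstring decorator,
# shown only to illustrate the templating used above, not imblearn's real code.
class Substitution:
    def __init__(self, **params):
        self.params = params

    def __call__(self, obj):
        # format the docstring in place, replacing {placeholders} with the
        # shared parameter descriptions; literal braces must be doubled
        if obj.__doc__:
            obj.__doc__ = obj.__doc__.format(**self.params)
        return obj


_random_state_docstring = (
    "random_state : int, RandomState instance, default=None\n"
    "        Control the randomization of the algorithm."
)


@Substitution(random_state=_random_state_docstring)
class Example:
    """Toy estimator docstring.

    Parameters
    ----------
    {random_state}
    """


print(Example.__doc__)  # placeholder replaced by the shared description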
@@ -1315,9 +1383,12 @@ def _make_samples(self, X_class, klass, y_dtype, nn_indices, n_samples):
         samples_indices = random_state.choice(
             np.arange(X_class.shape[0]), size=n_samples, replace=True
         )
-        X_new = np.empty(shape=(n_samples, X_class.shape[1]), dtype=X_class.dtype)
-        for idx, sample_idx in enumerate(samples_indices):
-            X_new[idx, :] = stats.mode(X_class[nn_indices[sample_idx]], axis=0).mode
+        # for each drawn sample, select its k nearest neighbours and generate
+        # a new sample where, for each feature individually, the category is
+        # the most common one among the neighbours
+        X_new = np.squeeze(
+            stats.mode(X_class[nn_indices[samples_indices]], axis=1).mode, axis=1
+        )
         y_new = np.full(n_samples, fill_value=klass, dtype=y_dtype)
         return X_new, y_new
 
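For context, the vectorized branch above replaces the per-sample loop with a single ``scipy.stats.mode`` call over a 3-D array of gathered neighbours. A standalone sketch of that computation follows; the integer-coded toy data and the ``nn_indices`` array are invented for illustration, whereas in the estimator ``nn_indices`` comes from the fitted nearest-neighbours model.

# Hedged sketch of the vectorized, mode-based sample generation.
import numpy as np
from scipy import stats

rng = np.random.RandomState(0)

# 6 minority-class samples with 2 integer-coded categorical features
X_class = np.array([[0, 1], [0, 2], [1, 1], [0, 1], [2, 2], [0, 0]])

# hypothetical neighbour indices: row i lists the 3 nearest neighbours of sample i
nn_indices = np.array(
    [[0, 3, 5], [1, 4, 2], [2, 0, 3], [3, 0, 5], [4, 1, 2], [5, 0, 3]]
)
n_samples = 4

# draw, with replacement, the seed samples to synthesize from
samples_indices = rng.choice(np.arange(X_class.shape[0]), size=n_samples, replace=True)

# gather each drawn sample's neighbours: shape (n_samples, k, n_features)
neighbours = X_class[nn_indices[samples_indices]]

# per-feature mode across the k neighbours yields the synthetic samples;
# the reshape keeps this working whether or not SciPy keeps the reduced axis
X_new = np.asarray(stats.mode(neighbours, axis=1).mode).reshape(
    n_samples, X_class.shape[1]
)
print(X_new.shape)  # (4, 2)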