diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 448f4a15a..004e63a71 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -198,6 +198,15 @@ Therefore, it can be seen that the samples generated in the first and last columns are belonging to the same categories originally presented without any other extra interpolation. +Furthermore, if the dataset solely consists of categorical features one may use the :class:`SMOTEN` class. This class generates samples in an identical fashion to :class:`SMOTENC` - however - only categorical features are permitted. Each feature is treated as a categorical feature and therefore it is not advised to use `SMOTEN` for datasets that contain both categorical and continuous features:: + + >>> from imblearn.over_sampling import SMOTEN + >>> smote_n = SMOTEN(random_state=0) + >>> X[:, 1] = rng.randint(2, size=n_samples) + >>> X_resampled, y_resampled = smote_n.fit_resample(X, y) + >>> print(sorted(Counter(y_resampled).items())) + [(0, 30), (1, 30)] + .. topic:: References .. [HWB2005] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new diff --git a/doc/whats_new/v0.5.rst b/doc/whats_new/v0.5.rst index 2b892e6c1..28192c8af 100644 --- a/doc/whats_new/v0.5.rst +++ b/doc/whats_new/v0.5.rst @@ -27,6 +27,9 @@ Enhancement and issue template showing how to print system and dependency information from the command line. :issue:`557` by :user:`Alexander L. Hayes `. +- Add :class:`SMOTEN`. Add ability to use SMOTE on pure categorical features. + by :user:`Thomas Kluiters `. + + Parameters + ---------- + sampling_strategy : float, str, dict or callable, (default='auto') + Sampling information to resample the data set. + + - When ``float``, it corresponds to the desired ratio of the number of + samples in the minority class over the number of samples in the + majority class after resampling. 
Therefore, the ratio is expressed as + :math:`\\alpha_{os} = N_{rm} / N_{M}` where :math:`N_{rm}` is the + number of samples in the minority class after resampling and + :math:`N_{M}` is the number of samples in the majority class. + + .. warning:: + ``float`` is only available for **binary** classification. An + error is raised for multi-class classification. + + - When ``str``, specify the class targeted by the resampling. The + number of samples in the different classes will be equalized. + Possible choices are: + + ``'minority'``: resample only the minority class; + + ``'not minority'``: resample all classes but the minority class; + + ``'not majority'``: resample all classes but the majority class; + + ``'all'``: resample all classes; + + ``'auto'``: equivalent to ``'not majority'``. + + - When ``dict``, the keys correspond to the targeted classes. The + values correspond to the desired number of samples for each targeted + class. + + - When callable, function taking ``y`` and returns a ``dict``. The keys + correspond to the targeted classes. The values correspond to the + desired number of samples for each class. + + random_state : int, RandomState instance or None, optional (default=None) + Control the randomization of the algorithm. + + - If int, ``random_state`` is the seed used by the random number + generator; + - If ``RandomState`` instance, random_state is the random number + generator; + - If ``None``, the random number generator is the ``RandomState`` + instance used by ``np.random``. + + k_neighbors : int or object, optional (default=5) + If ``int``, number of nearest neighbours to use to construct synthetic + samples. If object, an estimator that inherits from + :class:`sklearn.neighbors.base.KNeighborsMixin` that will be used to + find the k_neighbors. + + n_jobs : int, optional (default=1) + The number of threads to open if possible. + + Notes + ----- + See the original paper [1]_ for more details. + + Supports multi-class resampling. 
A one-vs.-rest scheme is used as + originally proposed in [1]_. + + See + :ref:`sphx_glr_auto_examples_over-sampling_plot_comparison_over_sampling.py`, + and :ref:`sphx_glr_auto_examples_over-sampling_plot_smote.py`. + + See also + -------- + SMOTE : Over-sample using SMOTE. + + SVMSMOTE : Over-sample using SVM-SMOTE variant. + + BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. + + ADASYN : Over-sample using ADASYN. + + References + ---------- + .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: + synthetic minority over-sampling technique," Journal of artificial + intelligence research, 321-357, 2002. + + Examples + -------- + + >>> from collections import Counter + >>> from numpy.random import RandomState + >>> from sklearn.datasets import make_classification + >>> from imblearn.over_sampling import SMOTEN + >>> X, y = make_classification(n_classes=2, class_sep=2, + ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, + ... n_features=5, n_clusters_per_class=1, n_samples=1000, random_state=10) + >>> print('Original dataset shape (%s, %s)' % X.shape) + Original dataset shape (1000, 5) + >>> print('Original dataset samples in class 0: {}'.format(sum(y == 0))) + Original dataset samples in class 0: 100 + >>> X[:, ] = RandomState(10).randint(0, 4, size=(1000, 5)) + >>> sm = SMOTEN(random_state=42) + >>> X_res, y_res = sm.fit_resample(X, y) + >>> print('Resampled dataset samples in class 0: {}'.format(sum(y_res == 0))) + Resampled dataset samples in class 0: 900 + + """ + + def __init__(self, sampling_strategy='auto', kind='regular', + random_state=None, k_neighbors=5, n_jobs=1): + super(SMOTEN, self).__init__(categorical_features=[], + sampling_strategy=sampling_strategy, + random_state=random_state, + k_neighbors=k_neighbors, + n_jobs=n_jobs, + kind=kind) + + def _validate_estimator(self): + self.categorical_features = np.asarray(range(self.n_features_)) + self.continuous_features_ = np.asarray([]) + super(SMOTEN, 
self)._validate_estimator() + + def _decode(self, X, X_resampled): + X_unstacked = self.ohe_.inverse_transform(X_resampled) + if sparse.issparse(X): + X_unstacked = sparse.csr_matrix(X_unstacked) + return X_unstacked + + def _encode(self, X, y): + self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore', + dtype=np.float64) + # the input of the OneHotEncoder needs to be dense + return self.ohe_.fit_transform( + X.toarray() if sparse.issparse(X) + else X) + + diff --git a/imblearn/over_sampling/tests/test_smote_n.py b/imblearn/over_sampling/tests/test_smote_n.py new file mode 100644 index 000000000..a310cd188 --- /dev/null +++ b/imblearn/over_sampling/tests/test_smote_n.py @@ -0,0 +1,162 @@ +"""Test the module smoten.""" +# Authors: Guillaume Lemaitre +# Christos Aridas +# Dzianis Dudnik +# License: MIT + +from collections import Counter + +import pytest + +import numpy as np +from scipy import sparse + +from sklearn.datasets import make_classification +from sklearn.utils.testing import assert_allclose + +from imblearn.over_sampling import SMOTEN + + +def data_heterogneous_ordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 2), dtype=object) + # create a categorical feature using some string + X[:, 0] = rng.choice(['a', 'b', 'c'], size=30).astype(object) + # create a categorical feature using some integer + X[:, 1] = rng.randint(3, size=30) + y = np.array([0] * 10 + [1] * 20) + # return the categories + return X, y + + +def data_heterogneous_unordered(): + rng = np.random.RandomState(42) + X = np.empty((30, 2), dtype=object) + # create a categorical feature using some string + X[:, 0] = rng.choice(['a', 'b', 'c'], size=30).astype(object) + # create a categorical feature using some integer + X[:, 1] = rng.randint(3, size=30) + y = np.array([0] * 10 + [1] * 20) + # return the categories + return X, y + + +def data_heterogneous_unordered_multiclass(): + rng = np.random.RandomState(42) + X = np.empty((50, 2), dtype=object) + # create a categorical 
feature using some string + X[:, 0] = rng.choice(['a', 'b', 'c'], size=50).astype(object) + # create a categorical feature using some integer + X[:, 1] = rng.randint(3, size=50) + y = np.array([0] * 10 + [1] * 15 + [2] * 25) + # return the categories + return X, y + + +def data_sparse(format): + rng = np.random.RandomState(42) + X = np.empty((30, 2), dtype=np.float64) + # create a categorical feature using some string + X[:, 0] = rng.randint(3, size=30) + # create a categorical feature using some integer + X[:, 1] = rng.randint(3, size=30) + y = np.array([0] * 10 + [1] * 20) + X = sparse.csr_matrix(X) if format == 'csr' else sparse.csc_matrix(X) + return X, y + + +@pytest.mark.parametrize( + "data", + [data_heterogneous_ordered(), data_heterogneous_unordered(), + data_sparse('csr'), data_sparse('csc')] +) +def test_smoten(data): + X, y = data + smote = SMOTEN(random_state=0) + X_resampled, y_resampled = smote.fit_resample(X, y) + + assert X_resampled.dtype == X.dtype + + categorical_features = np.array([0, 1]) + if categorical_features.dtype == bool: + categorical_features = np.flatnonzero(categorical_features) + for cat_idx in categorical_features: + if sparse.issparse(X): + assert set(X[:, cat_idx].data) == set(X_resampled[:, cat_idx].data) + assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype + else: + assert set(X[:, cat_idx]) == set(X_resampled[:, cat_idx]) + assert X[:, cat_idx].dtype == X_resampled[:, cat_idx].dtype + + +# part of the common test which apply to SMOTE-N even if it is not default +# constructible +def test_smoten_check_target_type(): + X, _ = data_heterogneous_unordered() + y = np.linspace(0, 1, 30) + smote = SMOTEN(random_state=0) + with pytest.raises(ValueError, match="Unknown label type: 'continuous'"): + smote.fit_resample(X, y) + rng = np.random.RandomState(42) + y = rng.randint(2, size=(20, 3)) + with pytest.raises(ValueError, match="'y' should encode the multiclass"): + smote.fit_resample(X, y) + + +def 
test_smoten_samplers_one_label(): + X, _ = data_heterogneous_unordered() + y = np.zeros(30) + smote = SMOTEN(random_state=0) + with pytest.raises(ValueError, match='needs to have more than 1 class'): + smote.fit(X, y) + + +def test_smoten_fit(): + X, y = data_heterogneous_unordered() + smote = SMOTEN(random_state=0) + smote.fit_resample(X, y) + assert hasattr(smote, 'sampling_strategy_'), \ + "No fitted attribute sampling_strategy_" + + +def test_smoten_fit_resample(): + X, y = data_heterogneous_unordered() + target_stats = Counter(y) + smote = SMOTEN(random_state=0) + X_res, y_res = smote.fit_resample(X, y) + n_samples = max(target_stats.values()) + assert all(value >= n_samples for value in Counter(y_res).values()) + + +def test_smoten_fit_resample_sampling_strategy(): + X, y = data_heterogneous_unordered_multiclass() + expected_stat = Counter(y)[1] + smote = SMOTEN(random_state=0) + sampling_strategy = {2: 25, 0: 25} + smote.set_params(sampling_strategy=sampling_strategy) + X_res, y_res = smote.fit_resample(X, y) + assert Counter(y_res)[1] == expected_stat + + +def test_smoten_pandas(): + pd = pytest.importorskip("pandas") + # Check that the samplers handle pandas dataframe and pandas series + X, y = data_heterogneous_unordered_multiclass() + X_pd = pd.DataFrame(X) + smote = SMOTEN(random_state=0) + X_res_pd, y_res_pd = smote.fit_resample(X_pd, y) + X_res, y_res = smote.fit_resample(X, y) + assert X_res_pd.tolist() == X_res.tolist() + assert_allclose(y_res_pd, y_res) + + +def test_smoten_preserve_dtype(): + X, y = make_classification(n_samples=50, n_classes=3, n_informative=4, + weights=[0.2, 0.3, 0.5], random_state=0) + # Cast X and y to not default dtype + X = X.astype(np.float32) + y = y.astype(np.int32) + smote = SMOTEN(random_state=0) + X_res, y_res = smote.fit_resample(X, y) + assert X.dtype == X_res.dtype, "X dtype is not preserved" + assert y.dtype == y_res.dtype, "y dtype is not preserved" diff --git a/imblearn/utils/estimator_checks.py 
b/imblearn/utils/estimator_checks.py index 7d08f3313..70e0cfe45 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -33,8 +33,8 @@ from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss, ClusterCentroids -DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] -SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] +DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE', 'SMOTEN'] +SUPPORT_STRING = ['SMOTEN', 'RandomUnderSampler', 'RandomOverSampler'] HAVE_SAMPLE_INDICES = [ 'RandomOverSampler', 'RandomUnderSampler', 'InstanceHardnessThreshold', 'NearMiss', 'TomekLinks', 'EditedNearestNeighbours',