add test not dependent on cupy

glemaitre · glemaitre · commit 964d082064a8 · 2022-01-16T12:15:33.000+01:00
diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py
@@ -154,7 +154,6 @@ def _validate_estimator(self):
         self.nn_m_ = check_neighbors_object(
             "m_neighbors", self.m_neighbors, additional_neighbor=1
         )
-        self.nn_m_.set_params(**{"n_jobs": self.n_jobs})
         if self.kind not in ("borderline-1", "borderline-2"):
             raise ValueError(
                 f'The possible "kind" of algorithm are '
@@ -382,7 +381,6 @@ def _validate_estimator(self):
         self.nn_m_ = check_neighbors_object(
             "m_neighbors", self.m_neighbors, additional_neighbor=1
         )
-        self.nn_m_.set_params(**{"n_jobs": self.n_jobs})
 
         if self.svm_estimator is None:
             self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state)
diff --git a/imblearn/over_sampling/_smote/tests/test_common.py b/imblearn/over_sampling/_smote/tests/test_common.py
@@ -0,0 +1,114 @@
+from collections import Counter
+
+import pytest
+import numpy as np
+
+from imblearn.over_sampling import (
+    BorderlineSMOTE,
+    KMeansSMOTE,
+    SMOTE,
+    SMOTEN,
+    SMOTENC,
+    SVMSMOTE,
+)
+from imblearn.utils.testing import CustomNearestNeighbors
+
+
+@pytest.fixture
+def numerical_data():
+    rng = np.random.RandomState(0)
+    X = rng.randn(100, 2)
+    y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5)
+
+    return X, y
+
+
+@pytest.fixture
+def categorical_data():
+    rng = np.random.RandomState(0)
+
+    feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30
+    feature_2 = ["A"] * 40 + ["B"] * 20
+    feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10
+    X = np.array([feature_1, feature_2, feature_3], dtype=object).T
+    rng.shuffle(X)
+    y = np.array([0] * 20 + [1] * 40, dtype=np.int32)
+    y_labels = np.array(["not apple", "apple"], dtype=object)
+    y = y_labels[y]
+    return X, y
+
+
+@pytest.fixture
+def heterogeneous_data():
+    rng = np.random.RandomState(42)
+    X = np.empty((30, 4), dtype=object)
+    X[:, :2] = rng.randn(30, 2)
+    X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object)
+    X[:, 3] = rng.randint(3, size=30)
+    y = np.array([0] * 10 + [1] * 20)
+    return X, y, [2, 3]
+
+
+@pytest.mark.parametrize(
+    "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"]
+)
+def test_smote_m_neighbors(numerical_data, smote):
+    # check that m_neighbors is properly set. Regression test for:
+    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568
+    X, y = numerical_data
+    _ = smote.fit_resample(X, y)
+    assert smote.nn_k_.n_neighbors == 6
+    assert smote.nn_m_.n_neighbors == 11
+
+
+@pytest.mark.parametrize(
+    "smote",
+    [
+        BorderlineSMOTE(random_state=0),
+        KMeansSMOTE(random_state=1),
+        SMOTE(random_state=0),
+        SVMSMOTE(random_state=0),
+    ],
+    ids=["borderline", "kmeans", "smote", "svm"],
+)
+def test_numerical_smote_k_custom_nn(numerical_data, smote):
+    X, y = numerical_data
+    smote.set_params(k_neighbors=CustomNearestNeighbors(n_neighbors=5))
+    X_res, y_res = smote.fit_resample(X, y)
+
+    assert X_res.shape == (120, 2)
+    assert Counter(y_res) == {0: 60, 1: 60}
+
+
+def test_categorical_smote_k_custom_nn(categorical_data):
+    X, y = categorical_data
+    smote = SMOTEN(k_neighbors=CustomNearestNeighbors(n_neighbors=5))
+    X_res, y_res = smote.fit_resample(X, y)
+
+    assert X_res.shape == (80, 3)
+    assert Counter(y_res) == {"apple": 40, "not apple": 40}
+
+
+def test_heterogeneous_smote_k_custom_nn(heterogeneous_data):
+    X, y, categorical_features = heterogeneous_data
+    smote = SMOTENC(
+        categorical_features, k_neighbors=CustomNearestNeighbors(n_neighbors=5)
+    )
+    X_res, y_res = smote.fit_resample(X, y)
+
+    assert X_res.shape == (40, 4)
+    assert Counter(y_res) == {0: 20, 1: 20}
+
+
+@pytest.mark.parametrize(
+    "smote",
+    [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)],
+    ids=["borderline", "svm"],
+)
+def test_numerical_smote_extra_custom_nn(numerical_data, smote):
+    X, y = numerical_data
+    smote.set_params(m_neighbors=CustomNearestNeighbors(n_neighbors=5))
+    X_res, y_res = smote.fit_resample(X, y)
+
+    assert X_res.shape == (120, 2)
+    assert Counter(y_res) == {0: 60, 1: 60}
diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py
@@ -4,15 +4,12 @@
 # License: MIT
 
 import numpy as np
-import pytest
 
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_array_equal
 from sklearn.neighbors import NearestNeighbors
 
 from imblearn.over_sampling import SMOTE
-from imblearn.over_sampling import SVMSMOTE
-from imblearn.over_sampling import BorderlineSMOTE
 
 
 RND_SEED = 0
@@ -153,54 +150,3 @@ def test_sample_regular_with_nn():
     )
     assert_allclose(X_resampled, X_gt, rtol=R_TOL)
     assert_array_equal(y_resampled, y_gt)
-
-
-@pytest.mark.parametrize(
-    "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"]
-)
-def test_smote_m_neighbors(smote):
-    # check that m_neighbors is properly set. Regression test for:
-    # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568
-    _ = smote.fit_resample(X, Y)
-    assert smote.nn_k_.n_neighbors == 6
-    assert smote.nn_m_.n_neighbors == 11
-
-
-def test_sample_cuml_with_nn():
-    cuml = pytest.importorskip("cuml")
-    nn_k = cuml.neighbors.NearestNeighbors(n_neighbors=2)
-    smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k)
-    X_resampled, y_resampled = smote.fit_resample(X, Y)
-    X_gt = np.array(
-        [
-            [0.11622591, -0.0317206],
-            [0.77481731, 0.60935141],
-            [1.25192108, -0.22367336],
-            [0.53366841, -0.30312976],
-            [1.52091956, -0.49283504],
-            [-0.28162401, -2.10400981],
-            [0.83680821, 1.72827342],
-            [0.3084254, 0.33299982],
-            [0.70472253, -0.73309052],
-            [0.28893132, -0.38761769],
-            [1.15514042, 0.0129463],
-            [0.88407872, 0.35454207],
-            [1.31301027, -0.92648734],
-            [-1.11515198, -0.93689695],
-            [-0.18410027, -0.45194484],
-            [0.9281014, 0.53085498],
-            [-0.14374509, 0.27370049],
-            [-0.41635887, -0.38299653],
-            [0.08711622, 0.93259929],
-            [1.70580611, -0.11219234],
-            [1.10580062, 0.00601499],
-            [1.60506454, -0.31959815],
-            [1.40109204, -0.74276846],
-            [0.38584956, -0.20702218],
-        ]
-    )
-    y_gt = np.array(
-        [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0]
-    )
-    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
-    assert_array_equal(y_resampled, y_gt)
diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py
@@ -13,6 +13,7 @@
 from pathlib import Path
 from re import compile
 
+from scipy import sparse
 from pytest import warns as _warns
 
 from sklearn.base import BaseEstimator
@@ -168,17 +169,23 @@ def warns(expected_warning, match=None):
 
 
 class CustomNearestNeighbors(BaseEstimator):
-    """Basic implementation of nearest neighbors not relying on scikit-learn."""
+    """Basic implementation of nearest neighbors not relying on scikit-learn.
 
-    def __init__(self, n_neighbors=1):
+    `kneighbors_graph` is ignored and `metric` does not have any impact.
+    """
+
+    def __init__(self, n_neighbors=1, metric="euclidean"):
         self.n_neighbors = n_neighbors
+        self.metric = metric
 
     def fit(self, X, y=None):
+        X = X.toarray() if sparse.issparse(X) else X
         self._kd_tree = KDTree(X)
         return self
 
     def kneighbors(self, X, n_neighbors=None, return_distance=True):
         n_neighbors = n_neighbors if n_neighbors is not None else self.n_neighbors
+        X = X.toarray() if sparse.issparse(X) else X
         distances, indices = self._kd_tree.query(X, k=n_neighbors)
         if return_distance:
             return distances, indices

Original file line number	Diff line number	Diff line change
`@@ -154,7 +154,6 @@ def _validate_estimator(self):`
`154`	`154`	`self.nn_m_ = check_neighbors_object(`
`155`	`155`	`"m_neighbors", self.m_neighbors, additional_neighbor=1`
`156`	`156`	`)`
`157`		`- self.nn_m_.set_params(**{"n_jobs": self.n_jobs})`
`158`	`157`	`if self.kind not in ("borderline-1", "borderline-2"):`
`159`	`158`	`raise ValueError(`
`160`	`159`	`f'The possible "kind" of algorithm are '`
`@@ -382,7 +381,6 @@ def _validate_estimator(self):`
`382`	`381`	`self.nn_m_ = check_neighbors_object(`
`383`	`382`	`"m_neighbors", self.m_neighbors, additional_neighbor=1`
`384`	`383`	`)`
`385`		`- self.nn_m_.set_params(**{"n_jobs": self.n_jobs})`
`386`	`384`
`387`	`385`	`if self.svm_estimator is None:`
`388`	`386`	`self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state)`