-import numpy as np
+from collections import Counter
+
 import pytest
-from sklearn.neighbors import NearestNeighbors
+from sklearn.datasets import make_classification
+from sklearn.linear_model import LogisticRegression
 from sklearn.utils._testing import assert_allclose, assert_array_equal
 
 from imblearn.over_sampling import BorderlineSMOTE
 
 
-@pytest.fixture
-def data():
-    X = np.array(
-        [
-            [0.11622591, -0.0317206],
-            [0.77481731, 0.60935141],
-            [1.25192108, -0.22367336],
-            [0.53366841, -0.30312976],
-            [1.52091956, -0.49283504],
-            [-0.28162401, -2.10400981],
-            [0.83680821, 1.72827342],
-            [0.3084254, 0.33299982],
-            [0.70472253, -0.73309052],
-            [0.28893132, -0.38761769],
-            [1.15514042, 0.0129463],
-            [0.88407872, 0.35454207],
-            [1.31301027, -0.92648734],
-            [-1.11515198, -0.93689695],
-            [-0.18410027, -0.45194484],
-            [0.9281014, 0.53085498],
-            [-0.14374509, 0.27370049],
-            [-0.41635887, -0.38299653],
-            [0.08711622, 0.93259929],
-            [1.70580611, -0.11219234],
-        ]
+@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
+def test_borderline_smote_no_in_danger_samples(kind):
+    """Check that the algorithm behaves properly even on a dataset without any
+    sample in danger.
+    """
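+    # With class_sep=1.5 the classes are well separated: no minority sample has
+    # enough majority neighbors to be flagged as "in danger", so resampling is
+    # expected to leave the dataset untouched.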
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=1.5,
+        random_state=1,
     )
-    y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
-    return X, y
+    smote = BorderlineSMOTE(kind=kind, m_neighbors=3, k_neighbors=5, random_state=0)
+    X_res, y_res = smote.fit_resample(X, y)
 
+    assert_allclose(X, X_res)
+    assert_allclose(y, y_res)
+    assert not smote.in_danger_indices
 
-@pytest.mark.parametrize("kind", ["borderline-1", "borderline-2"])
-def test_borderline_smote(kind, data):
-    bsmote = BorderlineSMOTE(kind=kind, random_state=42)
-    bsmote_nn = BorderlineSMOTE(
-        kind=kind,
-        random_state=42,
-        k_neighbors=NearestNeighbors(n_neighbors=6),
-        m_neighbors=NearestNeighbors(n_neighbors=11),
+
+def test_borderline_smote_kind():
+    """Check the behaviour of the `kind` parameter.
+
+    In short, "borderline-2" generates samples closer to the decision boundary
+    than "borderline-1". We generate an example where a logistic regression
+    will perform worse on the data resampled with "borderline-2" than with
+    "borderline-1".
+    """
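+    # "borderline-2", unlike "borderline-1", also interpolates in-danger samples
+    # with neighbors from other classes, which pulls the synthetic samples
+    # toward the decision boundary.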
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=1.0,
+        random_state=1,
+    )
+    smote = BorderlineSMOTE(
+        kind="borderline-1", m_neighbors=9, k_neighbors=5, random_state=0
     )
+    X_res_borderline_1, y_res_borderline_1 = smote.fit_resample(X, y)
+    smote.set_params(kind="borderline-2")
+    X_res_borderline_2, y_res_borderline_2 = smote.fit_resample(X, y)
 
-    X_res_1, y_res_1 = bsmote.fit_resample(*data)
-    X_res_2, y_res_2 = bsmote_nn.fit_resample(*data)
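+    # The training accuracy of a logistic regression is used as a proxy for
+    # separability: synthetic samples generated closer to the decision boundary
+    # are harder to classify, so "borderline-2" is expected to score lower.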
+    score_borderline_1 = (
+        LogisticRegression()
+        .fit(X_res_borderline_1, y_res_borderline_1)
+        .score(X_res_borderline_1, y_res_borderline_1)
+    )
+    score_borderline_2 = (
+        LogisticRegression()
+        .fit(X_res_borderline_2, y_res_borderline_2)
+        .score(X_res_borderline_2, y_res_borderline_2)
+    )
+    assert score_borderline_1 > score_borderline_2
+
+
+def test_borderline_smote_in_danger():
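+    """Check that both kinds flag the same samples as "in danger" and that the
+    resampled dataset is balanced in both cases.
+    """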
+    X, y = make_classification(
+        n_samples=500,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_repeated=0,
+        n_clusters_per_class=1,
+        n_classes=3,
+        weights=[0.1, 0.2, 0.7],
+        class_sep=0.8,
+        random_state=1,
+    )
+    smote = BorderlineSMOTE(
+        kind="borderline-1",
+        m_neighbors=9,
+        k_neighbors=5,
+        random_state=0,
+    )
+    _, y_res_1 = smote.fit_resample(X, y)
+    in_danger_indices_borderline_1 = smote.in_danger_indices
+    smote.set_params(kind="borderline-2")
+    _, y_res_2 = smote.fit_resample(X, y)
+    in_danger_indices_borderline_2 = smote.in_danger_indices
 
-    assert_allclose(X_res_1, X_res_2)
-    assert_array_equal(y_res_1, y_res_2)
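+    # Both kinds share the same danger-detection step (controlled by
+    # m_neighbors); only the interpolation of new samples differs, so the
+    # flagged indices should be identical for each class.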
+    for key1, key2 in zip(
+        in_danger_indices_borderline_1, in_danger_indices_borderline_2
+    ):
+        assert_array_equal(
+            in_danger_indices_borderline_1[key1], in_danger_indices_borderline_2[key2]
+        )
+    assert len(in_danger_indices_borderline_1) == len(in_danger_indices_borderline_2)
+    counter = Counter(y_res_1)
+    assert counter[0] == counter[1] == counter[2]
+    counter = Counter(y_res_2)
+    assert counter[0] == counter[1] == counter[2]