|
3 | 3 | Sample selection in NearMiss |
4 | 4 | ============================ |
5 | 5 |
|
6 | | -This example illustrates the different way of selecting example in NearMiss. |
7 | | -
|
| 6 | +This example illustrates the different ways of selecting examples in |
| 7 | +:class:`~imblearn.under_sampling.NearMiss`. |
8 | 8 | """ |
9 | 9 |
|
10 | 10 | # Authors: Guillaume Lemaitre <[email protected]> |
11 | 11 | # License: MIT |
12 | 12 |
|
13 | | -import matplotlib.pyplot as plt |
14 | | -import numpy as np |
| 13 | +# %% |
| 14 | +print(__doc__) |
15 | 15 |
|
16 | | -from sklearn.neighbors import NearestNeighbors |
| 16 | +import seaborn as sns |
17 | 17 |
|
18 | | -print(__doc__) |
| 18 | +sns.set_context("poster") |
19 | 19 |
|
20 | | -rng = np.random.RandomState(18) |
| 20 | +# %% [markdown] |
| 21 | +# We define a helper function that adds some nice decoration to the plot. |
21 | 22 |
|
22 | | -############################################################################### |
23 | | -# This function allows to make nice plotting |
| 23 | +# %% |
24 | 24 |
|
25 | 25 |
|
26 | 26 | def make_plot_despine(ax): |
27 | | - ax.spines["top"].set_visible(False) |
28 | | - ax.spines["right"].set_visible(False) |
29 | | - ax.get_xaxis().tick_bottom() |
30 | | - ax.get_yaxis().tick_left() |
31 | | - ax.spines["left"].set_position(("outward", 10)) |
32 | | - ax.spines["bottom"].set_position(("outward", 10)) |
33 | | - ax.set_xlim([0.0, 3.5]) |
34 | | - ax.set_ylim([0.0, 3.5]) |
| 27 | + sns.despine(ax=ax, offset=10) |
| 28 | + ax.set_xlim([0, 3.5]) |
| 29 | + ax.set_ylim([0, 3.5]) |
| 30 | + ax.set_xticks(np.arange(0, 3.6, 0.5)) |
| 31 | + ax.set_yticks(np.arange(0, 3.6, 0.5)) |
35 | 32 | ax.set_xlabel(r"$X_1$") |
36 | 33 | ax.set_ylabel(r"$X_2$") |
37 | | - ax.legend() |
| 34 | + ax.legend(loc="upper left") |
38 | 35 |
|
39 | 36 |
|
40 | | -############################################################################### |
| 37 | +# %% [markdown] |
41 | 38 | # We can start by generating some data to later illustrate the principle of |
42 | | -# each NearMiss heuritic rules. |
| 39 | +# each :class:`~imblearn.under_sampling.NearMiss` heuristic rule. |
| 40 | + |
| 41 | +# %% |
| 42 | +import numpy as np |
| 43 | + |
| 44 | +rng = np.random.RandomState(18) |
43 | 45 |
|
44 | | -# minority class |
45 | 46 | X_minority = np.transpose( |
46 | 47 | [[1.1, 1.3, 1.15, 0.8, 0.8, 0.6, 0.55], [1.0, 1.5, 1.7, 2.5, 2.0, 1.2, 0.55]] |
47 | 48 | ) |
48 | | -# majority class |
49 | 49 | X_majority = np.transpose( |
50 | 50 | [ |
51 | 51 | [2.1, 2.12, 2.13, 2.14, 2.2, 2.3, 2.5, 2.45], |
52 | 52 | [1.5, 2.1, 2.7, 0.9, 1.0, 1.4, 2.4, 2.9], |
53 | 53 | ] |
54 | 54 | ) |
55 | 55 |
|
56 | | -############################################################################### |
| 56 | +# %% [markdown] |
57 | 57 | # NearMiss-1 |
58 | | -############################################################################### |
59 | | - |
60 | | -############################################################################### |
| 58 | +# ---------- |
| 59 | +# |
61 | 60 | # NearMiss-1 selects samples from the majority class for which the average |
62 | 61 | # distance to some nearest neighbours is the smallest. In the following |
63 | 62 | # example, we use a 3-NN to compute the average distance on 2 specific samples |
64 | 63 | # of the majority class. Therefore, in this case the point linked by the |
65 | 64 | # green-dashed line will be selected since the average distance is smaller. |
66 | 65 |
|
67 | | -fig, ax = plt.subplots(1, 1, figsize=(6, 6)) |
| 66 | +# %% |
| 67 | +import matplotlib.pyplot as plt |
| 68 | +from sklearn.neighbors import NearestNeighbors |
| 69 | + |
| 70 | +fig, ax = plt.subplots(figsize=(8, 8)) |
68 | 71 | ax.scatter( |
69 | 72 | X_minority[:, 0], |
70 | 73 | X_minority[:, 1], |
@@ -99,18 +102,18 @@ def make_plot_despine(ax): |
99 | 102 | ax.set_title("NearMiss-1") |
100 | 103 | make_plot_despine(ax) |
101 | 104 |
|
102 | | -############################################################################### |
| 105 | +# %% [markdown] |
103 | 106 | # NearMiss-2 |
104 | | -############################################################################### |
105 | | - |
106 | | -############################################################################### |
| 107 | +# ---------- |
| 108 | +# |
107 | 109 | # NearMiss-2 selects samples from the majority class for which the average |
108 | 110 | # distance to the farthest neighbors is the smallest. With the same |
109 | 111 | # configuration as previously presented, the sample linked to the green-dashed |
110 | 112 | # line will be selected since its distance to the 3 farthest neighbors is the |
111 | 113 | # smallest. |
112 | 114 |
|
113 | | -fig, ax = plt.subplots(1, 1, figsize=(6, 6)) |
| 115 | +# %% |
| 116 | +fig, ax = plt.subplots(figsize=(8, 8)) |
114 | 117 | ax.scatter( |
115 | 118 | X_minority[:, 0], |
116 | 119 | X_minority[:, 1], |
@@ -147,17 +150,17 @@ def make_plot_despine(ax): |
147 | 150 | ax.set_title("NearMiss-2") |
148 | 151 | make_plot_despine(ax) |
149 | 152 |
|
150 | | -############################################################################### |
| 153 | +# %% [markdown] |
151 | 154 | # NearMiss-3 |
152 | | -############################################################################### |
153 | | - |
154 | | -############################################################################### |
| 155 | +# ---------- |
| 156 | +# |
155 | 157 | # NearMiss-3 can be divided into 2 steps. First, a nearest-neighbors search is |
156 | 158 | # used to short-list samples from the majority class (i.e. corresponding to the |
157 | 159 | # highlighted samples in the following plot). Then, the samples with the largest |
158 | 160 | # average distance to the *k* nearest-neighbors are selected. |
159 | 161 |
|
160 | | -fig, ax = plt.subplots(1, 1, figsize=(6, 6)) |
| 162 | +# %% |
| 163 | +fig, ax = plt.subplots(figsize=(8.5, 8.5)) |
161 | 164 | ax.scatter( |
162 | 165 | X_minority[:, 0], |
163 | 166 | X_minority[:, 1], |
|
0 commit comments