# Authors: Guillaume Lemaitre <[email protected]>
# License: MIT

# %%
print(__doc__)
import seaborn as sns

sns.set_context("poster")

# %% [markdown]
# Create an imbalanced dataset
# ----------------------------
#
# First, we will create an imbalanced data set from the iris data set.

# %%
from sklearn.datasets import load_iris
from imblearn.datasets import make_imbalance

iris = load_iris(as_frame=True)

sampling_strategy = {0: 10, 1: 20, 2: 47}
X, y = make_imbalance(iris.data, iris.target, sampling_strategy=sampling_strategy)

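# %% [markdown]
# As a quick check (an aside, not part of the original example), we can also
# print the class counts before and after `make_imbalance`. Because the data
# were loaded with `as_frame=True`, both `iris.target` and `y` are pandas
# Series and expose `value_counts`.

# %%
print(f"Original class counts:\n{iris.target.value_counts()}\n")
print(f"Imbalanced class counts:\n{y.value_counts()}")
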
# %%
import matplotlib.pyplot as plt

fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
autopct = "%.2f"
iris.target.value_counts().plot.pie(autopct=autopct, ax=axs[0])
axs[0].set_title("Original")
y.value_counts().plot.pie(autopct=autopct, ax=axs[1])
axs[1].set_title("Imbalanced")
fig.tight_layout()

# %% [markdown]
# Using ``sampling_strategy`` in resampling algorithms
# ====================================================
#
# `sampling_strategy` as a `float`
# --------------------------------
#
# `sampling_strategy` can be given a `float`. For **under-sampling
# methods**, it corresponds to the ratio :math:`\\alpha_{us}` defined by
# :math:`\\alpha_{us} = N_{m} / N_{rM}` where :math:`N_{rM}` and
# :math:`N_{m}` are the number of samples in the majority class after
# resampling and the number of samples in the minority class, respectively.

# %%
import numpy as np

# select only 2 classes since the ratio makes sense in this case
binary_mask = np.bitwise_or(y == 0, y == 2)
binary_y = y[binary_mask]
binary_X = X[binary_mask]

# %%
from imblearn.under_sampling import RandomUnderSampler

sampling_strategy = 0.8
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(binary_X, binary_y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Under-sampling")

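# %% [markdown]
# As an illustrative check (an aside, not in the original example): with
# `sampling_strategy=0.8`, the minority class keeps its 10 samples and the
# majority class is under-sampled to roughly
# :math:`N_{m} / \\alpha_{us} = 10 / 0.8 \\approx 12` samples.

# %%
print(y_res.value_counts())
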
# %% [markdown]
# For **over-sampling methods**, it corresponds to the ratio
# :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} \\times N_{M}`
# where :math:`N_{rm}` and :math:`N_{M}` are the number of samples in the
# minority class after resampling and the number of samples in the majority
# class, respectively.

# %%
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(binary_X, binary_y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Over-sampling")

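# %% [markdown]
# Similarly (an illustrative aside, not in the original example): with the same
# `sampling_strategy=0.8`, the majority class keeps its 47 samples while the
# minority class is over-sampled to roughly
# :math:`\\alpha_{os} \\times N_{M} = 0.8 \\times 47 \\approx 37` samples.

# %%
print(y_res.value_counts())
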
# %% [markdown]
# `sampling_strategy` as a `str`
# ------------------------------
#
# `sampling_strategy` can be given as a string that specifies the class
# targeted by the resampling. With under- and over-sampling, the number of
# samples will be equalized.
#
# Note that we are using multiple classes from now on.

# %%
sampling_strategy = "not minority"

fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0])
axs[0].set_title("Under-sampling")

sampling_strategy = "not majority"
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1])
axs[1].set_title("Over-sampling")

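# %% [markdown]
# Other strings are accepted as well. As a brief aside (not part of the
# original example), `"minority"` asks an over-sampler to resample only the
# minority class; refer to the imbalanced-learn documentation for the full
# list of accepted strings such as `"majority"`, `"all"`, and `"auto"`.

# %%
ros = RandomOverSampler(sampling_strategy="minority")
X_res, y_res = ros.fit_resample(X, y)
print(y_res.value_counts())
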
# %% [markdown]
# With a **cleaning method**, the number of samples in each class will not be
# equalized, even if targeted.

# %%
from imblearn.under_sampling import TomekLinks

sampling_strategy = "not minority"
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Cleaning")

# %% [markdown]
# `sampling_strategy` as a `dict`
# -------------------------------
#
# When `sampling_strategy` is a `dict`, the keys correspond to the targeted
# classes. The values correspond to the desired number of samples for each
# targeted class. This works for both **under- and over-sampling**
# algorithms but not for the **cleaning algorithms**. Use a `list` instead.

# %%
fig, axs = plt.subplots(ncols=2, figsize=(10, 5))

sampling_strategy = {0: 10, 1: 15, 2: 20}
rus = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_res, y_res = rus.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[0])
axs[0].set_title("Under-sampling")

sampling_strategy = {0: 25, 1: 35, 2: 47}
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X, y)
y_res.value_counts().plot.pie(autopct=autopct, ax=axs[1])
axs[1].set_title("Over-sampling")

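# %% [markdown]
# To illustrate the note above (an aside, not in the original example),
# passing a `dict` to a cleaning method such as `TomekLinks` raises an error.

# %%
try:
    TomekLinks(sampling_strategy={0: 10}).fit_resample(X, y)
except ValueError as exc:
    print(f"ValueError: {exc}")
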
# %% [markdown]
# `sampling_strategy` as a `list`
# -------------------------------
#
# When `sampling_strategy` is a `list`, the list contains the targeted
# classes. It is used only for **cleaning methods** and raises an error
# otherwise.

# %%
sampling_strategy = [0, 1, 2]
tl = TomekLinks(sampling_strategy=sampling_strategy)
X_res, y_res = tl.fit_resample(X, y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
_ = ax.set_title("Cleaning")

# %% [markdown]
# `sampling_strategy` as a callable
# ---------------------------------
#
# When callable, the function takes `y` and returns a `dict`. The keys
# correspond to the targeted classes. The values correspond to the desired
# number of samples for each class.


# %%
def ratio_multiplier(y):
    from collections import Counter

    multiplier = {1: 0.7, 2: 0.95}
    target_stats = Counter(y)
    for key, value in target_stats.items():
        if key in multiplier:
            target_stats[key] = int(value * multiplier[key])
    return target_stats


X_res, y_res = RandomUnderSampler(sampling_strategy=ratio_multiplier).fit_resample(X, y)
ax = y_res.value_counts().plot.pie(autopct=autopct)
ax.set_title("Under-sampling")
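# %% [markdown]
# As a last illustrative aside (not in the original example), calling the
# function directly shows the target counts it requests: class 1 is reduced to
# `int(0.7 * 20) = 14` samples and class 2 to `int(0.95 * 47) = 44`, while
# class 0 is left untouched.

# %%
print(ratio_multiplier(y))
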
plt.show()