@@ -106,16 +106,15 @@ def plot_decision_function(X, y, clf, ax):
 # data using a linear SVM classifier. The greater the difference between the
 # number of samples in each class, the poorer the classification results.
 
-fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
+fig, axs = plt.subplots(2, 2, figsize=(15, 12))
 
-ax_arr = (ax1, ax2, ax3, ax4)
 weights_arr = (
     (0.01, 0.01, 0.98),
     (0.01, 0.05, 0.94),
     (0.2, 0.1, 0.7),
     (0.33, 0.33, 0.33),
 )
-for ax, weights in zip(ax_arr, weights_arr):
+for ax, weights in zip(axs.ravel(), weights_arr):
     X, y = create_dataset(n_samples=1000, weights=weights)
     clf = LinearSVC().fit(X, y)
     plot_decision_function(X, y, clf, ax)
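The loop above relies on the example's create_dataset helper, defined earlier
in the script. As a minimal, self-contained sketch of the same point, with
scikit-learn's make_classification standing in for that helper (an assumption,
not part of this diff): the balanced accuracy of a plain LinearSVC tends to
drop as the class weights become more skewed.

from collections import Counter

from sklearn.datasets import make_classification
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import LinearSVC

for weights in ((0.33, 0.33, 0.33), (0.01, 0.05, 0.94)):
    # Stand-in for the example's create_dataset helper.
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=3,
        n_clusters_per_class=1,
        weights=weights,
        random_state=0,
    )
    clf = LinearSVC(random_state=0).fit(X, y)
    # Training-set score, for illustration only.
    score = balanced_accuracy_score(y, clf.predict(X))
    print(Counter(y), f"-> balanced accuracy: {score:.2f}")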
@@ -129,20 +128,40 @@ def plot_decision_function(X, y, clf, ax):
 ###############################################################################
 # Random over-sampling can be used to repeat some samples and balance the
 # number of samples between the classes. It can be seen that with this trivial
-# approach the boundary decision is already less biaised toward the majority
+# approach the decision boundary is already less biased toward the majority
 # class.
 
-fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
+fig, axs = plt.subplots(1, 2, figsize=(15, 7))
 X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
 clf = LinearSVC().fit(X, y)
-plot_decision_function(X, y, clf, ax1)
-ax1.set_title(f"Linear SVC with y={Counter(y)}")
+plot_decision_function(X, y, clf, axs[0])
+axs[0].set_title(f"Linear SVC with y={Counter(y)}")
 pipe = make_pipeline(RandomOverSampler(random_state=0), LinearSVC())
 pipe.fit(X, y)
-plot_decision_function(X, y, pipe, ax2)
-ax2.set_title("Decision function for RandomOverSampler")
+plot_decision_function(X, y, pipe, axs[1])
+axs[1].set_title("Decision function for RandomOverSampler")
 fig.tight_layout()
 
+###############################################################################
+# By default, random over-sampling generates a bootstrap. The parameter
+# `smoothed_bootstrap` allows adding a small perturbation to the replicated
+# data, generating a smoothed bootstrap instead. The plot below shows the
+# difference between the two data generation strategies.
+
+fig, axs = plt.subplots(1, 2, figsize=(15, 7))
+sampler = RandomOverSampler(random_state=0)
+plot_resampling(X, y, sampler, ax=axs[0])
+axs[0].set_title("RandomOverSampler with normal bootstrap")
+sampler = RandomOverSampler(smoothed_bootstrap=True, shrinkage=0.2, random_state=0)
+plot_resampling(X, y, sampler, ax=axs[1])
+axs[1].set_title("RandomOverSampler with smoothed bootstrap")
+fig.tight_layout()
+
+###############################################################################
+# It looks like more samples are generated with the smoothed bootstrap. This
+# is because the generated samples are not superimposed on the original
+# samples.
+#
 ###############################################################################
 # More advanced over-sampling using ADASYN and SMOTE
 ###############################################################################
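The claim in the new comment block can be checked numerically. In the sketch
below, the smoothed_bootstrap and shrinkage arguments are taken verbatim from
this diff (other releases of imbalanced-learn may expose a different
signature): a plain bootstrap only repeats existing rows, so the number of
unique rows does not grow, while the smoothed bootstrap perturbs each copy, so
nearly every resampled row is distinct.

import numpy as np
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

X, y = make_classification(n_samples=200, weights=(0.9, 0.1), random_state=0)

# Plain bootstrap: repeated rows, no new unique points.
X_plain, _ = RandomOverSampler(random_state=0).fit_resample(X, y)
print("plain bootstrap:", X_plain.shape[0], "rows,",
      np.unique(X_plain, axis=0).shape[0], "unique")

# Smoothed bootstrap, with the argument names used in this diff.
X_smooth, _ = RandomOverSampler(
    smoothed_bootstrap=True, shrinkage=0.2, random_state=0
).fit_resample(X, y)
print("smoothed bootstrap:", X_smooth.shape[0], "rows,",
      np.unique(X_smooth, axis=0).shape[0], "unique")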
@@ -161,16 +180,15 @@ def _fit_resample(self, X, y):
         return X, y
 
 
-fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
+fig, axs = plt.subplots(2, 2, figsize=(15, 15))
 X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
 sampler = FakeSampler()
 clf = make_pipeline(sampler, LinearSVC())
-plot_resampling(X, y, sampler, ax1)
-ax1.set_title(f"Original data - y={Counter(y)}")
+plot_resampling(X, y, sampler, axs[0, 0])
+axs[0, 0].set_title(f"Original data - y={Counter(y)}")
 
-ax_arr = (ax2, ax3, ax4)
 for ax, sampler in zip(
-    ax_arr,
+    axs.ravel()[1:],
     (
         RandomOverSampler(random_state=0),
         SMOTE(random_state=0),
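The class counts produced by these samplers can also be inspected without
plotting. A short sketch, again assuming make_classification as a stand-in
for create_dataset: RandomOverSampler and SMOTE balance the counts exactly,
while ADASYN balances only approximately, since it adapts the number of
synthetic samples to how hard each region is to classify.

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

X, y = make_classification(
    n_samples=10000,
    n_classes=3,
    n_informative=3,
    n_clusters_per_class=1,
    weights=(0.01, 0.05, 0.94),
    random_state=0,
)
for sampler in (
    RandomOverSampler(random_state=0),
    SMOTE(random_state=0),
    ADASYN(random_state=0),
):
    _, y_res = sampler.fit_resample(X, y)
    print(sampler.__class__.__name__, sorted(Counter(y_res).items()))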
@@ -189,33 +207,32 @@ def _fit_resample(self, X, y):
 # nearest-neighbors rule, while regular SMOTE will not make any distinction.
 # Therefore, the decision function differs depending on the algorithm.
 
-fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
+fig, axs = plt.subplots(1, 3, figsize=(20, 6))
 X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
 
 clf = LinearSVC().fit(X, y)
-plot_decision_function(X, y, clf, ax1)
-ax1.set_title(f"Linear SVC with y={Counter(y)}")
+plot_decision_function(X, y, clf, axs[0])
+axs[0].set_title(f"Linear SVC with y={Counter(y)}")
 sampler = SMOTE()
 clf = make_pipeline(sampler, LinearSVC())
 clf.fit(X, y)
-plot_decision_function(X, y, clf, ax2)
-ax2.set_title(f"Decision function for {sampler.__class__.__name__}")
+plot_decision_function(X, y, clf, axs[1])
+axs[1].set_title(f"Decision function for {sampler.__class__.__name__}")
 sampler = ADASYN()
 clf = make_pipeline(sampler, LinearSVC())
 clf.fit(X, y)
-plot_decision_function(X, y, clf, ax3)
-ax3.set_title(f"Decision function for {sampler.__class__.__name__}")
+plot_decision_function(X, y, clf, axs[2])
+axs[2].set_title(f"Decision function for {sampler.__class__.__name__}")
 fig.tight_layout()
 
 ###############################################################################
 # These sampling particularities can give rise to some specific issues, as
 # illustrated below.
 
-fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
+fig, axs = plt.subplots(2, 2, figsize=(15, 15))
 X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)
 
-ax_arr = ((ax1, ax2), (ax3, ax4))
-for ax, sampler in zip(ax_arr, (SMOTE(random_state=0), ADASYN(random_state=0))):
+for ax, sampler in zip(axs, (SMOTE(random_state=0), ADASYN(random_state=0))):
     clf = make_pipeline(sampler, LinearSVC())
     clf.fit(X, y)
     plot_decision_function(X, y, clf, ax[0])
@@ -232,16 +249,11 @@ def _fit_resample(self, X, y):
 # the KMeans version will perform a clustering before generating samples in
 # each cluster independently, depending on each cluster's density.
 
-(
-    fig,
-    ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)),
-) = plt.subplots(5, 2, figsize=(15, 30))
+fig, axs = plt.subplots(5, 2, figsize=(15, 30))
 X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)
 
-
-ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10))
 for ax, sampler in zip(
-    ax_arr,
+    axs,
     (
         SMOTE(random_state=0),
         BorderlineSMOTE(random_state=0, kind="borderline-1"),
282294print (sorted (Counter (y_resampled ).items ()))
283295print ("SMOTE-NC will generate categories for the categorical features:" )
284296print (X_resampled [- 5 :])
285-
286- plt .show ()
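The prints in this last hunk depend on data built earlier in the example,
outside this diff. A self-contained reconstruction of the setup they assume;
the toy data here is hypothetical (string categories in column 0, a numerical
feature in column 1, integer categories in column 2):

from collections import Counter

import numpy as np

from imblearn.over_sampling import SMOTENC

rng = np.random.RandomState(42)
n_samples = 50
# Mixed-type data: columns 0 and 2 are categorical, column 1 is numerical.
X = np.empty((n_samples, 3), dtype=object)
X[:, 0] = rng.choice(["A", "B", "C"], size=n_samples)
X[:, 1] = rng.randn(n_samples)
X[:, 2] = rng.randint(3, size=n_samples)
y = np.array([0] * 20 + [1] * 30)

# Tell SMOTE-NC which columns hold categorical features.
smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))
print("SMOTE-NC will generate categories for the categorical features:")
print(X_resampled[-5:])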