23 changes: 14 additions & 9 deletions azure-pipelines.yml
@@ -16,6 +16,20 @@ jobs:
./build_tools/circle/linting.sh
displayName: Run linting

- template: build_tools/azure/posix.yml
parameters:
name: Linux_Runs
vmImage: ubuntu-18.04
matrix:
pylatest_pip_openblas_pandas:
DISTRIB: 'conda-pip-latest'
PYTHON_VERSION: '3.9'
COVERAGE: 'true'
PANDAS_VERSION: '*'
TEST_DOCSTRINGS: 'true'
JOBLIB_VERSION: '*'
CHECK_WARNINGS: 'true'

- template: build_tools/azure/posix.yml
parameters:
name: Linux
@@ -29,15 +43,6 @@
DISTRIB: 'ubuntu'
PYTHON_VERSION: '3.6'
JOBLIB_VERSION: '*'
# Linux environment to test the latest available dependencies and MKL.
pylatest_pip_openblas_pandas:
DISTRIB: 'conda-pip-latest'
PYTHON_VERSION: '3.9'
COVERAGE: 'true'
PANDAS_VERSION: '*'
TEST_DOCSTRINGS: 'true'
JOBLIB_VERSION: '*'
CHECK_WARNINGS: 'true'
pylatest_conda_pandas_keras:
DISTRIB: 'conda'
PYTHON_VERSION: '3.7'
2 changes: 1 addition & 1 deletion build_tools/circle/linting.sh
@@ -140,7 +140,7 @@ else

check_files "$(echo "$MODIFIED_FILES" | grep -v ^examples)"
check_files "$(echo "$MODIFIED_FILES" | grep ^examples)" \
--config ./examples/.flake8
--config ./setup.cfg
fi
echo -e "No problem detected by flake8\n"

1 change: 0 additions & 1 deletion doc/api.rst
@@ -76,7 +76,6 @@ Prototype selection
over_sampling.SMOTE
over_sampling.SMOTENC
over_sampling.SVMSMOTE
over_sampling.ROSE


.. _combine_ref:
40 changes: 19 additions & 21 deletions doc/over_sampling.rst
@@ -80,6 +80,19 @@ It would also work with a pandas dataframe::
>>> df_resampled, y_resampled = ros.fit_resample(df_adult, y_adult)
>>> df_resampled.head() # doctest: +SKIP

If repeating samples is an issue, the parameter `smoothed_bootstrap` can be
set to `True` to create a smoothed bootstrap instead. However, the original
data needs to be numerical. The `shrinkage` parameter controls the dispersion
of the newly generated samples. The figure below illustrates that the new
samples are no longer overlapping once a smoothed bootstrap is used. This way
of generating a smoothed bootstrap is also known as Random Over-Sampling
Examples (ROSE) :cite:`torelli2014rose`.

.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_003.png
:target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
:scale: 60
:align: center
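
As a minimal sketch, the new option can be used as follows (hypothetical toy
data; only the `smoothed_bootstrap` and `shrinkage` parameters described
above are assumed)::

  >>> import numpy as np
  >>> from imblearn.over_sampling import RandomOverSampler
  >>> X_toy = np.array(
  ...     [[1.0, 1.0], [1.5, 1.2], [2.0, 0.8], [2.5, 1.1],
  ...      [10.0, 10.0], [10.5, 9.5]]
  ... )
  >>> y_toy = np.array([0, 0, 0, 0, 1, 1])
  >>> ros = RandomOverSampler(
  ...     smoothed_bootstrap=True, shrinkage=0.2, random_state=0
  ... )
  >>> X_toy_resampled, y_toy_resampled = ros.fit_resample(X_toy, y_toy)

The minority class is then completed with perturbed copies of its original
samples rather than with exact duplicates.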

.. _smote_adasyn:

From random over-sampling to SMOTE and ADASYN
@@ -104,7 +117,7 @@ the same manner::
The figure below illustrates the major difference between the different
over-sampling methods.

.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_003.png
.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_004.png
:target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
:scale: 60
:align: center
@@ -122,14 +135,14 @@ implementation of :class:`SMOTE` will not make any distinction between easy and
hard samples to be classified using the nearest neighbors rule. Therefore, the
decision function found during training will be different among the algorithms.

.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_004.png
.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_005.png
:target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
:align: center

The sampling particularities of these two algorithms can lead to some peculiar
behavior as shown below.

.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_005.png
.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_006.png
:target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
:scale: 60
:align: center
@@ -144,7 +157,7 @@ samples. Those methods focus on samples near the border of the optimal
decision function and will generate samples in the opposite direction of the
nearest neighbors class. Those variants are presented in the figure below.

.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_006.png
.. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_007.png
:target: ./auto_examples/over-sampling/plot_comparison_over_sampling.html
:scale: 60
:align: center
@@ -198,29 +211,14 @@ Therefore, it can be seen that the samples generated in the first and last
columns belong to the same categories originally presented without any
extra interpolation.

.. _rose:

ROSE (Random Over-Sampling Examples)
------------------------------------

ROSE uses smoothed bootstrapping to draw artificial samples from the
feature space neighborhood around selected classes, using a multivariate
Gaussian kernel around randomly selected samples. First, random samples are
selected from original classes. Then the smoothing kernel distribution
is computed around the samples: :math:`\hat f(x|y=Y_i) = \sum_i^{n_j}
p_i Pr(x|x_i)=\sum_i^{n_j} \frac{1}{n_j} Pr(x|x_i)=\sum_i^{n_j}
\frac{1}{n_j} K_{H_j}(x|x_i)`.

Then new samples are drawn from the computed distribution.

Mathematical formulation
========================

Sample generation
-----------------

Both SMOTE and ADASYN use the same algorithm to generate new samples.
Considering a sample :math:`x_i`, a new sample :math:`x_{new}` will be
Both :class:`SMOTE` and :class:`ADASYN` use the same algorithm to generate new
samples. Considering a sample :math:`x_i`, a new sample :math:`x_{new}` will be
generated considering its k nearest-neighbors (corresponding to
``k_neighbors``). For instance, the 3 nearest-neighbors are included in the
blue circle as illustrated in the figure below. Then, one of these
8 changes: 6 additions & 2 deletions doc/whats_new/v0.7.rst
@@ -72,8 +72,12 @@ Enhancements
- Lazy import `keras` module when importing `imblearn.keras`
:pr:`719` by :user:`Guillaume Lemaitre <glemaitre>`.

- Added Random Over-Sampling Examples (ROSE) class.
:pr:`754` by :user:`Andrea Lorenzon <andrealorenzon>`.
- Added an option to generate a smoothed bootstrap in
:class:`imblearn.over_sampling.RandomOverSampler`. It is controlled by the
parameters `smoothed_bootstrap` and `shrinkage`. This method is also known as
Random Over-Sampling Examples (ROSE).
:pr:`754` by :user:`Andrea Lorenzon <andrealorenzon>` and
:user:`Guillaume Lemaitre <glemaitre>`.

- Add option `output_dict` in
:func:`imblearn.metrics.classification_report_imbalanced` to return a
76 changes: 43 additions & 33 deletions examples/over-sampling/plot_comparison_over_sampling.py
@@ -106,16 +106,15 @@ def plot_decision_function(X, y, clf, ax):
# data using a linear SVM classifier. The greater the difference between the
# number of samples in each class, the poorer the classification results.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
fig, axs = plt.subplots(2, 2, figsize=(15, 12))

ax_arr = (ax1, ax2, ax3, ax4)
weights_arr = (
(0.01, 0.01, 0.98),
(0.01, 0.05, 0.94),
(0.2, 0.1, 0.7),
(0.33, 0.33, 0.33),
)
for ax, weights in zip(ax_arr, weights_arr):
for ax, weights in zip(axs.ravel(), weights_arr):
X, y = create_dataset(n_samples=1000, weights=weights)
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax)
@@ -129,20 +128,40 @@ def plot_decision_function(X, y, clf, ax):
###############################################################################
# Random over-sampling can be used to repeat some samples and balance the
# number of samples between the classes. It can be seen that with this trivial
# approach the boundary decision is already less biaised toward the majority
# approach the boundary decision is already less biased toward the majority
# class.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
fig, axs = plt.subplots(1, 2, figsize=(15, 7))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title(f"Linear SVC with y={Counter(y)}")
plot_decision_function(X, y, clf, axs[0])
axs[0].set_title(f"Linear SVC with y={Counter(y)}")
pipe = make_pipeline(RandomOverSampler(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title("Decision function for RandomOverSampler")
plot_decision_function(X, y, pipe, axs[1])
axs[1].set_title("Decision function for RandomOverSampler")
fig.tight_layout()

###############################################################################
# By default, random over-sampling generates a bootstrap. The parameter
# `smoothed_bootstrap` allows adding a small perturbation to the resampled
# data in order to generate a smoothed bootstrap instead. The plot below
# shows the difference between the two data generation strategies.

fig, axs = plt.subplots(1, 2, figsize=(15, 7))
sampler = RandomOverSampler(random_state=0)
plot_resampling(X, y, sampler, ax=axs[0])
axs[0].set_title("RandomOverSampler with normal bootstrap")
sampler = RandomOverSampler(smoothed_bootstrap=True, shrinkage=0.2, random_state=0)
plot_resampling(X, y, sampler, ax=axs[1])
axs[1].set_title("RandomOverSampler with smoothed bootstrap")
fig.tight_layout()

###############################################################################
# More samples appear to be generated with the smoothed bootstrap. This is
# because the generated samples do not superimpose with the original samples,
# as they do with a plain bootstrap.
#
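# As a hypothetical sanity check (not part of the original example), we can
# count the distinct resampled points: a plain bootstrap only repeats existing
# rows, while the smoothed bootstrap perturbs them.
X_res, _ = RandomOverSampler(random_state=0).fit_resample(X, y)
print("distinct points (bootstrap):", len({tuple(row) for row in X_res}))
X_res, _ = RandomOverSampler(
    smoothed_bootstrap=True, shrinkage=0.2, random_state=0
).fit_resample(X, y)
print("distinct points (smoothed):", len({tuple(row) for row in X_res}))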
###############################################################################
# More advanced over-sampling using ADASYN and SMOTE
###############################################################################
@@ -161,16 +180,15 @@ def _fit_resample(self, X, y):
return X, y


fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))
sampler = FakeSampler()
clf = make_pipeline(sampler, LinearSVC())
plot_resampling(X, y, sampler, ax1)
ax1.set_title(f"Original data - y={Counter(y)}")
plot_resampling(X, y, sampler, axs[0, 0])
axs[0, 0].set_title(f"Original data - y={Counter(y)}")

ax_arr = (ax2, ax3, ax4)
for ax, sampler in zip(
ax_arr,
axs.ravel()[1:],
(
RandomOverSampler(random_state=0),
SMOTE(random_state=0),
@@ -189,33 +207,32 @@ def _fit_resample(self, X, y):
# nearest-neighbors rule while regular SMOTE will not make any distinction.
# Therefore, the decision function differs depending on the algorithm.

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
fig, axs = plt.subplots(1, 3, figsize=(20, 6))
X, y = create_dataset(n_samples=10000, weights=(0.01, 0.05, 0.94))

clf = LinearSVC().fit(X, y)
plot_decision_function(X, y, clf, ax1)
ax1.set_title(f"Linear SVC with y={Counter(y)}")
plot_decision_function(X, y, clf, axs[0])
axs[0].set_title(f"Linear SVC with y={Counter(y)}")
sampler = SMOTE()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax2)
ax2.set_title(f"Decision function for {sampler.__class__.__name__}")
plot_decision_function(X, y, clf, axs[1])
axs[1].set_title(f"Decision function for {sampler.__class__.__name__}")
sampler = ADASYN()
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax3)
ax3.set_title(f"Decision function for {sampler.__class__.__name__}")
plot_decision_function(X, y, clf, axs[2])
axs[2].set_title(f"Decision function for {sampler.__class__.__name__}")
fig.tight_layout()

###############################################################################
# Due to those sampling particularities, these methods can give rise to some
# specific issues as illustrated below.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 15))
fig, axs = plt.subplots(2, 2, figsize=(15, 15))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)

ax_arr = ((ax1, ax2), (ax3, ax4))
for ax, sampler in zip(ax_arr, (SMOTE(random_state=0), ADASYN(random_state=0))):
for ax, sampler in zip(axs, (SMOTE(random_state=0), ADASYN(random_state=0))):
clf = make_pipeline(sampler, LinearSVC())
clf.fit(X, y)
plot_decision_function(X, y, clf, ax[0])
@@ -232,16 +249,11 @@ def _fit_resample(self, X, y):
# the KMeans version will perform a clustering before generating samples in
# each cluster independently, depending on each cluster's density.

(
fig,
((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)),
) = plt.subplots(5, 2, figsize=(15, 30))
fig, axs = plt.subplots(5, 2, figsize=(15, 30))
X, y = create_dataset(n_samples=5000, weights=(0.01, 0.05, 0.94), class_sep=0.8)


ax_arr = ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10))
for ax, sampler in zip(
ax_arr,
axs,
(
SMOTE(random_state=0),
BorderlineSMOTE(random_state=0, kind="borderline-1"),
@@ -282,5 +294,3 @@ def _fit_resample(self, X, y):
print(sorted(Counter(y_resampled).items()))
print("SMOTE-NC will generate categories for the categorical features:")
print(X_resampled[-5:])

plt.show()