From f78273946f21cac4b065341bf5952fe4ada30549 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Feb 2021 12:12:13 +0100 Subject: [PATCH 1/6] FIX remove smoothed_bootstrap and use only shrinkage param --- doc/over_sampling.rst | 12 ++-- doc/whats_new/v0.7.rst | 4 +- .../plot_comparison_over_sampling.py | 4 +- .../over-sampling/plot_shrinkage_effect.py | 8 +-- .../over_sampling/_random_over_sampler.py | 71 +++++++++---------- .../tests/test_random_over_sampler.py | 40 ++++------- 6 files changed, 61 insertions(+), 78 deletions(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 623b61bf2..bf9a111a0 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -80,12 +80,12 @@ It would also work with pandas dataframe:: >>> df_resampled, y_resampled = ros.fit_resample(df_adult, y_adult) >>> df_resampled.head() # doctest: +SKIP -If repeating samples is an issue, the parameter `smoothed_bootstrap` can be -turned to `True` to create a smoothed bootstrap. However, the original data -needs to be numerical. The `shrinkage` parameter controls the dispersion of the -new generated samples. We show an example illustrate that the new samples are -not overlapping anymore once using a smoothed bootstrap. This ways of -generating smoothed bootstrap is also known a Random Over-Sampler Examples +If repeating samples is an issue, the parameter `shrinkage` allows to create a +smoothed bootstrap. However, the original data needs to be numerical. The +`shrinkage` parameter controls the dispersion of the new generated samples. We +show an example illustrate that the new samples are not overlapping anymore +once using a smoothed bootstrap. This ways of generating smoothed bootstrap is +also known a Random Over-Sampler Examples (ROSE) :cite:`torelli2014rose`. .. 
image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_003.png diff --git a/doc/whats_new/v0.7.rst b/doc/whats_new/v0.7.rst index 78502200f..06d725a56 100644 --- a/doc/whats_new/v0.7.rst +++ b/doc/whats_new/v0.7.rst @@ -74,8 +74,8 @@ Enhancements - Added an option to generate smoothed bootstrap in :class:`imblearn.over_sampling.RandomOverSampler`. It is controls by the - parameters `smoothed_bootstrap` and `shrinkage`. This method is also known as - Random Over-Sampling Examples (ROSE). + parameter `shrinkage`. This method is also known as Random Over-Sampling + Examples (ROSE). :pr:`754` by :user:`Andrea Lorenzon ` and :user:`Guillaume Lemaitre `. diff --git a/examples/over-sampling/plot_comparison_over_sampling.py b/examples/over-sampling/plot_comparison_over_sampling.py index 37c370c38..e0691cc1a 100644 --- a/examples/over-sampling/plot_comparison_over_sampling.py +++ b/examples/over-sampling/plot_comparison_over_sampling.py @@ -144,7 +144,7 @@ def plot_decision_function(X, y, clf, ax): ############################################################################### # By default, random over-sampling generates a bootstrap. The parameter -# `smoothed_bootstrap` allows adding a small perturbation to the generated data +# `shrinkage` allows adding a small perturbation to the generated data # to generate a smoothed bootstrap instead. The plot below shows the difference # between the two data generation strategies. 
@@ -152,7 +152,7 @@ def plot_decision_function(X, y, clf, ax): sampler = RandomOverSampler(random_state=0) plot_resampling(X, y, sampler, ax=axs[0]) axs[0].set_title("RandomOverSampler with normal bootstrap") -sampler = RandomOverSampler(smoothed_bootstrap=True, shrinkage=0.2, random_state=0) +sampler = RandomOverSampler(shrinkage=0.2, random_state=0) plot_resampling(X, y, sampler, ax=axs[1]) axs[1].set_title("RandomOverSampler with smoothed bootstrap") fig.tight_layout() diff --git a/examples/over-sampling/plot_shrinkage_effect.py b/examples/over-sampling/plot_shrinkage_effect.py index 14504cef5..3d2e9e41f 100644 --- a/examples/over-sampling/plot_shrinkage_effect.py +++ b/examples/over-sampling/plot_shrinkage_effect.py @@ -61,9 +61,9 @@ # from the majority class. Indeed, it is due to the fact that these samples # of the minority class are repeated during the bootstrap generation. # -# We can set `smoothed_bootstrap=True` to add a small perturbation to the +# We can set `shrinkage` to a floating value to add a small perturbation to the # samples created and therefore create a smoothed bootstrap. -sampler = RandomOverSampler(smoothed_bootstrap=True, random_state=0) +sampler = RandomOverSampler(shrinkage=1, random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) @@ -81,7 +81,7 @@ # # The parameter `shrinkage` allows to add more or less perturbation. Let's # add more perturbation when generating the smoothed bootstrap. -sampler = RandomOverSampler(smoothed_bootstrap=True, shrinkage=3, random_state=0) +sampler = RandomOverSampler(shrinkage=3, random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) @@ -96,7 +96,7 @@ # %% # Increasing the value of `shrinkage` will disperse the new samples. Forcing # the shrinkage to 0 will be equivalent to generating a normal bootstrap. 
-sampler = RandomOverSampler(smoothed_bootstrap=True, shrinkage=0, random_state=0) +sampler = RandomOverSampler(shrinkage=0, random_state=0) X_res, y_res = sampler.fit_resample(X, y) Counter(y_res) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 928e5d24d..8284c5b2d 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -37,20 +37,17 @@ class RandomOverSampler(BaseOverSampler): {random_state} - smoothed_bootstrap : bool, default=False - Whether or not to generate smoothed bootstrap samples. When this option - is triggered, be aware that the data to be resampled needs to be - numerical data since a Gaussian perturbation will be generated and - added to the bootstrap. - - .. versionadded:: 0.7 - - shrinkage : float or dict, default=1.0 - Factor to shrink the covariance matrix used to generate the - smoothed bootstrap. A factor could be shared by all classes by - providing a floating number or different for each class over-sampled - by providing a dictionary where the key are the class targeted and the - value is the shrinkage factor. + shrinkage : float or dict, default=None + Parameter controlling the shrinkage applied to the covariance matrix + when a smoothed bootstrap is generated. The options are: + + - if `None`, a normal bootstrap will be generated without perturbation. + It is equivalent to `shrinkage=0` as well; + - if a `float` is given, the shrinkage factor will be used for all + classes to generate the smoothed bootstrap; + - if a `dict` is given, the shrinkage factor will specific for each + class. The key correspond to the targeted class and the value is + the shrinkage factor. .. versionadded:: 0.7 @@ -63,7 +60,8 @@ class RandomOverSampler(BaseOverSampler): shrinkage_ : dict or None The per-class shrinkage factor used to generate the smoothed bootstrap - sample. `None` when `smoothed_bootstrap=False`. + sample. 
`None` when `shrinkage=None` meaning that a normal bootstrap + will be generated. .. versionadded:: 0.7 @@ -125,12 +123,10 @@ def __init__( *, sampling_strategy="auto", random_state=None, - smoothed_bootstrap=False, - shrinkage=1.0, + shrinkage=None, ): super().__init__(sampling_strategy=sampling_strategy) self.random_state = random_state - self.smoothed_bootstrap = smoothed_bootstrap self.shrinkage = shrinkage def _check_X_y(self, X, y): @@ -148,34 +144,35 @@ def _check_X_y(self, X, y): def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) - if self.smoothed_bootstrap: - if isinstance(self.shrinkage, Real): - self.shrinkage_ = { - klass: self.shrinkage for klass in self.sampling_strategy_ - } - else: - missing_shrinkage_keys = ( - self.sampling_strategy_.keys() - self.shrinkage.keys() + if self.shrinkage is None: + self.shrinkage_ = None + elif isinstance(self.shrinkage, Real): + self.shrinkage_ = { + klass: self.shrinkage for klass in self.sampling_strategy_ + } + else: + missing_shrinkage_keys = ( + self.sampling_strategy_.keys() - self.shrinkage.keys() + ) + if missing_shrinkage_keys: + raise ValueError( + f"`shrinkage` should contain a shrinkage factor for " + f"each class that will be resampled. The missing " + f"classes are: {repr(missing_shrinkage_keys)}" ) - if missing_shrinkage_keys: - raise ValueError( - f"`shrinkage` should contain a shrinkage factor for " - f"each class that will be resampled. 
The missing " - f"classes are: {repr(missing_shrinkage_keys)}" - ) - self.shrinkage_ = self.shrinkage + self.shrinkage_ = self.shrinkage + + if self.shrinkage_ is not None: # smoothed bootstrap imposes to make numerical operation; we need # to be sure to have only numerical data in X try: X = check_array(X, accept_sparse=["csr", "csc"], dtype="numeric") except ValueError as exc: raise ValueError( - "When smoothed_bootstrap=True, X needs to contain only " + "When shrinkage is not None, X needs to contain only " "numerical data to later generate a smoothed bootstrap " "sample." ) from exc - else: - self.shrinkage_ = None X_resampled = [X.copy()] y_resampled = [y.copy()] @@ -189,7 +186,7 @@ def _fit_resample(self, X, y): replace=True, ) sample_indices = np.append(sample_indices, bootstrap_indices) - if self.smoothed_bootstrap: + if self.shrinkage_ is not None: # generate a smoothed bootstrap with a perturbation n_samples, n_features = X.shape smoothing_constant = (4 / ((n_features + 2) * n_samples)) ** ( diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index fb448970a..c620cbd1e 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -43,10 +43,7 @@ def test_ros_init(): assert ros.random_state == RND_SEED -@pytest.mark.parametrize( - "params", - [{"smoothed_bootstrap": False}, {"smoothed_bootstrap": True, "shrinkage": 0}] -) +@pytest.mark.parametrize("params", [{"shrinkage": None}, {"shrinkage": 0}]) @pytest.mark.parametrize("X_type", ["array", "dataframe"]) def test_ros_fit_resample(X_type, data, params): X, Y = data @@ -80,16 +77,13 @@ def test_ros_fit_resample(X_type, data, params): assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - if not params["smoothed_bootstrap"]: + if params["shrinkage"] is None: assert ros.shrinkage_ is None else: assert ros.shrinkage_ == {0: 0} 
-@pytest.mark.parametrize( - "params", - [{"smoothed_bootstrap": False}, {"smoothed_bootstrap": True, "shrinkage": 0}] -) +@pytest.mark.parametrize("params", [{"shrinkage": None}, {"shrinkage": 0}]) def test_ros_fit_resample_half(data, params): X, Y = data sampling_strategy = {0: 3, 1: 7} @@ -115,16 +109,13 @@ def test_ros_fit_resample_half(data, params): assert_allclose(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) - if not params["smoothed_bootstrap"]: + if params["shrinkage"] is None: assert ros.shrinkage_ is None else: assert ros.shrinkage_ == {0: 0, 1: 0} -@pytest.mark.parametrize( - "params", - [{"smoothed_bootstrap": False}, {"smoothed_bootstrap": True, "shrinkage": 0}] -) +@pytest.mark.parametrize("params", [{"shrinkage": None}, {"shrinkage": 0}]) def test_multiclass_fit_resample(data, params): # check the random over-sampling with a multiclass problem X, Y = data @@ -138,7 +129,7 @@ def test_multiclass_fit_resample(data, params): assert count_y_res[1] == 5 assert count_y_res[2] == 5 - if not params["smoothed_bootstrap"]: + if params["shrinkage"] is None: assert ros.shrinkage_ is None else: assert ros.shrinkage_ == {0: 0, 2: 0} @@ -188,11 +179,8 @@ def test_random_over_sampling_heterogeneous_data_smoothed_bootstrap(): [["xxx", 1, 1.0], ["yyy", 2, 2.0], ["zzz", 3, 3.0]], dtype=object ) y = np.array([0, 0, 1]) - ros = RandomOverSampler( - smoothed_bootstrap=True, - random_state=RND_SEED, - ) - err_msg = "When smoothed_bootstrap=True, X needs to contain only numerical" + ros = RandomOverSampler(shrinkage=1, random_state=RND_SEED) + err_msg = "When shrinkage is not None, X needs to contain only numerical" with pytest.raises(ValueError, match=err_msg): ros.fit_resample(X_hetero, y) @@ -201,7 +189,7 @@ def test_random_over_sampling_heterogeneous_data_smoothed_bootstrap(): def test_random_over_sampler_smoothed_bootstrap(X_type, data): # check that smoothed bootstrap is working for numerical array X, y = data - sampler = 
RandomOverSampler(smoothed_bootstrap=True, shrinkage=1) + sampler = RandomOverSampler(shrinkage=1) X = _convert_container(X, X_type) X_res, y_res = sampler.fit_resample(X, y) @@ -217,10 +205,8 @@ def test_random_over_sampler_equivalence_shrinkage(data): # bootstrap X, y = data - ros_not_shrink = RandomOverSampler( - smoothed_bootstrap=True, shrinkage=0, random_state=0 - ) - ros_hard_bootstrap = RandomOverSampler(smoothed_bootstrap=False, random_state=0) + ros_not_shrink = RandomOverSampler(shrinkage=0, random_state=0) + ros_hard_bootstrap = RandomOverSampler(shrinkage=None, random_state=0) X_res_not_shrink, y_res_not_shrink = ros_not_shrink.fit_resample(X, y) X_res, y_res = ros_hard_bootstrap.fit_resample(X, y) @@ -240,7 +226,7 @@ def test_random_over_sampler_shrinkage_behaviour(data): # should also be larger. X, y = data - ros = RandomOverSampler(smoothed_bootstrap=True, shrinkage=1, random_state=0) + ros = RandomOverSampler(shrinkage=1, random_state=0) X_res_shink_1, y_res_shrink_1 = ros.fit_resample(X, y) ros.set_params(shrinkage=5) @@ -257,7 +243,7 @@ def test_random_over_sampler_shrinkage_error(data): # necessary information X, y = data shrinkage = {} - ros = RandomOverSampler(smoothed_bootstrap=True, shrinkage=shrinkage) + ros = RandomOverSampler(shrinkage=shrinkage) err_msg = "`shrinkage` should contain a shrinkage factor for each class" with pytest.raises(ValueError, match=err_msg): ros.fit_resample(X, y) From 4596caae70efb6c2e5395cbfa1795bac42a503ba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Feb 2021 12:22:40 +0100 Subject: [PATCH 2/6] TST check lower bound shrinkage --- .../over_sampling/_random_over_sampler.py | 23 +++++++++++++------ .../tests/test_random_over_sampler.py | 15 ++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 8284c5b2d..3b385b702 100644 --- a/imblearn/over_sampling/_random_over_sampler.py 
+++ b/imblearn/over_sampling/_random_over_sampler.py @@ -38,7 +38,7 @@ class RandomOverSampler(BaseOverSampler): {random_state} shrinkage : float or dict, default=None - Parameter controlling the shrinkage applied to the covariance matrix + Parameter controlling the shrinkage applied to the covariance matrix. when a smoothed bootstrap is generated. The options are: - if `None`, a normal bootstrap will be generated without perturbation. @@ -49,6 +49,9 @@ class RandomOverSampler(BaseOverSampler): class. The key correspond to the targeted class and the value is the shrinkage factor. + The value of the shrinkage parameter needs to be higher or equal + to 0. + .. versionadded:: 0.7 Attributes @@ -144,15 +147,16 @@ def _check_X_y(self, X, y): def _fit_resample(self, X, y): random_state = check_random_state(self.random_state) - if self.shrinkage is None: - self.shrinkage_ = None - elif isinstance(self.shrinkage, Real): + if isinstance(self.shrinkage, Real): self.shrinkage_ = { klass: self.shrinkage for klass in self.sampling_strategy_ } else: + self.shrinkage_ = self.shrinkage + + if self.shrinkage_ is not None: missing_shrinkage_keys = ( - self.sampling_strategy_.keys() - self.shrinkage.keys() + self.sampling_strategy_.keys() - self.shrinkage_.keys() ) if missing_shrinkage_keys: raise ValueError( @@ -160,9 +164,14 @@ def _fit_resample(self, X, y): f"each class that will be resampled. The missing " f"classes are: {repr(missing_shrinkage_keys)}" ) - self.shrinkage_ = self.shrinkage - if self.shrinkage_ is not None: + for klass, shrink_factor in self.shrinkage_.items(): + if shrink_factor < 0: + raise ValueError( + f"The shrinkage factor needs to be >= 0. " + f"Got {shrink_factor} for class {klass}." 
+ ) + # smoothed bootstrap imposes to make numerical operation; we need # to be sure to have only numerical data in X try: diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index c620cbd1e..34b43947e 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -238,12 +238,17 @@ def test_random_over_sampler_shrinkage_behaviour(data): assert disperstion_shrink_1 < disperstion_shrink_5 -def test_random_over_sampler_shrinkage_error(data): - # check that we raise proper error when shrinkage do not contain the - # necessary information +@pytest.mark.parametrize( + "shrinkage, err_msg", + [ + ({}, "`shrinkage` should contain a shrinkage factor for each class"), + (-1, "The shrinkage factor needs to be >= 0"), + ({0: -1}, "The shrinkage factor needs to be >= 0"), + ] +) +def test_random_over_sampler_shrinkage_error(data, shrinkage, err_msg): + # check the validation of the shrinkage parameter X, y = data - shrinkage = {} ros = RandomOverSampler(shrinkage=shrinkage) - err_msg = "`shrinkage` should contain a shrinkage factor for each class" with pytest.raises(ValueError, match=err_msg): ros.fit_resample(X, y) From 484e3731c7bcceb12c919d417be6a24547afd3ba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Feb 2021 13:30:08 +0100 Subject: [PATCH 3/6] Apply suggestions from code review Co-authored-by: Christos Aridas --- doc/over_sampling.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index bf9a111a0..9ad88097b 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -85,7 +85,7 @@ smoothed bootstrap. However, the original data needs to be numerical. The `shrinkage` parameter controls the dispersion of the new generated samples. 
We show an example illustrate that the new samples are not overlapping anymore once using a smoothed bootstrap. This ways of generating smoothed bootstrap is -also known a Random Over-Sampler Examples +also known as Random Over-Sampling Examples (ROSE) :cite:`torelli2014rose`. .. image:: ./auto_examples/over-sampling/images/sphx_glr_plot_comparison_over_sampling_003.png From c921eb70e71ce9fc0a600c88de522bfaadfaab77 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Feb 2021 13:44:54 +0100 Subject: [PATCH 4/6] iter --- imblearn/over_sampling/_random_over_sampler.py | 9 ++++++++- imblearn/over_sampling/tests/test_random_over_sampler.py | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 3b385b702..5ff20303b 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -4,6 +4,7 @@ # Christos Aridas # License: MIT +from collections.abc import Mapping from numbers import Real import numpy as np @@ -151,8 +152,14 @@ def _fit_resample(self, X, y): self.shrinkage_ = { klass: self.shrinkage for klass in self.sampling_strategy_ } - else: + elif self.shrinkage is None or isinstance(self.shrinkage, Mapping): self.shrinkage_ = self.shrinkage + else: + raise ValueError( + f"`shrinkage` should either be a positive floating number or " + f"a dictionary mapping a class to a positive floating number. " + f"Got {repr(self.shrinkage)} instead." 
+ ) if self.shrinkage_ is not None: missing_shrinkage_keys = ( diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 34b43947e..a30738d0a 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -244,6 +244,7 @@ def test_random_over_sampler_shrinkage_behaviour(data): ({}, "`shrinkage` should contain a shrinkage factor for each class"), (-1, "The shrinkage factor needs to be >= 0"), ({0: -1}, "The shrinkage factor needs to be >= 0"), + ([1, ], "`shrinkage` should either be a positive floating number or") ] ) def test_random_over_sampler_shrinkage_error(data, shrinkage, err_msg): From 6c7f0e37d1ad244e19146379e44cfd33c4f99b94 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Feb 2021 15:18:55 +0100 Subject: [PATCH 5/6] Update imblearn/over_sampling/_random_over_sampler.py Co-authored-by: Christos Aridas --- imblearn/over_sampling/_random_over_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 5ff20303b..27b0b934e 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -64,7 +64,7 @@ class RandomOverSampler(BaseOverSampler): shrinkage_ : dict or None The per-class shrinkage factor used to generate the smoothed bootstrap - sample. `None` when `shrinkage=None` meaning that a normal bootstrap + sample. When `shrinkage=None` a normal bootstrap will be generated. .. 
versionadded:: 0.7 From c5149b5d2cb9a56774b989efc4cef0dfaed25405 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Feb 2021 15:19:50 +0100 Subject: [PATCH 6/6] Update _random_over_sampler.py --- imblearn/over_sampling/_random_over_sampler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 27b0b934e..1801e258f 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -64,8 +64,7 @@ class RandomOverSampler(BaseOverSampler): shrinkage_ : dict or None The per-class shrinkage factor used to generate the smoothed bootstrap - sample. When `shrinkage=None` a normal bootstrap - will be generated. + sample. When `shrinkage=None` a normal bootstrap will be generated. .. versionadded:: 0.7