105 | 105 | # %% [markdown] |
106 | 106 | # Roughly Balanced Bagging |
107 | 107 | # ------------------------ |
108 | | -# FIXME: narration based on [3]_. |
| 108 | +# While a :class:`~imblearn.under_sampling.RandomUnderSampler` or a |
| 109 | +# :class:`~imblearn.over_sampling.RandomOverSampler` creates exactly the |
| 110 | +# desired number of samples, it does not follow the statistical spirit of |
| 111 | +# the bagging framework. The authors in [3]_ propose instead to draw the |
| 112 | +# number of majority samples to select from a negative binomial |
| 113 | +# distribution and then to perform a random under-sampling with that count. |
| 114 | +# |
| 115 | +# Here, we illustrate this method by implementing a function in charge of the |
| 116 | +# resampling and by using the :class:`~imblearn.FunctionSampler` to plug it |
| 117 | +# into a :class:`~imblearn.ensemble.BalancedBaggingClassifier`, which we then |
| 118 | +# evaluate with :class:`~sklearn.model_selection.cross_validate`. |
109 | 119 |
110 | 120 | # %% |
111 | 121 | from collections import Counter |
112 | 122 | import numpy as np |
113 | 123 | from imblearn import FunctionSampler |
114 | 124 |
115 | 125 |
116 | | -def binomial_resampling(X, y): |
| 126 | +def roughly_balanced_bagging(X, y, replace=False): |
| 127 | + """Implementation of Roughly Balanced Bagging for binary problem.""" |
| 128 | + # find the minority and majority classes |
117 | 129 | class_counts = Counter(y) |
118 | 130 | majority_class = max(class_counts, key=class_counts.get) |
119 | 131 | minority_class = min(class_counts, key=class_counts.get) |
120 | 132 |
| 133 | + # compute the number of samples to draw from the majority class using |
| 134 | + # a negative binomial distribution |
121 | 135 | n_minority_class = class_counts[minority_class] |
122 | | - n_majority_resampled = np.random.negative_binomial(n_minority_class, 0.5) |
| 136 | + n_majority_resampled = np.random.negative_binomial(n=n_minority_class, p=0.5) |
123 | 137 |
| 138 | + # draw randomly with or without replacement |
124 | 139 | majority_indices = np.random.choice( |
125 | 140 | np.flatnonzero(y == majority_class), |
126 | 141 | size=n_majority_resampled, |
127 | | - replace=True, |
| 142 | + replace=replace, |
128 | 143 | ) |
129 | 144 | minority_indices = np.random.choice( |
130 | 145 | np.flatnonzero(y == minority_class), |
131 | 146 | size=n_minority_class, |
132 | | - replace=True, |
| 147 | + replace=replace, |
133 | 148 | ) |
134 | 149 | indices = np.hstack([majority_indices, minority_indices]) |
135 | 150 |
136 | | - X_res, y_res = X[indices], y[indices] |
137 | | - return X_res, y_res |
| 151 | + return X[indices], y[indices] |
138 | 152 |
139 | 153 |
140 | 154 | # Roughly Balanced Bagging |
141 | | -rbb = BalancedBaggingClassifier(sampler=FunctionSampler(func=binomial_resampling)) |
| 155 | +rbb = BalancedBaggingClassifier( |
| 156 | + sampler=FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True}) |
| 157 | +) |
142 | 158 | cv_results = cross_validate(rbb, X, y, scoring="balanced_accuracy") |
143 | 159 |
144 | 160 | print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") |
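A quick note on why the result is only *roughly* balanced: with NumPy's parameterization, `negative_binomial(n, p)` draws the number of failures observed before `n` successes, whose mean is `n * (1 - p) / p`. With `p = 0.5` that mean equals `n`, so on average the function above keeps as many majority samples as there are minority samples, while the exact count still varies from one bootstrap to the next. A minimal sketch checking this empirically (the minority count of 50 and the 10 000 repetitions are arbitrary illustrative values):

```python
import numpy as np

rng = np.random.default_rng(0)

n_minority = 50  # illustrative minority-class count
# number of majority samples that would be kept in each of 10_000 hypothetical bootstraps
draws = rng.negative_binomial(n=n_minority, p=0.5, size=10_000)

print(f"mean number of majority samples kept: {draws.mean():.1f} (minority count: {n_minority})")
print(f"standard deviation of that count: {draws.std():.1f}")
```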
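It can also be instructive to call the resampling function once, outside of the ensemble, to see what a single roughly balanced bootstrap looks like. The sketch below assumes the `roughly_balanced_bagging` function from the commit above is defined in the session; the toy dataset and its 9:1 class ratio are arbitrary choices for illustration:

```python
from collections import Counter

from sklearn.datasets import make_classification
from imblearn import FunctionSampler

# toy imbalanced dataset, roughly 9:1
X_toy, y_toy = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)

# wrap the function from the commit above, drawing with replacement as in the example
sampler = FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True})
X_res, y_res = sampler.fit_resample(X_toy, y_toy)

print(f"class counts before resampling: {Counter(y_toy)}")
print(f"class counts after resampling:  {Counter(y_res)}")
```

Each call gives a slightly different number of majority samples, which is exactly the variability the negative binomial draw is meant to reintroduce into each bagging iteration.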