105 | 105 | # %% [markdown] |
106 | 106 | # Roughly Balanced Bagging |
107 | 107 | # ------------------------ |
108 | | -# FIXME: narration based on [3]_. |
| 108 | +# While a :class:`~imblearn.under_sampling.RandomUnderSampler` or a |
| 109 | +# :class:`~imblearn.over_sampling.RandomOverSampler` creates exactly the |
| 110 | +# desired number of samples, it does not follow the statistical spirit of |
| 111 | +# the bagging framework. The authors in [3]_ propose instead to draw the |
| 112 | +# number of majority samples to select from a negative binomial |
| 113 | +# distribution and then to perform a random under-sampling with that count. |
| 114 | +# |
| 115 | +# Here, we illustrate this method by implementing a function in charge of the |
| 116 | +# resampling and by using the :class:`~imblearn.FunctionSampler` to plug it |
| 117 | +# into a :class:`~imblearn.ensemble.BalancedBaggingClassifier`, which we then |
| 118 | +# evaluate with :class:`~sklearn.model_selection.cross_validate`. |
109 | 119 |
110 | 120 | # %% |
111 | 121 | from collections import Counter |
112 | 122 | import numpy as np |
113 | 123 | from imblearn import FunctionSampler |
114 | 124 |
115 | 125 |
116 | | -def binomial_resampling(X, y): |
| 126 | +def roughly_balanced_bagging(X, y, replace=False): |
| 127 | + """Implementation of Roughly Balanced Bagging for binary problem.""" |
| 128 | + # find the minority and majority classes |
117 | 129 | class_counts = Counter(y) |
118 | 130 | majority_class = max(class_counts, key=class_counts.get) |
119 | 131 | minority_class = min(class_counts, key=class_counts.get) |
120 | 132 |
| 133 | + # compute the number of samples to draw from the majority class using |
| 134 | + # a negative binomial distribution |
121 | 135 | n_minority_class = class_counts[minority_class] |
122 | | - n_majority_resampled = np.random.negative_binomial(n_minority_class, 0.5) |
| 136 | + n_majority_resampled = np.random.negative_binomial(n=n_minority_class, p=0.5) |
123 | 137 |
| 138 | + # draw randomly with or without replacement |
124 | 139 | majority_indices = np.random.choice( |
125 | 140 | np.flatnonzero(y == majority_class), |
126 | 141 | size=n_majority_resampled, |
127 | | - replace=True, |
| 142 | + replace=replace, |
128 | 143 | ) |
129 | 144 | minority_indices = np.random.choice( |
130 | 145 | np.flatnonzero(y == minority_class), |
131 | 146 | size=n_minority_class, |
132 | | - replace=True, |
| 147 | + replace=replace, |
133 | 148 | ) |
134 | 149 | indices = np.hstack([majority_indices, minority_indices]) |
135 | 150 |
136 | | - X_res, y_res = X[indices], y[indices] |
137 | | - return X_res, y_res |
| 151 | + return X[indices], y[indices] |
138 | 152 |
139 | 153 |
140 | 154 | # Roughly Balanced Bagging |
141 | | -rbb = BalancedBaggingClassifier(sampler=FunctionSampler(func=binomial_resampling)) |
| 155 | +rbb = BalancedBaggingClassifier( |
| 156 | + sampler=FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True}) |
| 157 | +) |
142 | 158 | cv_results = cross_validate(rbb, X, y, scoring="balanced_accuracy") |
143 | 159 |
144 | 160 | print(f"{cv_results['test_score'].mean():.3f} +/- {cv_results['test_score'].std():.3f}") |
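A quick note on why the result is only *roughly* balanced: with NumPy's parameterization, `negative_binomial(n, p)` draws the number of failures observed before `n` successes, whose mean is `n * (1 - p) / p`. With `p = 0.5` that mean equals `n`, so on average the function above keeps as many majority samples as there are minority samples, while the exact count still varies from one bootstrap to the next. A minimal sketch checking this empirically (the minority count of 50 and the 10 000 repetitions are arbitrary illustrative values):

```python
import numpy as np

rng = np.random.default_rng(0)

n_minority = 50  # illustrative minority-class count
# number of majority samples that would be kept in each of 10_000 hypothetical bootstraps
draws = rng.negative_binomial(n=n_minority, p=0.5, size=10_000)

print(f"mean number of majority samples kept: {draws.mean():.1f} (minority count: {n_minority})")
print(f"standard deviation of that count: {draws.std():.1f}")
```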
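It can also be instructive to call the resampling function once, outside of the ensemble, to see what a single roughly balanced bootstrap looks like. The sketch below assumes the `roughly_balanced_bagging` function from the commit above is defined in the session; the toy dataset and its 9:1 class ratio are arbitrary choices for illustration:

```python
from collections import Counter

from sklearn.datasets import make_classification
from imblearn import FunctionSampler

# toy imbalanced dataset, roughly 9:1
X_toy, y_toy = make_classification(n_samples=1_000, weights=[0.9, 0.1], random_state=0)

# wrap the function from the commit above, drawing with replacement as in the example
sampler = FunctionSampler(func=roughly_balanced_bagging, kw_args={"replace": True})
X_res, y_res = sampler.fit_resample(X_toy, y_toy)

print(f"class counts before resampling: {Counter(y_toy)}")
print(f"class counts after resampling:  {Counter(y_res)}")
```

Each call gives a slightly different number of majority samples, which is exactly the variability the negative binomial draw is meant to reintroduce into each bagging iteration.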