From 39dac14a125b4cb68b4b2a7a25b1493183db5d9f Mon Sep 17 00:00:00 2001 From: Gonenc Mogol Date: Tue, 14 Nov 2023 22:13:54 +0000 Subject: [PATCH 1/4] Update _instance_hardness_threshold.py Added the option to add a pipeline as an estimator for instance hardness threshold. Currently using a pipeline as an estimator fails because of the instance check. I think it's useful to be able to use pipelines as estimators and have thus added it. --- .../_prototype_selection/_instance_hardness_threshold.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 52d9280b6..1e41a8768 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -20,6 +20,7 @@ from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods from ..base import BaseUnderSampler +from sklearn.pipeline import Pipeline @Substitution( @@ -140,7 +141,9 @@ def _validate_estimator(self, random_state): if ( self.estimator is not None - and isinstance(self.estimator, ClassifierMixin) + and (isinstance(self.estimator, ClassifierMixin) or + isinstance(self.estimator, ClassifierMixin) ) + and hasattr(self.estimator, "predict_proba") ): self.estimator_ = clone(self.estimator) From 97d5470cdaa5cb2310eb45b4de41a5be906cb448 Mon Sep 17 00:00:00 2001 From: Gonenc Mogol Date: Tue, 14 Nov 2023 22:20:40 +0000 Subject: [PATCH 2/4] Update _instance_hardness_threshold.py fixed the silly mistake --- .../_prototype_selection/_instance_hardness_threshold.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 
1e41a8768..ca02c4c59 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -142,7 +142,7 @@ def _validate_estimator(self, random_state): if ( self.estimator is not None and (isinstance(self.estimator, ClassifierMixin) or - isinstance(self.estimator, ClassifierMixin) ) + isinstance(self.estimator, Pipeline) ) and hasattr(self.estimator, "predict_proba") ): From 5eb9e6cb42505f54150f6500ecae3c8a6c3f5a73 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Mar 2024 19:03:19 +0100 Subject: [PATCH 3/4] use is_classifier instead of mixin --- .../_instance_hardness_threshold.py | 7 ++----- .../tests/test_instance_hardness_threshold.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index ca02c4c59..dac3f3c33 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -10,7 +10,7 @@ from collections import Counter import numpy as np -from sklearn.base import ClassifierMixin, clone +from sklearn.base import clone, is_classifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble._base import _set_random_states from sklearn.model_selection import StratifiedKFold, cross_val_predict @@ -20,7 +20,6 @@ from ...utils._docstring import _n_jobs_docstring, _random_state_docstring from ...utils._param_validation import HasMethods from ..base import BaseUnderSampler -from sklearn.pipeline import Pipeline @Substitution( @@ -141,9 +140,7 @@ def _validate_estimator(self, random_state): if ( self.estimator is not None - and (isinstance(self.estimator, ClassifierMixin) or - isinstance(self.estimator, Pipeline) ) - + and 
is_classifier(self.estimator) and hasattr(self.estimator, "predict_proba") ): self.estimator_ = clone(self.estimator) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 5d7008747..a63bb45a0 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -6,6 +6,7 @@ import numpy as np from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.naive_bayes import GaussianNB as NB +from sklearn.pipeline import make_pipeline from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import InstanceHardnessThreshold @@ -93,3 +94,19 @@ def test_iht_fit_resample_default_estimator(): assert isinstance(iht.estimator_, RandomForestClassifier) assert X_resampled.shape == (12, 2) assert y_resampled.shape == (12,) + + +def test_iht_estimator_pipeline(): + """Check that we can pass a pipeline containing a classifier. + + Checking if we have a classifier should not be based on inheriting from + `ClassifierMixin`. 
+ + Non-regression test for: + https://github.com/scikit-learn-contrib/imbalanced-learn/pull/1049 + """ + model = make_pipeline(GradientBoostingClassifier(random_state=RND_SEED)) + iht = InstanceHardnessThreshold(estimator=model, random_state=RND_SEED) + X_resampled, y_resampled = iht.fit_resample(X, Y) + assert X_resampled.shape == (12, 2) + assert y_resampled.shape == (12,) From 080a37a4d114aaae67eeb8250f31b2d266346ae9 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Mar 2024 19:06:23 +0100 Subject: [PATCH 4/4] add changelog --- doc/whats_new/v0.12.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new/v0.12.rst b/doc/whats_new/v0.12.rst index 40bd1056e..6585c3b58 100644 --- a/doc/whats_new/v0.12.rst +++ b/doc/whats_new/v0.12.rst @@ -1,5 +1,20 @@ .. _changes_0_12: +Version 0.12.1 +============== + +**In progress** + +Changelog +--------- + +Bug fixes +......... + +- Fix a bug in :class:`~imblearn.under_sampling.InstanceHardnessThreshold` where + `estimator` could not be a :class:`~sklearn.pipeline.Pipeline` object. + :pr:`1049` by :user:`Gonenc Mogol <gmogol>`. + Version 0.12.0 ==============