From 1476dab125611dfcab4b3d11bb602855592779c4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 00:05:03 +0100 Subject: [PATCH 1/8] MAINT validate parameters for public functions --- imblearn/metrics/_classification.py | 125 +++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 20 deletions(-) diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 797fb56a8..3504a29a7 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -15,6 +15,7 @@ # License: MIT import functools +import numbers import warnings from inspect import signature @@ -26,7 +27,23 @@ from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_consistent_length, column_or_1d +from ..utils._param_validation import Interval, StrOptions, validate_params + +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "pos_label": [str, numbers.Integral, None], + "average": [ + None, + StrOptions({"binary", "micro", "macro", "weighted", "samples"}), + ], + "warn_for": ["array-like"], + "sample_weight": ["array-like", None], + } +) def sensitivity_specificity_support( y_true, y_pred, @@ -57,13 +74,13 @@ def sensitivity_specificity_support( Parameters ---------- - y_true : ndarray of shape (n_samples,) + y_true : array-like of shape (n_samples,) Ground truth (correct) target values. - y_pred : ndarray of shape (n_samples,) + y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. - labels : list, default=None + labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a @@ -72,7 +89,7 @@ def sensitivity_specificity_support( labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. - pos_label : str or int, default=1 + pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report @@ -105,7 +122,7 @@ def sensitivity_specificity_support( This determines which warnings will be made in the case that this function is being used to return only one of its metrics. - sample_weight : ndarray of shape (n_samples,), default=None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -274,6 +291,19 @@ def sensitivity_specificity_support( return sensitivity, specificity, true_sum +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "pos_label": [str, numbers.Integral, None], + "average": [ + None, + StrOptions({"binary", "micro", "macro", "weighted", "samples"}), + ], + "sample_weight": ["array-like", None], + } +) def sensitivity_score( y_true, y_pred, @@ -295,20 +325,20 @@ def sensitivity_score( Parameters ---------- - y_true : ndarray of shape (n_samples,) + y_true : array-like of shape (n_samples,) Ground truth (correct) target values. - y_pred : ndarray of shape (n_samples,) + y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. - labels : list, default=None + labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. 
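A minimal usage sketch, not part of the diff, of what the decorators above buy at call time: an out-of-range value is now rejected with an informative message before any metric computation runs. The toy labels are made up for the example, and the message wording follows the pattern asserted by the common test added later in this series.

from imblearn.metrics import sensitivity_specificity_support

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]

try:
    # "mean" is not one of StrOptions({"binary", "micro", "macro", "weighted", "samples"})
    sensitivity_specificity_support(y_true, y_pred, average="mean")
except ValueError as exc:
    # e.g. "The 'average' parameter of sensitivity_specificity_support must be
    # ... Got 'mean' instead."
    print(exc)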
Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. - pos_label : str or int, default=1 + pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report @@ -337,7 +367,7 @@ def sensitivity_score( meaningful for multilabel classification where this differs from :func:`accuracy_score`). - sample_weight : ndarray of shape (n_samples,), default=None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -374,6 +404,19 @@ def sensitivity_score( return s +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "pos_label": [str, numbers.Integral, None], + "average": [ + None, + StrOptions({"binary", "micro", "macro", "weighted", "samples"}), + ], + "sample_weight": ["array-like", None], + } +) def specificity_score( y_true, y_pred, @@ -395,20 +438,20 @@ def specificity_score( Parameters ---------- - y_true : ndarray of shape (n_samples,) + y_true : array-like of shape (n_samples,) Ground truth (correct) target values. - y_pred : ndarray of shape (n_samples,) + y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. - labels : list, default=None + labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. - pos_label : str or int, default=1 + pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report @@ -437,7 +480,7 @@ def specificity_score( meaningful for multilabel classification where this differs from :func:`accuracy_score`). - sample_weight : ndarray of shape (n_samples,), default=None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -474,6 +517,22 @@ def specificity_score( return s +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "pos_label": [str, numbers.Integral, None], + "average": [ + None, + StrOptions( + {"binary", "micro", "macro", "weighted", "samples", "multiclass"} + ), + ], + "sample_weight": ["array-like", None], + "correction": [Interval(numbers.Real, 0, None, closed="left")], + } +) def geometric_mean_score( y_true, y_pred, @@ -507,20 +566,20 @@ class is unrecognized by the classifier, G-mean resolves to zero. To Parameters ---------- - y_true : ndarray of shape (n_samples,) + y_true : array-like of shape (n_samples,) Ground truth (correct) target values. - y_pred : ndarray of shape (n_samples,) + y_pred : array-like of shape (n_samples,) Estimated targets as returned by a classifier. - labels : list, default=None + labels : array-like, default=None The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. 
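For context on the ``correction`` constraint above (Interval(numbers.Real, 0, None, closed="left"), i.e. a non-negative real), a small sketch, not part of the diff, of the behaviour the docstring describes: when a class is never predicted its recall is 0, the multiclass G-mean collapses to 0, and a small positive ``correction`` is substituted for the zero term. The labels below are made up.

from imblearn.metrics import geometric_mean_score

# class 2 is present in y_true but never predicted, so its recall is 0
y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 1, 1, 0]

print(geometric_mean_score(y_true, y_pred, average="multiclass"))
# 0.0 -- one zero recall wipes out the geometric mean
print(geometric_mean_score(y_true, y_pred, average="multiclass", correction=0.001))
# ~0.1 -- gmean(1, 1, 0.001), small but informative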
Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. - pos_label : str or int, default=1 + pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report @@ -539,6 +598,8 @@ class is unrecognized by the classifier, G-mean resolves to zero. To ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. + ``'multiclass'``: + No average is taken. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This @@ -549,7 +610,7 @@ class is unrecognized by the classifier, G-mean resolves to zero. To meaningful for multilabel classification where this differs from :func:`accuracy_score`). - sample_weight : ndarray of shape (n_samples,), default=None + sample_weight : array-like of shape (n_samples,), default=None Sample weights. correction : float, default=0.0 @@ -658,6 +719,7 @@ class is unrecognized by the classifier, G-mean resolves to zero. To return gmean +@validate_params({"alpha": [numbers.Real], "squared": ["boolean"]}) def make_index_balanced_accuracy(*, alpha=0.1, squared=True): """Balance any scoring function using the index balanced accuracy. @@ -763,6 +825,22 @@ def compute_score(*args, **kwargs): return decorate +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "labels": ["array-like", None], + "target_names": ["array-like", None], + "sample_weight": ["array-like", None], + "digits": [Interval(numbers.Integral, 0, None, closed="left")], + "alpha": [numbers.Real], + "output_dict": ["boolean"], + "zero_division": [ + StrOptions({"warn"}), + Interval(numbers.Integral, 0, 1, closed="both"), + ], + } +) def classification_report_imbalanced( y_true, y_pred, @@ -970,6 +1048,13 @@ class 2 1.00 0.67 1.00 0.80 0.82 0.64\ return report +@validate_params( + { + "y_true": ["array-like"], + "y_pred": ["array-like"], + "sample_weight": ["array-like", None], + } +) def macro_averaged_mean_absolute_error(y_true, y_pred, *, sample_weight=None): """Compute Macro-Averaged MAE for imbalanced ordinal classification. From d5ed9af79a9bc7b86548b84734f7c89dece9220e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 10:34:08 +0100 Subject: [PATCH 2/8] TST add common tests --- imblearn/metrics/_classification.py | 10 +++ imblearn/tests/test_public_functions.py | 98 +++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 imblearn/tests/test_public_functions.py diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py index 3504a29a7..b377db592 100644 --- a/imblearn/metrics/_classification.py +++ b/imblearn/metrics/_classification.py @@ -91,6 +91,9 @@ def sensitivity_specificity_support( pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. + If ``pos_label is None`` and in binary classification, this function + returns the average sensitivity and specificity if ``average`` + is one of ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. 
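A short sketch, not part of the diff, of how the ``make_index_balanced_accuracy`` factory decorated above is typically used: it wraps a scoring function such as ``geometric_mean_score`` and weights it by the index balanced accuracy. The labels are made up; passing ``squared='yes'`` instead of a boolean would now make the factory itself raise the same kind of validation error as the metrics.

from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy

y_true = [1, 0, 0, 1, 0, 1]
y_pred = [0, 0, 1, 1, 0, 1]

iba_gmean = make_index_balanced_accuracy(alpha=0.1, squared=True)(geometric_mean_score)
print(iba_gmean(y_true, y_pred))  # a float in [0, 1]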
@@ -340,6 +343,8 @@ def sensitivity_score( pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. + If ``pos_label is None`` and in binary classification, this function + returns the average sensitivity if ``average`` is one of ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. @@ -453,6 +458,8 @@ def specificity_score( pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. + If ``pos_label is None`` and in binary classification, this function + returns the average specificity if ``average`` is one of ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. @@ -581,6 +588,9 @@ class is unrecognized by the classifier, G-mean resolves to zero. To pos_label : str, int or None, default=1 The class to report if ``average='binary'`` and the data is binary. + If ``pos_label is None`` and in binary classification, this function + returns the average geometric mean if ``average`` is one of + ``'weighted'``. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. diff --git a/imblearn/tests/test_public_functions.py b/imblearn/tests/test_public_functions.py new file mode 100644 index 000000000..510ebe204 --- /dev/null +++ b/imblearn/tests/test_public_functions.py @@ -0,0 +1,98 @@ +"""This is a copy of sklearn/tests/test_public_functions.py. It can be +removed when we support scikit-learn >= 1.2. +""" +from importlib import import_module +from inspect import signature + +import pytest + +from imblearn.utils._param_validation import ( + generate_invalid_param_val, + generate_valid_param, + make_constraint, +) + +PARAM_VALIDATION_FUNCTION_LIST = [ + "imblearn.metrics.sensitivity_specificity_support", + "imblearn.metrics.sensitivity_score", + "imblearn.metrics.specificity_score", + "imblearn.metrics.geometric_mean_score", + "imblearn.metrics.make_index_balanced_accuracy", + "imblearn.metrics.classification_report_imbalanced", + "imblearn.metrics.macro_averaged_mean_absolute_error", +] + + +@pytest.mark.parametrize("func_module", PARAM_VALIDATION_FUNCTION_LIST) +def test_function_param_validation(func_module): + """Check that an informative error is raised when the value of a parameter does not + have an appropriate type or value. 
+ """ + module_name, func_name = func_module.rsplit(".", 1) + module = import_module(module_name) + func = getattr(module, func_name) + + func_sig = signature(func) + func_params = [ + p.name + for p in func_sig.parameters.values() + if p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD) + ] + parameter_constraints = getattr(func, "_skl_parameter_constraints") + + # generate valid values for the required parameters + required_params = [ + p.name for p in func_sig.parameters.values() if p.default is p.empty + ] + valid_required_params = {} + for param_name in required_params: + if parameter_constraints[param_name] == "no_validation": + valid_required_params[param_name] = 1 + else: + valid_required_params[param_name] = generate_valid_param( + make_constraint(parameter_constraints[param_name][0]) + ) + + # check that there is a constraint for each parameter + if func_params: + validation_params = parameter_constraints.keys() + unexpected_params = set(validation_params) - set(func_params) + missing_params = set(func_params) - set(validation_params) + err_msg = ( + "Mismatch between _parameter_constraints and the parameters of" + f" {func_name}.\nConsider the unexpected parameters {unexpected_params} and" + f" expected but missing parameters {missing_params}\n" + ) + assert set(validation_params) == set(func_params), err_msg + + # this object does not have a valid type for sure for all params + param_with_bad_type = type("BadType", (), {})() + + for param_name in func_params: + constraints = parameter_constraints[param_name] + + if constraints == "no_validation": + # This parameter is not validated + continue + + match = ( + rf"The '{param_name}' parameter of {func_name} must be .* Got .* instead." + ) + + # First, check that the error is raised if param doesn't match any valid type. + with pytest.raises(ValueError, match=match): + func(**{**valid_required_params, param_name: param_with_bad_type}) + + # Then, for constraints that are more than a type constraint, check that the + # error is raised if param does match a valid type but does not match any valid + # value for this type. + constraints = [make_constraint(constraint) for constraint in constraints] + + for constraint in constraints: + try: + bad_value = generate_invalid_param_val(constraint) + except NotImplementedError: + continue + + with pytest.raises(ValueError, match=match): + func(**{**valid_required_params, param_name: bad_value}) From 16b58e0ae4a4f1f0d7a684fdc06ab22b27ed29b4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 10:39:43 +0100 Subject: [PATCH 3/8] iter --- imblearn/datasets/_imbalance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index ffa822037..e0820a03a 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -26,7 +26,7 @@ def make_imbalance( X : {array-like, dataframe} of shape (n_samples, n_features) Matrix containing the data to be imbalanced. - y : ndarray of shape (n_samples,) + y : array-like of shape (n_samples,) Corresponding label for each sample in X. 
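For readers unfamiliar with the helpers imported by the common test above, a rough sketch, not part of the diff, of what they do; the concrete return values are implementation details and only indicative.

from numbers import Integral

from imblearn.utils._param_validation import (
    Interval,
    generate_invalid_param_val,
    generate_valid_param,
    make_constraint,
)

# Declarations such as "array-like" or Interval(...) become constraint objects...
constraint = make_constraint(Interval(Integral, 0, None, closed="left"))
# ...from which the test derives one value satisfying the constraint...
good = generate_valid_param(constraint)  # e.g. 1
# ...and, when possible, one value of the right type that violates it.
bad = generate_invalid_param_val(constraint)  # e.g. -1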
sampling_strategy : dict or callable, From 7ab8e14c3741af2bb690bc21cc21a3661973617d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 14:23:58 +0100 Subject: [PATCH 4/8] iter --- imblearn/datasets/_imbalance.py | 15 +++++++++++++-- imblearn/over_sampling/base.py | 3 ++- imblearn/tests/test_public_functions.py | 9 +++++++-- imblearn/under_sampling/base.py | 3 ++- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index e0820a03a..e935f1b9b 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -6,11 +6,22 @@ # License: MIT from collections import Counter +from collections.abc import Mapping from ..under_sampling import RandomUnderSampler from ..utils import check_sampling_strategy - - +from ..utils._param_validation import validate_params + + +@validate_params( + { + "X": ["array-like", "dataframe"], + "y": ["array-like"], + "sampling_strategy": [Mapping, callable, None], + "random_state": ["random_state"], + "verbose": ["boolean"], + } +) def make_imbalance( X, y, *, sampling_strategy=None, random_state=None, verbose=False, **kwargs ): diff --git a/imblearn/over_sampling/base.py b/imblearn/over_sampling/base.py index 4bc08e91a..d4e4a4541 100644 --- a/imblearn/over_sampling/base.py +++ b/imblearn/over_sampling/base.py @@ -6,6 +6,7 @@ # License: MIT import numbers +from collections.abc import Mapping from ..base import BaseSampler from ..utils._param_validation import Interval, StrOptions @@ -61,7 +62,7 @@ class BaseOverSampler(BaseSampler): "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), - dict, + Mapping, callable, ], "random_state": ["random_state"], diff --git a/imblearn/tests/test_public_functions.py b/imblearn/tests/test_public_functions.py index 510ebe204..1b94b16df 100644 --- a/imblearn/tests/test_public_functions.py +++ b/imblearn/tests/test_public_functions.py @@ -13,6 +13,7 @@ ) PARAM_VALIDATION_FUNCTION_LIST = [ + "imblearn.datasets.make_imbalance", "imblearn.metrics.sensitivity_specificity_support", "imblearn.metrics.sensitivity_score", "imblearn.metrics.specificity_score", @@ -40,9 +41,13 @@ def test_function_param_validation(func_module): ] parameter_constraints = getattr(func, "_skl_parameter_constraints") - # generate valid values for the required parameters + # Generate valid values for the required parameters + # The parameters `*args` and `**kwargs` are ignored since we cannot generate + # constraints. 
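A small usage sketch, not part of the diff, for the newly decorated ``make_imbalance``; iris is used here only as a convenient balanced multiclass target to degrade.

from collections import Counter

from sklearn.datasets import load_iris

from imblearn.datasets import make_imbalance

X, y = load_iris(return_X_y=True)
X_res, y_res = make_imbalance(
    X, y, sampling_strategy={0: 20, 1: 30, 2: 40}, random_state=42
)
print(Counter(y_res))  # e.g. Counter({2: 40, 1: 30, 0: 20})
# per the constraints above, sampling_strategy may be any Mapping, a callable, or None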
required_params = [ - p.name for p in func_sig.parameters.values() if p.default is p.empty + p.name + for p in func_sig.parameters.values() + if p.default is p.empty and p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD) ] valid_required_params = {} for param_name in required_params: diff --git a/imblearn/under_sampling/base.py b/imblearn/under_sampling/base.py index e36d8c31f..92da45723 100644 --- a/imblearn/under_sampling/base.py +++ b/imblearn/under_sampling/base.py @@ -5,6 +5,7 @@ # License: MIT import numbers +from collections.abc import Mapping from ..base import BaseSampler from ..utils._param_validation import Interval, StrOptions @@ -61,7 +62,7 @@ class BaseUnderSampler(BaseSampler): "sampling_strategy": [ Interval(numbers.Real, 0, 1, closed="right"), StrOptions({"auto", "majority", "not minority", "not majority", "all"}), - dict, + Mapping, callable, ], } From 3f0741064504567840fc89c239ac176215dce636 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 15:06:50 +0100 Subject: [PATCH 5/8] iter --- imblearn/datasets/_zenodo.py | 12 ++++++++++++ imblearn/datasets/tests/test_imbalance.py | 1 - imblearn/pipeline.py | 23 ++++++++++++++++++++--- imblearn/tests/test_common.py | 1 - imblearn/tests/test_pipeline.py | 8 ++++++++ imblearn/tests/test_public_functions.py | 10 ++++++---- 6 files changed, 46 insertions(+), 9 deletions(-) diff --git a/imblearn/datasets/_zenodo.py b/imblearn/datasets/_zenodo.py index 72bafe7a6..3a2c679a0 100644 --- a/imblearn/datasets/_zenodo.py +++ b/imblearn/datasets/_zenodo.py @@ -54,6 +54,8 @@ from sklearn.datasets import get_data_home from sklearn.utils import Bunch, check_random_state +from ..utils._param_validation import validate_params + URL = "https://zenodo.org/record/61452/files/benchmark-imbalanced-learn.tar.gz" PRE_FILENAME = "x" POST_FILENAME = "data.npz" @@ -95,6 +97,16 @@ MAP_ID_NAME[v + 1] = k +@validate_params( + { + "data_home": [None, str], + "filter_data": [None, tuple], + "download_if_missing": ["boolean"], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "verbose": ["boolean"], + } +) def fetch_datasets( *, data_home=None, diff --git a/imblearn/datasets/tests/test_imbalance.py b/imblearn/datasets/tests/test_imbalance.py index 1b98d3aae..2d8e278fa 100644 --- a/imblearn/datasets/tests/test_imbalance.py +++ b/imblearn/datasets/tests/test_imbalance.py @@ -22,7 +22,6 @@ def iris(): [ ({0: -100, 1: 50, 2: 50}, "in a class cannot be negative"), ({0: 10, 1: 70}, "should be less or equal to the original"), - ("random-string", "has to be a dictionary or a function"), ], ) def test_make_imbalance_error(iris, sampling_strategy, err_msg): diff --git a/imblearn/pipeline.py b/imblearn/pipeline.py index f6b5d5d24..738f89b49 100644 --- a/imblearn/pipeline.py +++ b/imblearn/pipeline.py @@ -12,16 +12,19 @@ # Christos Aridas # Guillaume Lemaitre # License: BSD +import joblib from sklearn import pipeline from sklearn.base import clone from sklearn.utils import _print_elapsed_time from sklearn.utils.metaestimators import available_if -from sklearn.utils.validation import check_memory + +from .base import _ParamsValidationMixin +from .utils._param_validation import HasMethods, validate_params __all__ = ["Pipeline", "make_pipeline"] -class Pipeline(pipeline.Pipeline): +class Pipeline(pipeline.Pipeline, _ParamsValidationMixin): """Pipeline of transforms and resamples with a final estimator. Sequentially apply a list of transforms, sampling, and a final estimator. 
@@ -128,6 +131,12 @@ class Pipeline(pipeline.Pipeline): """ + _parameter_constraints: dict = { + "steps": "no_validation", # validated in `_validate_steps` + "memory": [None, str, HasMethods(["cache"])], + "verbose": ["boolean"], + } + # BaseEstimator interface def _validate_steps(self): @@ -201,7 +210,10 @@ def _fit(self, X, y=None, **fit_params_steps): self.steps = list(self.steps) self._validate_steps() # Setup the memory - memory = check_memory(self.memory) + if self.memory is None or isinstance(self.memory, str): + memory = joblib.Memory(location=self.memory, verbose=0) + else: + memory = self.memory fit_transform_one_cached = memory.cache(pipeline._fit_transform_one) fit_resample_one_cached = memory.cache(_fit_resample_one) @@ -276,6 +288,7 @@ def fit(self, X, y=None, **fit_params): self : Pipeline This estimator. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt, yt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -311,6 +324,7 @@ def fit_transform(self, X, y=None, **fit_params): Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt, yt = self._fit(X, y, **fit_params_steps) @@ -354,6 +368,7 @@ def fit_resample(self, X, y=None, **fit_params): yt : array-like of shape (n_samples, n_transformed_features) Transformed target. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt, yt = self._fit(X, y, **fit_params_steps) last_step = self._final_estimator @@ -392,6 +407,7 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray of shape (n_samples,) The predicted target. """ + self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt, yt = self._fit(X, y, **fit_params_steps) @@ -408,6 +424,7 @@ def _fit_resample_one(sampler, X, y, message_clsname="", message=None, **fit_par return X_res, y_res, sampler +@validate_params({"memory": [None, str, HasMethods(["cache"])], "verbose": ["boolean"]}) def make_pipeline(*steps, memory=None, verbose=False): """Construct a Pipeline from the given estimators. 
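A sketch, not part of the diff, of the behaviour the ``Pipeline`` changes above aim for: ``memory`` may be None, a path-like string, or any object exposing ``cache`` (hence ``HasMethods(["cache"])``), and the constraints are enforced by ``self._validate_params()`` at fit time instead of by ``check_memory``. The data are made up via ``make_classification``.

from tempfile import mkdtemp

import joblib
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.9, 0.1], random_state=0)

pipe = Pipeline(
    steps=[
        ("sampler", RandomUnderSampler(random_state=0)),
        ("classifier", LogisticRegression()),
    ],
    # a str path or None are also accepted; an int such as memory=1 is now
    # rejected by the parameter validation, which is presumably why the
    # dedicated test_pipeline_wrong_memory test is dropped later in the series
    memory=joblib.Memory(location=mkdtemp(), verbose=0),
)
pipe.fit(X, y)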
diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 9ec5764d3..036d84476 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -70,6 +70,5 @@ def test_estimators_imblearn(estimator, check, request): ) def test_check_param_validation(estimator): name = estimator.__class__.__name__ - print(name) _set_checking_parameters(estimator) check_param_validation(name, estimator) diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index d2f0b8f5c..8b512659b 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -35,6 +35,7 @@ from imblearn.pipeline import Pipeline, make_pipeline from imblearn.under_sampling import EditedNearestNeighbours as ENN from imblearn.under_sampling import RandomUnderSampler +from imblearn.utils.estimator_checks import check_param_validation JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", @@ -1341,3 +1342,10 @@ def test_pipeline_score_samples_pca_lof_multiclass(): # Check the values lof.fit(pca.fit_transform(X)) assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) + + +def test_pipeline_param_validation(): + model = Pipeline( + [("sampler", RandomUnderSampler()), ("classifier", LogisticRegression())] + ) + check_param_validation("Pipeline", model) diff --git a/imblearn/tests/test_public_functions.py b/imblearn/tests/test_public_functions.py index 1b94b16df..d84732007 100644 --- a/imblearn/tests/test_public_functions.py +++ b/imblearn/tests/test_public_functions.py @@ -13,14 +13,16 @@ ) PARAM_VALIDATION_FUNCTION_LIST = [ + "imblearn.datasets.fetch_datasets", "imblearn.datasets.make_imbalance", + "imblearn.metrics.classification_report_imbalanced", + "imblearn.metrics.geometric_mean_score", + "imblearn.metrics.macro_averaged_mean_absolute_error", + "imblearn.metrics.make_index_balanced_accuracy", "imblearn.metrics.sensitivity_specificity_support", "imblearn.metrics.sensitivity_score", "imblearn.metrics.specificity_score", - "imblearn.metrics.geometric_mean_score", - "imblearn.metrics.make_index_balanced_accuracy", - "imblearn.metrics.classification_report_imbalanced", - "imblearn.metrics.macro_averaged_mean_absolute_error", + "imblearn.pipeline.make_pipeline", ] From 83b30788fd745fcef4fa03f8b6c6c3532475f2db Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 15:07:57 +0100 Subject: [PATCH 6/8] TST remove redundant test --- imblearn/tests/test_pipeline.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/imblearn/tests/test_pipeline.py b/imblearn/tests/test_pipeline.py index 8b512659b..8355f0228 100644 --- a/imblearn/tests/test_pipeline.py +++ b/imblearn/tests/test_pipeline.py @@ -643,22 +643,6 @@ def test_classes_property(): assert_array_equal(clf.classes_, np.unique(y)) -def test_pipeline_wrong_memory(): - # Test that an error is raised when memory is not a string or a Memory - # instance - iris = load_iris() - X = iris.data - y = iris.target - # Define memory as an integer - memory = 1 - cached_pipe = Pipeline( - [("transf", DummyTransf()), ("svc", SVC(gamma="scale"))], memory=memory - ) - error_regex = "string or have the same interface as" - with raises(ValueError, match=error_regex): - cached_pipe.fit(X, y) - - def test_pipeline_memory_transformer(): iris = load_iris() X = iris.data From 517f5c63f52414d76f6d9d089aa0a82eb632ca3b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 16:03:36 +0100 Subject: [PATCH 7/8] revert dataframe --- imblearn/datasets/_imbalance.py | 10 ++---- 
imblearn/utils/_param_validation.py | 21 +---------- imblearn/utils/tests/test_param_validation.py | 36 ------------------- 3 files changed, 3 insertions(+), 64 deletions(-) diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py index e935f1b9b..8c1c15aec 100644 --- a/imblearn/datasets/_imbalance.py +++ b/imblearn/datasets/_imbalance.py @@ -15,7 +15,7 @@ @validate_params( { - "X": ["array-like", "dataframe"], + "X": ["array-like"], "y": ["array-like"], "sampling_strategy": [Mapping, callable, None], "random_state": ["random_state"], @@ -97,16 +97,10 @@ def make_imbalance( """ target_stats = Counter(y) # restrict ratio to be a dict or a callable - if isinstance(sampling_strategy, dict) or callable(sampling_strategy): + if isinstance(sampling_strategy, Mapping) or callable(sampling_strategy): sampling_strategy_ = check_sampling_strategy( sampling_strategy, y, "under-sampling", **kwargs ) - else: - raise ValueError( - f"'sampling_strategy' has to be a dictionary or a " - f"function returning a dictionary. Got {type(sampling_strategy)} " - f"instead." - ) if verbose: print(f"The original target distribution in the dataset is: {target_stats}") diff --git a/imblearn/utils/_param_validation.py b/imblearn/utils/_param_validation.py index a45292c63..005595276 100644 --- a/imblearn/utils/_param_validation.py +++ b/imblearn/utils/_param_validation.py @@ -23,7 +23,7 @@ # if sklearn_version < parse_version("1.2"): if True: # TODO: remove `if True` when we have clear support for: - # - dataframe + # - ignoring `*args` and `**kwargs` in the signature def validate_parameter_constraints(parameter_constraints, params, caller_name): """Validate types and values of given parameters. @@ -38,7 +38,6 @@ def validate_parameter_constraints(parameter_constraints, params, caller_name): Constraints can be: - an Interval object, representing a continuous or discrete range of numbers - the string "array-like" - - the string "dataframe" - the string "sparse matrix" - the string "random_state" - callable @@ -119,8 +118,6 @@ def make_constraint(constraint): return _ArrayLikes() if isinstance(constraint, str) and constraint == "sparse matrix": return _SparseMatrices() - if isinstance(constraint, str) and constraint == "dataframe": - return _DataFrames() if isinstance(constraint, str) and constraint == "random_state": return _RandomStates() if constraint is callable: @@ -472,17 +469,6 @@ def is_satisfied_by(self, val): def __str__(self): return "a sparse matrix" - class _DataFrames(_Constraint): - """Constraint representing a DataFrame""" - - def is_satisfied_by(self, val): - # Let's first try the dataframe protocol and then duck-typing for the older - # pandas versions. 
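One point worth noting about the revert above (a sketch, not part of the diff, assuming pandas is installed): a DataFrame already satisfies the generic "array-like" constraint, so dropping the dedicated "dataframe" constraint does not, in practice, narrow what ``make_imbalance`` accepts.

import pandas as pd

from imblearn.utils._param_validation import make_constraint

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
print(make_constraint("array-like").is_satisfied_by(df))  # True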
- return hasattr(val, "__dataframe__") or hasattr(val, "iloc") - - def __str__(self): - return "a DataFrame" - class _Callables(_Constraint): """Constraint representing callables.""" @@ -862,11 +848,6 @@ def generate_valid_param(constraint): if isinstance(constraint, _SparseMatrices): return csr_matrix([[0, 1], [1, 0]]) - if isinstance(constraint, _DataFrames): - import pandas as pd - - return pd.DataFrame({"a": [1, 2, 3]}) - if isinstance(constraint, _RandomStates): return np.random.RandomState(42) diff --git a/imblearn/utils/tests/test_param_validation.py b/imblearn/utils/tests/test_param_validation.py index ec3a37e13..dae58a790 100644 --- a/imblearn/utils/tests/test_param_validation.py +++ b/imblearn/utils/tests/test_param_validation.py @@ -21,7 +21,6 @@ _Booleans, _Callables, _CVObjects, - _DataFrames, _InstancesOf, _IterablesNotString, _MissingValues, @@ -37,15 +36,6 @@ ) -def has_pandas(): - try: - import pandas as pd - - return True, pd.DataFrame({"a": [1, 2, 3]}) - except ImportError: - return False, None - - # Some helpers for the tests @validate_params({"a": [Real], "b": [Real], "c": [Real], "d": [Real]}) def _func(a, b=0, *args, c, d=0, **kwargs): @@ -327,12 +317,6 @@ def test_generate_invalid_param_val_2_intervals(integer_interval, real_interval) "constraints", [ [_ArrayLikes()], - pytest.param( - [_DataFrames()], - marks=pytest.mark.skipif( - not has_pandas()[0], reason="Pandas not installed" - ), - ), [_InstancesOf(list)], [_Callables()], [_NoneConstraint()], @@ -358,12 +342,6 @@ def test_generate_invalid_param_val_all_valid(constraints): "constraint", [ _ArrayLikes(), - pytest.param( - _DataFrames(), - marks=pytest.mark.skipif( - not has_pandas()[0], reason="Pandas not installed" - ), - ), _Callables(), _InstancesOf(list), _NoneConstraint(), @@ -403,13 +381,6 @@ def test_generate_valid_param(constraint): (None, None), ("array-like", [[1, 2], [3, 4]]), ("array-like", np.array([[1, 2], [3, 4]])), - pytest.param( - "dataframe", - has_pandas()[1], - marks=pytest.mark.skipif( - not has_pandas()[0], reason="Pandas not installed" - ), - ), ("sparse matrix", csr_matrix([[1, 2], [3, 4]])), ("random_state", 0), ("random_state", np.random.RandomState(0)), @@ -443,13 +414,6 @@ def test_is_satisfied_by(constraint_declaration, value): (Options(Real, {0.42, 1.23}), Options), ("array-like", _ArrayLikes), ("sparse matrix", _SparseMatrices), - pytest.param( - "dataframe", - _DataFrames, - marks=pytest.mark.skipif( - not has_pandas()[0], reason="Pandas not installed" - ), - ), ("random_state", _RandomStates), (None, _NoneConstraint), (callable, _Callables), From 2506fc8f0c6271dfdc2a6e40c5513b52ee205018 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 16:41:05 +0100 Subject: [PATCH 8/8] iter --- imblearn/utils/_param_validation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/imblearn/utils/_param_validation.py b/imblearn/utils/_param_validation.py index 005595276..ae3855945 100644 --- a/imblearn/utils/_param_validation.py +++ b/imblearn/utils/_param_validation.py @@ -20,8 +20,7 @@ sklearn_version = parse_version(sklearn.__version__) -# if sklearn_version < parse_version("1.2"): -if True: +if sklearn_version < parse_version("1.2"): # TODO: remove `if True` when we have clear support for: # - ignoring `*args` and `**kwargs` in the signature
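As a closing illustration of the net effect of the series (a sketch, not part of any patch; the message wording is approximate but follows the pattern the common test asserts): every function in PARAM_VALIDATION_FUNCTION_LIST now fails fast, with a consistent message, when handed an out-of-range argument. The labels are made up.

from imblearn.metrics import classification_report_imbalanced

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]

try:
    # digits is constrained to Interval(numbers.Integral, 0, None, closed="left")
    classification_report_imbalanced(y_true, y_pred, digits=-1)
except ValueError as exc:
    # e.g. "The 'digits' parameter of classification_report_imbalanced must be
    # an int in the range [0, inf). Got -1 instead."
    print(exc)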