From f3d167fc35f5733ab203a7457b9588a35d4271b5 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 18:22:05 +0100 Subject: [PATCH 1/6] MAINT add support for feature_names_in --- imblearn/base.py | 10 +- imblearn/combine/_smote_enn.py | 6 + imblearn/combine/_smote_tomek.py | 6 + imblearn/metrics/pairwise.py | 11 + imblearn/over_sampling/_adasyn.py | 6 + .../over_sampling/_random_over_sampler.py | 6 + imblearn/over_sampling/_smote/base.py | 18 ++ imblearn/over_sampling/_smote/cluster.py | 6 + imblearn/over_sampling/_smote/filter.py | 12 + imblearn/tests/test_common.py | 16 ++ .../_cluster_centroids.py | 6 + .../_condensed_nearest_neighbour.py | 6 + .../_edited_nearest_neighbours.py | 18 ++ .../_instance_hardness_threshold.py | 6 + .../_prototype_selection/_nearmiss.py | 6 + .../_neighbourhood_cleaning_rule.py | 6 + .../_one_sided_selection.py | 6 + .../_random_under_sampler.py | 6 + .../_prototype_selection/_tomek_links.py | 6 + imblearn/utils/estimator_checks.py | 249 +++++++++++++++++- 20 files changed, 408 insertions(+), 4 deletions(-) diff --git a/imblearn/base.py b/imblearn/base.py index 4241d0db3..d02aea9a5 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -7,7 +7,7 @@ from abc import ABCMeta, abstractmethod import numpy as np -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, OneToOneFeatureMixin from sklearn.preprocessing import label_binarize from sklearn.utils.multiclass import check_classification_targets @@ -133,7 +133,7 @@ class attribute, which is a dictionary `param_name: list of constraints`. See ) -class BaseSampler(SamplerMixin, _ParamsValidationMixin): +class BaseSampler(SamplerMixin, OneToOneFeatureMixin, _ParamsValidationMixin): """Base class for sampling algorithms. Warning: This class should not be used directly. Use the derive classes @@ -260,6 +260,12 @@ class FunctionSampler(BaseSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- sklearn.preprocessing.FunctionTransfomer : Stateless transformer. diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py index 241fc0f70..1b0ffe0b8 100644 --- a/imblearn/combine/_smote_enn.py +++ b/imblearn/combine/_smote_enn.py @@ -67,6 +67,12 @@ class SMOTEENN(BaseSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTETomek : Over-sample using SMOTE followed by under-sampling removing diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py index 9a4bc13e6..94d7c4d01 100644 --- a/imblearn/combine/_smote_tomek.py +++ b/imblearn/combine/_smote_tomek.py @@ -66,6 +66,12 @@ class SMOTETomek(BaseSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTEENN : Over-sample using SMOTE followed by under-sampling using Edited diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py index ceec92802..4aa7977ef 100644 --- a/imblearn/metrics/pairwise.py +++ b/imblearn/metrics/pairwise.py @@ -71,6 +71,17 @@ class ValueDifferenceMetric(BaseEstimator, _ParamsValidationMixin): List of length `n_features` containing the conditional probabilities for each category given a class. + n_features_in_ : int + Number of features in the input dataset. + + .. versionadded:: 0.10 + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- sklearn.neighbors.DistanceMetric : Interface for fast metric computation. diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index 6f4b81fd5..54e88b79f 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -73,6 +73,12 @@ class ADASYN(BaseOverSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/over_sampling/_random_over_sampler.py b/imblearn/over_sampling/_random_over_sampler.py index 1f12619dd..7175855ea 100644 --- a/imblearn/over_sampling/_random_over_sampler.py +++ b/imblearn/over_sampling/_random_over_sampler.py @@ -76,6 +76,12 @@ class RandomOverSampler(BaseOverSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- BorderlineSMOTE : Over-sample using the borderline-SMOTE variant. diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 15b4664c8..266349ee5 100644 --- a/imblearn/over_sampling/_smote/base.py +++ b/imblearn/over_sampling/_smote/base.py @@ -264,6 +264,12 @@ class SMOTE(BaseSMOTE): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTENC : Over-sample using SMOTE for continuous and categorical features. @@ -442,6 +448,12 @@ class SMOTENC(SMOTE): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTE : Over-sample using SMOTE. @@ -759,6 +771,12 @@ class SMOTEN(SMOTE): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py index ccfe07a7e..4ca87e9a0 100644 --- a/imblearn/over_sampling/_smote/cluster.py +++ b/imblearn/over_sampling/_smote/cluster.py @@ -93,6 +93,12 @@ class KMeansSMOTE(BaseSMOTE): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index cf014b9ea..4966b211b 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -100,6 +100,12 @@ class BorderlineSMOTE(BaseSMOTE): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTE : Over-sample using SMOTE. @@ -352,6 +358,12 @@ class SVMSMOTE(BaseSMOTE): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- SMOTE : Over-sample using SMOTE. diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index 0c2e2f301..d78cafd83 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -3,6 +3,7 @@ # Christos Aridas # License: MIT +import warnings from collections import OrderedDict import numpy as np @@ -19,6 +20,7 @@ from imblearn.under_sampling import NearMiss, RandomUnderSampler from imblearn.utils.estimator_checks import ( _set_checking_parameters, + check_dataframe_column_names_consistency, check_param_validation, parametrize_with_checks, ) @@ -92,3 +94,17 @@ def test_strategy_as_ordered_dict(Sampler): X_res, y_res = sampler.fit_resample(X, y) assert X_res.shape[0] == sum(strategy.values()) assert y_res.shape[0] == sum(strategy.values()) + + +@pytest.mark.parametrize( + "estimator", _tested_estimators(), ids=_get_check_estimator_ids +) +def test_pandas_column_name_consistency(estimator): + _set_checking_parameters(estimator) + with ignore_warnings(category=(FutureWarning)): + with warnings.catch_warnings(record=True) as record: + check_dataframe_column_names_consistency( + estimator.__class__.__name__, estimator + ) + for warning in record: + assert "was fitted without feature names" not in str(warning.message) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 5be949ed5..5e2ca3a82 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -78,6 +78,12 @@ class ClusterCentroids(BaseUnderSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- EditedNearestNeighbours : Under-sampling by editing samples. diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index b0d9109cf..2f03ca8a8 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -69,6 +69,12 @@ class CondensedNearestNeighbour(BaseCleaningSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- EditedNearestNeighbours : Undersample by editing samples. diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py index 84694e746..64419ccdf 100644 --- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py @@ -76,6 +76,12 @@ class EditedNearestNeighbours(BaseCleaningSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- CondensedNearestNeighbour : Undersample by condensing samples. @@ -251,6 +257,12 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- CondensedNearestNeighbour : Undersample by condensing samples. @@ -454,6 +466,12 @@ class without early stopping. .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- CondensedNearestNeighbour: Under-sampling by condensing samples. diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index b1a6e1150..2d8bfce6c 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -67,6 +67,12 @@ class InstanceHardnessThreshold(BaseUnderSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- NearMiss : Undersample based on near-miss search. diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py index 83f94d890..70f647fa5 100644 --- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py @@ -72,6 +72,12 @@ class NearMiss(BaseUnderSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- RandomUnderSampler : Random undersample the dataset. diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py index 00be9ca71..f9c08ea56 100644 --- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py @@ -83,6 +83,12 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 0a1866075..42e9a6edd 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -68,6 +68,12 @@ class OneSidedSelection(BaseCleaningSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- EditedNearestNeighbours : Undersample by editing noisy samples. diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py index a7c735fa6..ed47fe586 100644 --- a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py +++ b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py @@ -50,6 +50,12 @@ class RandomUnderSampler(BaseUnderSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- NearMiss : Undersample using near-miss samples. diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py index 31d62675b..b0f954959 100644 --- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py +++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py @@ -48,6 +48,12 @@ class TomekLinks(BaseCleaningSampler): .. versionadded:: 0.9 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during `fit`. Defined only when `X` has feature + names that are all strings. + + .. versionadded:: 0.10 + See Also -------- EditedNearestNeighbours : Undersample by samples edition. diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 4c4c72741..254b4e236 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -4,6 +4,7 @@ # Authors: Guillaume Lemaitre # License: MIT +import re import sys import traceback import warnings @@ -14,27 +15,39 @@ import pytest import sklearn from scipy import sparse -from sklearn.base import clone +from sklearn.base import clone, is_classifier, is_regressor from sklearn.cluster import KMeans from sklearn.datasets import ( # noqa load_iris, + make_blobs, make_classification, make_multilabel_classification, ) from sklearn.exceptions import SkipTestWarning -from sklearn.preprocessing import label_binarize +from sklearn.preprocessing import StandardScaler, label_binarize from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import ( + SkipTest, assert_allclose, assert_array_equal, assert_raises_regex, raises, + set_random_state, ) from sklearn.utils.estimator_checks import ( _enforce_estimator_tags_y, _get_check_estimator_ids, _maybe_mark_xfail, ) + +try: + from sklearn.utils.estimator_checks import _enforce_estimator_tags_x +except ImportError: + # scikit-learn >= 1.2 + from sklearn.utils.estimator_checks import ( + _enforce_estimator_tags_X as _enforce_estimator_tags_x, + ) + from sklearn.utils.fixes import parse_version from sklearn.utils.multiclass import type_of_target @@ -87,6 +100,8 @@ def _yield_sampler_checks(sampler): # stipulated yield check_samplers_sample_indices yield check_samplers_2d_target + yield check_sampler_get_feature_names_out + yield check_sampler_get_feature_names_out_pandas def _yield_classifier_checks(classifier): @@ -567,3 +582,233 @@ def check_param_validation(name, estimator_orig): getattr(estimator, method)(y) # pragma: no cover else: getattr(estimator, method)(X, y) + + +def check_dataframe_column_names_consistency(name, estimator_orig): + try: + import pandas as pd + except ImportError: + raise SkipTest( + "pandas is not installed: not checking column name consistency for pandas" + ) + + tags = _safe_tags(estimator_orig) + is_supported_X_types = ( + "2darray" in tags["X_types"] or "categorical" in tags["X_types"] + ) + + if not is_supported_X_types or tags["no_validation"]: + return + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + + X_orig = rng.normal(size=(150, 8)) + + X_orig = _enforce_estimator_tags_x(estimator, X_orig) + n_samples, n_features = X_orig.shape + + names = np.array([f"col_{i}" for i in range(n_features)]) + X = pd.DataFrame(X_orig, columns=names) + + if is_regressor(estimator): + y = rng.normal(size=n_samples) + else: + y = rng.randint(low=0, high=2, size=n_samples) + y = _enforce_estimator_tags_y(estimator, y) + + # Check that calling `fit` does not raise any warnings about feature names. + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", + message="X does not have valid feature names", + category=UserWarning, + module="imblearn", + ) + estimator.fit(X, y) + + if not hasattr(estimator, "feature_names_in_"): + raise ValueError( + "Estimator does not have a feature_names_in_ " + "attribute after fitting with a dataframe" + ) + assert isinstance(estimator.feature_names_in_, np.ndarray) + assert estimator.feature_names_in_.dtype == object + assert_array_equal(estimator.feature_names_in_, names) + + # Only check imblearn estimators for feature_names_in_ in docstring + module_name = estimator_orig.__module__ + if ( + module_name.startswith("imblearn.") + and not ("test_" in module_name or module_name.endswith("_testing")) + and ("feature_names_in_" not in (estimator_orig.__doc__)) + ): + raise ValueError( + f"Estimator {name} does not document its feature_names_in_ attribute" + ) + + check_methods = [] + for method in ( + "predict", + "transform", + "decision_function", + "predict_proba", + "score", + "score_samples", + "predict_log_proba", + ): + if not hasattr(estimator, method): + continue + + callable_method = getattr(estimator, method) + if method == "score": + callable_method = partial(callable_method, y=y) + check_methods.append((method, callable_method)) + + for _, method in check_methods: + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", + message="X does not have valid feature names", + category=UserWarning, + module="sklearn", + ) + method(X) # works without UserWarning for valid features + + invalid_names = [ + (names[::-1], "Feature names must be in the same order as they were in fit."), + ( + [f"another_prefix_{i}" for i in range(n_features)], + "Feature names unseen at fit time:\n- another_prefix_0\n-" + " another_prefix_1\n", + ), + ( + names[:3], + f"Feature names seen at fit time, yet now missing:\n- {min(names[3:])}\n", + ), + ] + params = { + key: value + for key, value in estimator.get_params().items() + if "early_stopping" in key + } + early_stopping_enabled = any(value is True for value in params.values()) + + for invalid_name, additional_message in invalid_names: + X_bad = pd.DataFrame(X, columns=invalid_name) + + expected_msg = re.escape( + "The feature names should match those that were passed during fit.\n" + f"{additional_message}" + ) + for name, method in check_methods: + with raises( + ValueError, match=expected_msg, err_msg=f"{name} did not raise" + ): + method(X_bad) + + # partial_fit checks on second call + # Do not call partial fit if early_stopping is on + if not hasattr(estimator, "partial_fit") or early_stopping_enabled: + continue + + estimator = clone(estimator_orig) + if is_classifier(estimator): + classes = np.unique(y) + estimator.partial_fit(X, y, classes=classes) + else: + estimator.partial_fit(X, y) + + with raises(ValueError, match=expected_msg): + estimator.partial_fit(X_bad, y) + + +def check_sampler_get_feature_names_out(name, sampler_orig): + tags = sampler_orig._get_tags() + if "2darray" not in tags["X_types"] or tags["no_validation"]: + return + + X, y = make_blobs( + n_samples=30, + centers=[[0, 0, 0], [1, 1, 1]], + random_state=0, + n_features=2, + cluster_std=0.1, + ) + X = StandardScaler().fit_transform(X) + + sampler = clone(sampler_orig) + X = _enforce_estimator_tags_x(sampler, X) + + n_features = X.shape[1] + set_random_state(sampler) + + y_ = y + X_res, y_res = sampler.fit_resample(X, y=y_) + input_features = [f"feature{i}" for i in range(n_features)] + + # input_features names is not the same length as n_features_in_ + with raises(ValueError, match="input_features should have length equal"): + sampler.get_feature_names_out(input_features[::2]) + + feature_names_out = sampler.get_feature_names_out(input_features) + assert feature_names_out is not None + assert isinstance(feature_names_out, np.ndarray) + assert feature_names_out.dtype == object + assert all(isinstance(name, str) for name in feature_names_out) + + n_features_out = X_res.shape[1] + + assert ( + len(feature_names_out) == n_features_out + ), f"Expected {n_features_out} feature names, got {len(feature_names_out)}" + + +def check_sampler_get_feature_names_out_pandas(name, sampler_orig): + try: + import pandas as pd + except ImportError: + raise SkipTest( + "pandas is not installed: not checking column name consistency for pandas" + ) + + tags = sampler_orig._get_tags() + if "2darray" not in tags["X_types"] or tags["no_validation"]: + return + + X, y = make_blobs( + n_samples=30, + centers=[[0, 0, 0], [1, 1, 1]], + random_state=0, + n_features=2, + cluster_std=0.1, + ) + X = StandardScaler().fit_transform(X) + + sampler = clone(sampler_orig) + X = _enforce_estimator_tags_x(sampler, X) + + n_features = X.shape[1] + set_random_state(sampler) + + y_ = y + feature_names_in = [f"col{i}" for i in range(n_features)] + df = pd.DataFrame(X, columns=feature_names_in) + X_res, y_res = sampler.fit_resample(df, y=y_) + + # error is raised when `input_features` do not match feature_names_in + invalid_feature_names = [f"bad{i}" for i in range(n_features)] + with raises(ValueError, match="input_features is not equal to feature_names_in_"): + sampler.get_feature_names_out(invalid_feature_names) + + feature_names_out_default = sampler.get_feature_names_out() + feature_names_in_explicit_names = sampler.get_feature_names_out(feature_names_in) + assert_array_equal(feature_names_out_default, feature_names_in_explicit_names) + + n_features_out = X_res.shape[1] + + assert ( + len(feature_names_out_default) == n_features_out + ), f"Expected {n_features_out} feature names, got {len(feature_names_out_default)}" From e7687b7198089f29b08320112cb4dbfe07e1621e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 18:23:04 +0100 Subject: [PATCH 2/6] add changelog --- doc/whats_new/v0.10.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst index 00809a1d4..2a1fd714e 100644 --- a/doc/whats_new/v0.10.rst +++ b/doc/whats_new/v0.10.rst @@ -22,6 +22,10 @@ Compatibility - Add support for automatic parameters validation as in scikit-learn >= 1.2. :pr:`955` by :user:`Guillaume Lemaitre `. +- Add support for `feature_names_in_` as well as `get_feature_names_out` for + all samplers. + :pr:`xxx` by :user:`Guillaume Lemaitre `. + Deprecation ........... From 6f07a1244af950eccdf5407bb852409e83d16def Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 18:24:15 +0100 Subject: [PATCH 3/6] update PR number --- doc/whats_new/v0.10.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst index 2a1fd714e..7ce0604b0 100644 --- a/doc/whats_new/v0.10.rst +++ b/doc/whats_new/v0.10.rst @@ -24,7 +24,7 @@ Compatibility - Add support for `feature_names_in_` as well as `get_feature_names_out` for all samplers. - :pr:`xxx` by :user:`Guillaume Lemaitre `. + :pr:`959` by :user:`Guillaume Lemaitre `. Deprecation ........... From f459738c73a78831ff4b776ea0d4bb91a82d02bf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 18:37:21 +0100 Subject: [PATCH 4/6] adapt import --- imblearn/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index d02aea9a5..012e06634 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -7,7 +7,13 @@ from abc import ABCMeta, abstractmethod import numpy as np -from sklearn.base import BaseEstimator, OneToOneFeatureMixin +from sklearn.base import BaseEstimator + +try: + # scikit-learn >= 1.2 + from sklearn.base import OneToOneFeatureMixin +except ImportError: + from sklearn.base import _OneToOneFeatureMixin # noqa from sklearn.preprocessing import label_binarize from sklearn.utils.multiclass import check_classification_targets From 1a17906ef43d1ffae1dc76b6a7f2fb8960afc038 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 18:37:31 +0100 Subject: [PATCH 5/6] adapt import --- imblearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/base.py b/imblearn/base.py index 012e06634..dd4e1b3a8 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -13,7 +13,7 @@ # scikit-learn >= 1.2 from sklearn.base import OneToOneFeatureMixin except ImportError: - from sklearn.base import _OneToOneFeatureMixin # noqa + from sklearn.base import _OneToOneFeatureMixin as OneToOneFeatureMixin from sklearn.preprocessing import label_binarize from sklearn.utils.multiclass import check_classification_targets From 62e4334c602b73ef9d8a74906625c95d93be21e1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 5 Dec 2022 19:02:24 +0100 Subject: [PATCH 6/6] TST update warns and raises --- imblearn/utils/estimator_checks.py | 35 +++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 254b4e236..e5f50a668 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -699,15 +699,34 @@ def check_dataframe_column_names_consistency(name, estimator_orig): for invalid_name, additional_message in invalid_names: X_bad = pd.DataFrame(X, columns=invalid_name) - expected_msg = re.escape( - "The feature names should match those that were passed during fit.\n" - f"{additional_message}" - ) for name, method in check_methods: - with raises( - ValueError, match=expected_msg, err_msg=f"{name} did not raise" - ): - method(X_bad) + if sklearn_version >= parse_version("1.2"): + expected_msg = re.escape( + "The feature names should match those that were passed during fit." + f"\n{additional_message}" + ) + with raises( + ValueError, match=expected_msg, err_msg=f"{name} did not raise" + ): + method(X_bad) + else: + expected_msg = re.escape( + "The feature names should match those that were passed " + "during fit. Starting version 1.2, an error will be raised.\n" + f"{additional_message}" + ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "error", + category=FutureWarning, + module="sklearn", + ) + with raises( + FutureWarning, + match=expected_msg, + err_msg=f"{name} did not raise", + ): + method(X_bad) # partial_fit checks on second call # Do not call partial fit if early_stopping is on