From 1468596be14966d6f7c718dcd9062d18c46201fb Mon Sep 17 00:00:00 2001 From: timschulz Date: Tue, 28 Nov 2023 14:12:47 +0100 Subject: [PATCH 1/5] Fix recreation of DataFrame from array to be able to handle sparse data --- imblearn/tests/test_common.py | 13 +++++++++++++ imblearn/utils/_validation.py | 6 +++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index d78cafd83..ccfd259c8 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -7,6 +7,7 @@ from collections import OrderedDict import numpy as np +import pandas as pd import pytest from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -108,3 +109,15 @@ def test_pandas_column_name_consistency(estimator): ) for warning in record: assert "was fitted without feature names" not in str(warning.message) + + +def test_pandas_sparsity_preserved(): + df = pd.DataFrame( + {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0) + ) + y = pd.Series([0] * 18 + [1] * 2) + + ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1) + new_df, new_y = ros.fit_resample(df, y) + for column_dtype in new_df.dtypes: + assert isinstance(column_dtype, pd.SparseDtype) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index a36e6d81b..bf1d8351f 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -10,6 +10,7 @@ from numbers import Integral, Real import numpy as np +from scipy.sparse import issparse from sklearn.base import clone from sklearn.neighbors import NearestNeighbors from sklearn.utils import check_array, column_or_1d @@ -61,7 +62,10 @@ def _transfrom_one(self, array, props): elif type_ == "dataframe": import pandas as pd - ret = pd.DataFrame(array, columns=props["columns"]) + if issparse(array): + ret = pd.DataFrame.sparse.from_spmatrix(array, columns=props["columns"]) + else: + ret = pd.DataFrame(array, 
columns=props["columns"]) ret = ret.astype(props["dtypes"]) elif type_ == "series": import pandas as pd From 70e3f6ee2e60c1cd1ea083a7e2e8e002ef0630ad Mon Sep 17 00:00:00 2001 From: timschulz Date: Tue, 28 Nov 2023 16:34:49 +0100 Subject: [PATCH 2/5] Move test for sparse DataFrame to test_random_over_sampler.py --- .../tests/test_random_over_sampler.py | 15 +++++++++++++++ imblearn/tests/test_common.py | 13 ------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 6ad4b75ef..2d198f609 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -287,3 +287,18 @@ def test_random_over_sampling_datetime(): pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) pd.testing.assert_index_equal(X_res.index, y_res.index) assert_array_equal(y_res.to_numpy(), np.array([0, 0, 0, 1, 1, 1])) + + +def test_pandas_sparsity_preserved(): + """Check that a sparse DataFrame can be handled and is still sparse + after oversampling.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame( + {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0) + ) + y = pd.Series([0] * 18 + [1] * 2) + + ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1) + new_df, new_y = ros.fit_resample(df, y) + for column_dtype in new_df.dtypes: + assert isinstance(column_dtype, pd.SparseDtype) diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py index ccfd259c8..d78cafd83 100644 --- a/imblearn/tests/test_common.py +++ b/imblearn/tests/test_common.py @@ -7,7 +7,6 @@ from collections import OrderedDict import numpy as np -import pandas as pd import pytest from sklearn.base import clone from sklearn.exceptions import ConvergenceWarning @@ -109,15 +108,3 @@ def test_pandas_column_name_consistency(estimator): ) for warning in record: assert "was 
fitted without feature names" not in str(warning.message) - - -def test_pandas_sparsity_preserved(): - df = pd.DataFrame( - {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0) - ) - y = pd.Series([0] * 18 + [1] * 2) - - ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1) - new_df, new_y = ros.fit_resample(df, y) - for column_dtype in new_df.dtypes: - assert isinstance(column_dtype, pd.SparseDtype) From d6da23bbf1f7b70c4dae960801498d8e58bb7c82 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jan 2024 17:34:19 +0100 Subject: [PATCH 3/5] add common test --- .../tests/test_random_over_sampler.py | 15 ---------- imblearn/utils/estimator_checks.py | 29 +++++++++++++++++++ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py index 2d198f609..6ad4b75ef 100644 --- a/imblearn/over_sampling/tests/test_random_over_sampler.py +++ b/imblearn/over_sampling/tests/test_random_over_sampler.py @@ -287,18 +287,3 @@ def test_random_over_sampling_datetime(): pd.testing.assert_series_equal(X_res.dtypes, X.dtypes) pd.testing.assert_index_equal(X_res.index, y_res.index) assert_array_equal(y_res.to_numpy(), np.array([0, 0, 0, 1, 1, 1])) - - -def test_pandas_sparsity_preserved(): - """Check that a sparse DataFrame can be handled and is still sparse - after oversampling.""" - pd = pytest.importorskip("pandas") - df = pd.DataFrame( - {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0) - ) - y = pd.Series([0] * 18 + [1] * 2) - - ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1) - new_df, new_y = ros.fit_resample(df, y) - for column_dtype in new_df.dtypes: - assert isinstance(column_dtype, pd.SparseDtype) diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index eae78099e..570427759 100644 --- a/imblearn/utils/estimator_checks.py +++ 
b/imblearn/utils/estimator_checks.py @@ -108,6 +108,7 @@ def _yield_sampler_checks(sampler): yield check_samplers_sparse if "dataframe" in tags["X_types"]: yield check_samplers_pandas + yield check_samplers_pandas_sparse if "string" in tags["X_types"]: yield check_samplers_string if tags["allow_nan"]: @@ -312,6 +313,34 @@ def check_samplers_sparse(name, sampler_orig): assert_allclose(y_res_sparse, y_res) +def check_samplers_pandas_sparse(name, sampler_orig): + pd = pytest.importorskip("pandas") + sampler = clone(sampler_orig) + # Check that the samplers handle pandas dataframe and pandas series + X, y = sample_dataset_generator() + X_df = pd.DataFrame( + X, columns=[str(i) for i in range(X.shape[1])], dtype=pd.SparseDtype(float, 0) + ) + y_s = pd.Series(y, name="class") + + X_res_df, y_res_s = sampler.fit_resample(X_df, y_s) + X_res, y_res = sampler.fit_resample(X, y) + + # check that we return the same type for dataframes or series types + assert isinstance(X_res_df, pd.DataFrame) + assert isinstance(y_res_s, pd.Series) + + for column_dtype in X_res_df.dtypes: + assert isinstance(column_dtype, pd.SparseDtype) + + assert X_df.columns.tolist() == X_res_df.columns.tolist() + assert y_s.name == y_res_s.name + + # FIXME: we should use to_numpy with pandas >= 0.25 + assert_allclose(X_res_df.values, X_res) + assert_allclose(y_res_s.values, y_res) + + def check_samplers_pandas(name, sampler_orig): pd = pytest.importorskip("pandas") sampler = clone(sampler_orig) From 2f179b98c0388c633288899596ea5399c001d09c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jan 2024 17:36:16 +0100 Subject: [PATCH 4/5] add changelog --- doc/whats_new/v0.12.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.12.rst b/doc/whats_new/v0.12.rst index 1c4325356..bfb33c3f0 100644 --- a/doc/whats_new/v0.12.rst +++ b/doc/whats_new/v0.12.rst @@ -42,3 +42,9 @@ Deprecations - Deprecate `kind_sel` in :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule. 
It will be removed in 0.14. The parameter does not have any effect. :pr:`1012` by :user:`Guillaume Lemaitre `. + +Enhancements +............ + +- Preserve type from dataframe with sparse input. + :pr:`1059` by :user:`ts2095 `. From 7c8f25807c00a1b38b0d1603a395a362a0b84087 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 19 Jan 2024 17:39:30 +0100 Subject: [PATCH 5/5] update changelog --- doc/whats_new/v0.12.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.12.rst b/doc/whats_new/v0.12.rst index bfb33c3f0..88017b547 100644 --- a/doc/whats_new/v0.12.rst +++ b/doc/whats_new/v0.12.rst @@ -46,5 +46,5 @@ Deprecations Enhancements ............ -- Preserve type from dataframe with sparse input. +- Allow samplers to output a dataframe in sparse format when a sparse dataframe is provided as input. :pr:`1059` by :user:`ts2095 `.