From 1468596be14966d6f7c718dcd9062d18c46201fb Mon Sep 17 00:00:00 2001
From: timschulz <tim.schulz@ginkgo-analytics.com>
Date: Tue, 28 Nov 2023 14:12:47 +0100
Subject: [PATCH 1/5] Fix recreation of DataFrame from array to be able to
 handle sparse data

---
 imblearn/tests/test_common.py | 13 +++++++++++++
 imblearn/utils/_validation.py |  6 +++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py
index d78cafd83..ccfd259c8 100644
--- a/imblearn/tests/test_common.py
+++ b/imblearn/tests/test_common.py
@@ -7,6 +7,7 @@
 from collections import OrderedDict
 
 import numpy as np
+import pandas as pd
 import pytest
 from sklearn.base import clone
 from sklearn.exceptions import ConvergenceWarning
@@ -108,3 +109,15 @@ def test_pandas_column_name_consistency(estimator):
             )
         for warning in record:
             assert "was fitted without feature names" not in str(warning.message)
+
+
+def test_pandas_sparsity_preserved():
+    df = pd.DataFrame(
+        {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0)
+    )
+    y = pd.Series([0] * 18 + [1] * 2)
+
+    ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1)
+    new_df, new_y = ros.fit_resample(df, y)
+    for column_dtype in new_df.dtypes:
+        assert isinstance(column_dtype, pd.SparseDtype)
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index a36e6d81b..bf1d8351f 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -10,6 +10,7 @@
 from numbers import Integral, Real
 
 import numpy as np
+from scipy.sparse import issparse
 from sklearn.base import clone
 from sklearn.neighbors import NearestNeighbors
 from sklearn.utils import check_array, column_or_1d
@@ -61,7 +62,10 @@ def _transfrom_one(self, array, props):
         elif type_ == "dataframe":
             import pandas as pd
 
-            ret = pd.DataFrame(array, columns=props["columns"])
+            if issparse(array):
+                ret = pd.DataFrame.sparse.from_spmatrix(array, columns=props["columns"])
+            else:
+                ret = pd.DataFrame(array, columns=props["columns"])
             ret = ret.astype(props["dtypes"])
         elif type_ == "series":
             import pandas as pd

From 70e3f6ee2e60c1cd1ea083a7e2e8e002ef0630ad Mon Sep 17 00:00:00 2001
From: timschulz <tim.schulz@ginkgo-analytics.com>
Date: Tue, 28 Nov 2023 16:34:49 +0100
Subject: [PATCH 2/5] Move test for sparse DataFrame to
 test_random_over_sampler.py

---
 .../tests/test_random_over_sampler.py             | 15 +++++++++++++++
 imblearn/tests/test_common.py                     | 13 -------------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
index 6ad4b75ef..2d198f609 100644
--- a/imblearn/over_sampling/tests/test_random_over_sampler.py
+++ b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -287,3 +287,18 @@ def test_random_over_sampling_datetime():
     pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
     pd.testing.assert_index_equal(X_res.index, y_res.index)
     assert_array_equal(y_res.to_numpy(), np.array([0, 0, 0, 1, 1, 1]))
+
+
+def test_pandas_sparsity_preserved():
+    """Check that a sparse DataFrame can be handled and is still sparse
+    after oversampling."""
+    pd = pytest.importorskip("pandas")
+    df = pd.DataFrame(
+        {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0)
+    )
+    y = pd.Series([0] * 18 + [1] * 2)
+
+    ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1)
+    new_df, new_y = ros.fit_resample(df, y)
+    for column_dtype in new_df.dtypes:
+        assert isinstance(column_dtype, pd.SparseDtype)
diff --git a/imblearn/tests/test_common.py b/imblearn/tests/test_common.py
index ccfd259c8..d78cafd83 100644
--- a/imblearn/tests/test_common.py
+++ b/imblearn/tests/test_common.py
@@ -7,7 +7,6 @@
 from collections import OrderedDict
 
 import numpy as np
-import pandas as pd
 import pytest
 from sklearn.base import clone
 from sklearn.exceptions import ConvergenceWarning
@@ -109,15 +108,3 @@ def test_pandas_column_name_consistency(estimator):
             )
         for warning in record:
             assert "was fitted without feature names" not in str(warning.message)
-
-
-def test_pandas_sparsity_preserved():
-    df = pd.DataFrame(
-        {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0)
-    )
-    y = pd.Series([0] * 18 + [1] * 2)
-
-    ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1)
-    new_df, new_y = ros.fit_resample(df, y)
-    for column_dtype in new_df.dtypes:
-        assert isinstance(column_dtype, pd.SparseDtype)

From d6da23bbf1f7b70c4dae960801498d8e58bb7c82 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 19 Jan 2024 17:34:19 +0100
Subject: [PATCH 3/5] add common test

---
 .../tests/test_random_over_sampler.py         | 15 ----------
 imblearn/utils/estimator_checks.py            | 29 +++++++++++++++++++
 2 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
index 2d198f609..6ad4b75ef 100644
--- a/imblearn/over_sampling/tests/test_random_over_sampler.py
+++ b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -287,18 +287,3 @@ def test_random_over_sampling_datetime():
     pd.testing.assert_series_equal(X_res.dtypes, X.dtypes)
     pd.testing.assert_index_equal(X_res.index, y_res.index)
     assert_array_equal(y_res.to_numpy(), np.array([0, 0, 0, 1, 1, 1]))
-
-
-def test_pandas_sparsity_preserved():
-    """Check that a sparse DataFrame can be handled and is still sparse
-    after oversampling."""
-    pd = pytest.importorskip("pandas")
-    df = pd.DataFrame(
-        {"a": [0, 1] * 10, "b": [0, 1] * 10}, dtype=pd.SparseDtype(float, 0)
-    )
-    y = pd.Series([0] * 18 + [1] * 2)
-
-    ros = RandomOverSampler(sampling_strategy=1, random_state=42, shrinkage=1)
-    new_df, new_y = ros.fit_resample(df, y)
-    for column_dtype in new_df.dtypes:
-        assert isinstance(column_dtype, pd.SparseDtype)
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index eae78099e..570427759 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -108,6 +108,7 @@ def _yield_sampler_checks(sampler):
         yield check_samplers_sparse
     if "dataframe" in tags["X_types"]:
         yield check_samplers_pandas
+        yield check_samplers_pandas_sparse
     if "string" in tags["X_types"]:
         yield check_samplers_string
     if tags["allow_nan"]:
@@ -312,6 +313,34 @@ def check_samplers_sparse(name, sampler_orig):
     assert_allclose(y_res_sparse, y_res)
 
 
+def check_samplers_pandas_sparse(name, sampler_orig):
+    pd = pytest.importorskip("pandas")
+    sampler = clone(sampler_orig)
+    # Check that the samplers handle sparse pandas dataframe and pandas series
+    X, y = sample_dataset_generator()
+    X_df = pd.DataFrame(
+        X, columns=[str(i) for i in range(X.shape[1])], dtype=pd.SparseDtype(float, 0)
+    )
+    y_s = pd.Series(y, name="class")
+
+    X_res_df, y_res_s = sampler.fit_resample(X_df, y_s)
+    X_res, y_res = sampler.fit_resample(X, y)  # reference: same sampler on raw X, y
+
+    # check that the outputs keep the container types (DataFrame / Series)
+    assert isinstance(X_res_df, pd.DataFrame)
+    assert isinstance(y_res_s, pd.Series)
+
+    for column_dtype in X_res_df.dtypes:  # every column must remain sparse
+        assert isinstance(column_dtype, pd.SparseDtype)
+
+    assert X_df.columns.tolist() == X_res_df.columns.tolist()
+    assert y_s.name == y_res_s.name
+
+    # FIXME: we should use to_numpy with pandas >= 0.25
+    assert_allclose(X_res_df.values, X_res)
+    assert_allclose(y_res_s.values, y_res)
+
+
 def check_samplers_pandas(name, sampler_orig):
     pd = pytest.importorskip("pandas")
     sampler = clone(sampler_orig)

From 2f179b98c0388c633288899596ea5399c001d09c Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 19 Jan 2024 17:36:16 +0100
Subject: [PATCH 4/5] add changelog

---
 doc/whats_new/v0.12.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new/v0.12.rst b/doc/whats_new/v0.12.rst
index 1c4325356..bfb33c3f0 100644
--- a/doc/whats_new/v0.12.rst
+++ b/doc/whats_new/v0.12.rst
@@ -42,3 +42,9 @@ Deprecations
 - Deprecate `kind_sel` in :class:`~imblearn.under_sampling.NeighbourhoodCleaningRule.
   It will be removed in 0.14. The parameter does not have any effect.
   :pr:`1012` by :user:`Guillaume Lemaitre <glemaitre>`.
+
+Enhancements
+............
+
+- Preserve type from dataframe with sparse input.
+  :pr:`1059` by :user:`ts2095 <ts2095>`.

From 7c8f25807c00a1b38b0d1603a395a362a0b84087 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Date: Fri, 19 Jan 2024 17:39:30 +0100
Subject: [PATCH 5/5] update changelog

---
 doc/whats_new/v0.12.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.12.rst b/doc/whats_new/v0.12.rst
index bfb33c3f0..88017b547 100644
--- a/doc/whats_new/v0.12.rst
+++ b/doc/whats_new/v0.12.rst
@@ -46,5 +46,5 @@ Deprecations
 Enhancements
 ............
 
-- Preserve type from dataframe with sparse input.
+- Allow outputting a dataframe with sparse format if one is provided as input.
   :pr:`1059` by :user:`ts2095 <ts2095>`.