From ff5d50c4678640df16b582cecc0bb4e975c3932a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 15:38:20 +0200 Subject: [PATCH 1/3] Backport PR #62323: String dtype: keep select_dtypes(include=object) selecting string columns --- doc/source/whatsnew/v2.3.3.rst | 10 ++++++++++ pandas/core/dtypes/cast.py | 4 +++- pandas/core/frame.py | 12 ++++++++---- .../tests/frame/methods/test_select_dtypes.py | 18 ++++++++++-------- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index aaed7544d9975..0a029f683b6cb 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -18,6 +18,16 @@ Most changes in this release are related to :class:`StringDtype` which will become the default string dtype in pandas 3.0. See :ref:`whatsnew_230.upcoming_changes` for more details. +.. _whatsnew_233.string_fixes.improvements: + +Improvements +^^^^^^^^^^^^ +- Update :meth:`DataFrame.select_dtypes` to keep selecting ``str`` columns when + specifying ``include=["object"]`` for backwards compatibility. In a future + release, this will be deprecated and code for pandas 3+ should be updated to + do ``include=["str"]`` (:issue:`61916`) + + .. 
_whatsnew_233.string_fixes.bugs: Bug fixes diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d4263f7488a14..9a7cfc0dec84e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -966,7 +966,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None: np.dtype("<U").type, # type: ignore[arg-type] } if non_string_dtypes != dtype_set: - raise TypeError("string dtypes are not allowed, use 'object' instead") + raise TypeError( + "numpy string dtypes are not allowed, use 'str' or 'object' instead" + ) def coerce_indexer_dtype(indexer, categories) -> np.ndarray: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f8b2465f46d4d..7ff4fac3ef2b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5080,10 +5080,14 @@ def check_int_infer_dtype(dtypes): def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: # GH 46870: BooleanDtype._is_numeric == True but should be excluded dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype - return issubclass(dtype.type, tuple(dtypes_set)) or ( - np.number in dtypes_set - and getattr(dtype, "_is_numeric", False) - and not is_bool_dtype(dtype) + return ( + issubclass(dtype.type, tuple(dtypes_set)) + or ( + np.number in dtypes_set + and getattr(dtype, "_is_numeric", False) + and not is_bool_dtype(dtype) + ) + or (dtype.type is str and np.object_ in dtypes_set) ) def predicate(arr: ArrayLike) -> bool: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 0354e9df3d168..d3e28d328c8fd 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -102,6 +102,10 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ri = df.select_dtypes(include=[str]) tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=["object"]) + ei = df[["a"]] + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -309,17 +313,15 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - if using_infer_string: - e =
df[["b"]] - else: - e = df[["a", "b"]] + # if using_infer_string: + # TODO warn + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - if using_infer_string: - e = df[["b", "g"]] - else: - e = df[["a", "b", "g"]] + # if using_infer_string: + # TODO warn + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): From 5b348388b3d7514b2fe6351a69dac6fc35b02007 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 21 Sep 2025 15:58:14 +0200 Subject: [PATCH 2/3] limit change to default str dtype --- pandas/core/frame.py | 8 ++++++- .../tests/frame/methods/test_select_dtypes.py | 24 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7ff4fac3ef2b8..c99ba07a55d46 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -144,6 +144,7 @@ TimedeltaArray, ) from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import ( ensure_wrapped_if_datetimelike, sanitize_array, @@ -5087,7 +5088,12 @@ def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: and getattr(dtype, "_is_numeric", False) and not is_bool_dtype(dtype) ) - or (dtype.type is str and np.object_ in dtypes_set) + # backwards compat for the default `str` dtype being selected by object + or ( + isinstance(dtype, StringDtype) + and dtype.na_value is np.nan + and np.object_ in dtypes_set + ) ) def predicate(arr: ArrayLike) -> bool: diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index d3e28d328c8fd..19b4448521c62 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -485,3 +485,27 @@ def test_select_dtypes_no_view(self): result = df.select_dtypes(include=["number"]) result.iloc[0, 0] = 0 tm.assert_frame_equal(df, df_orig) + + 
def test_select_dtype_object_and_str(self, using_infer_string): + # https://github.com/pandas-dev/pandas/issues/61916 + df = DataFrame( + { + "a": ["a", "b", "c"], + "b": [1, 2, 3], + "c": pd.array(["a", "b", "c"], dtype="string"), + } + ) + + # with "object" -> only select the object or default str dtype column + result = df.select_dtypes(include=["object"]) + expected = df[["a"]] + tm.assert_frame_equal(result, expected) + + # with "string" -> select both the default 'str' and the nullable 'string' + result = df.select_dtypes(include=["string"]) + if using_infer_string: + expected = df[["a", "c"]] + else: + expected = df[["c"]] + expected = df[["a", "c"]] + tm.assert_frame_equal(result, expected) From 2f14267abc8af150d11a07381ac692f3dd17da33 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Sep 2025 08:53:35 +0200 Subject: [PATCH 3/3] fixup --- pandas/tests/frame/methods/test_select_dtypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 19b4448521c62..1ba6b9c437726 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -507,5 +507,4 @@ def test_select_dtype_object_and_str(self, using_infer_string): expected = df[["a", "c"]] else: expected = df[["c"]] - expected = df[["a", "c"]] tm.assert_frame_equal(result, expected)