Skip to content
Merged
14 changes: 14 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2303,6 +2303,20 @@ def _groupby_op(
**kwargs,
):
if isinstance(self.dtype, StringDtype):
if how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
return super()._groupby_op(
how=how,
has_dropped_na=has_dropped_na,
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2608,6 +2608,20 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4162,9 +4162,9 @@ def quantile(
starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
if is_object_dtype(vals.dtype):
if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype):
raise TypeError(
"'quantile' cannot be performed against 'object' dtypes!"
f"dtype '{vals.dtype}' does not support operation 'quantile'"
)

inference: DtypeObj | None = None
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2113,7 +2113,7 @@ def test_unstack_period_frame(self):
@pytest.mark.filterwarnings(
"ignore:The previous implementation of stack is deprecated"
)
def test_stack_multiple_bug(self, future_stack):
def test_stack_multiple_bug(self, future_stack, using_infer_string):
# bug when some uniques are not present in the data GH#3170
id_col = ([1] * 3) + ([2] * 3)
name = (["a"] * 3) + (["b"] * 3)
Expand All @@ -2125,6 +2125,8 @@ def test_stack_multiple_bug(self, future_stack):
multi.columns.name = "Params"
unst = multi.unstack("ID")
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
unst.resample("W-THU").mean()
down = unst.resample("W-THU").mean(numeric_only=True)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,11 @@ def test_cython_agg_return_dict():

def test_cython_fail_agg():
dr = bdate_range("1/1/2000", periods=50)
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr)

grouped = ts.groupby(lambda x: x.month)
summed = grouped.sum()
expected = grouped.agg(np.sum)
expected = grouped.agg(np.sum).astype(object)
Comment on lines 149 to +155
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was there a specific reason you added an explicit `dtype=object` here? It seems you only added it in the last commit, after updating the test for `sum()` being implemented, so I think it is actually no longer needed now.

tm.assert_series_equal(summed, expected)


Expand Down
9 changes: 4 additions & 5 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
msg = "dtype 'object' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("key").quantile()


Expand Down Expand Up @@ -241,7 +242,6 @@ def test_groupby_quantile_nullable_array(values, q):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
Expand All @@ -251,9 +251,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
):
msg = "dtype '.*' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("a").quantile(q, numeric_only=numeric_only)


Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def test_frame_multi_key_function_list():
tm.assert_frame_equal(agged, expected)


def test_frame_multi_key_function_list_partial_failure():
def test_frame_multi_key_function_list_partial_failure(using_infer_string):
data = DataFrame(
{
"A": [
Expand Down Expand Up @@ -476,6 +476,8 @@ def test_frame_multi_key_function_list_partial_failure():
grouped = data.groupby(["A", "B"])
funcs = ["mean", "std"]
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)

Expand Down Expand Up @@ -662,9 +664,11 @@ def test_groupby_multi_corner(df):
tm.assert_frame_equal(agged, expected)


def test_raises_on_nuisance(df):
def test_raises_on_nuisance(df, using_infer_string):
grouped = df.groupby("A")
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -699,15 +703,18 @@ def test_keep_nuisance_agg(df, agg_function):
["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
# GH 38774, GH 38815
grouped = df.groupby("A")

no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
if agg_function in no_drop_nuisance and not numeric_only:
# Added numeric_only as part of GH#46560; these do not drop nuisance
# columns when numeric_only is False
if agg_function in ("std", "sem"):
if using_infer_string:
msg = f"dtype 'str' does not support operation '{agg_function}'"
klass = TypeError
elif agg_function in ("std", "sem"):
klass = ValueError
msg = "could not convert string to float: 'one'"
else:
Expand All @@ -728,16 +735,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
tm.assert_frame_equal(result, expected)


def test_raise_on_nuisance_python_single(df):
def test_raise_on_nuisance_python_single(df, using_infer_string):
# GH 38815
grouped = df.groupby("A")
with pytest.raises(ValueError, match="could not convert"):

err = ValueError
msg = "could not convert"
if using_infer_string:
err = TypeError
msg = "dtype 'str' does not support operation 'skew'"
with pytest.raises(err, match=msg):
grouped.skew()


def test_raise_on_nuisance_python_multiple(three_group):
def test_raise_on_nuisance_python_multiple(three_group, using_infer_string):
grouped = three_group.groupby(["A", "B"])
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
grouped.agg("mean")
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -775,12 +790,16 @@ def test_nonsense_func():
df.groupby(lambda x: x + "foo")


def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
def test_wrap_aggregated_output_multindex(
multiindex_dataframe_random_data, using_infer_string
):
df = multiindex_dataframe_random_data.T
df["baz", "two"] = "peekaboo"

keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
msg = re.escape("agg function failed [how->mean,dtype->")
if using_infer_string:
msg = "dtype 'str' does not support operation 'mean'"
with pytest.raises(TypeError, match=msg):
df.groupby(keys).agg("mean")
agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
Expand Down Expand Up @@ -960,8 +979,10 @@ def test_groupby_with_hier_columns():

def test_grouping_ndarray(df):
grouped = df.groupby(df["A"].values)
grouped2 = df.groupby(df["A"].rename(None))

result = grouped.sum()
expected = df.groupby(df["A"].rename(None)).sum()
expected = grouped2.sum()
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1457,8 +1478,8 @@ def test_no_dummy_key_names(df):
result = df.groupby(df["A"].values).sum()
assert result.index.name is None

result = df.groupby([df["A"].values, df["B"].values]).sum()
assert result.index.names == (None, None)
result2 = df.groupby([df["A"].values, df["B"].values]).sum()
assert result2.index.names == (None, None)


def test_groupby_sort_multiindex_series():
Expand Down Expand Up @@ -1761,6 +1782,7 @@ def get_categorical_invalid_expected():
is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
is_dt64 = df.dtypes.iloc[0].kind == "M"
is_cat = isinstance(values, Categorical)
is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)

if (
isinstance(values, Categorical)
Expand All @@ -1785,13 +1807,15 @@ def get_categorical_invalid_expected():

if op in ["prod", "sum", "skew"]:
# ops that require more than just ordered-ness
if is_dt64 or is_cat or is_per:
if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
# GH#41291
# datetime64 -> prod and sum are invalid
if is_dt64:
msg = "datetime64 type does not support"
elif is_per:
msg = "Period type does not support"
elif is_str:
msg = f"dtype 'str' does not support operation '{op}'"
else:
msg = "category type does not support"
if op == "skew":
Expand Down Expand Up @@ -2714,7 +2738,7 @@ def test_obj_with_exclusions_duplicate_columns():
def test_groupby_numeric_only_std_no_result(numeric_only):
# GH 51080
dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
df = DataFrame(dicts_non_numeric)
df = DataFrame(dicts_non_numeric, dtype=object)
dfgb = df.groupby("a", as_index=False, sort=False)

if numeric_only:
Expand Down Expand Up @@ -2773,10 +2797,14 @@ def test_grouping_with_categorical_interval_columns():
def test_groupby_sum_on_nan_should_return_nan(bug_var):
# GH 24196
df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
if isinstance(bug_var, str):
df = df.astype(object)
dfgb = df.groupby(lambda x: x)
result = dfgb.sum(min_count=1)

expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
expected_df = DataFrame(
[bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype
)
tm.assert_frame_equal(result, expected_df)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_groupby_subclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj):

df = obj(
{
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
"Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
Expand Down
20 changes: 14 additions & 6 deletions pandas/tests/groupby/test_numeric_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def df(self):
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": list("abc"),
"string": Series(["a", "b", "c"], dtype="str"),
"object": Series(["a", "b", "c"], dtype=object),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
Expand All @@ -40,6 +41,7 @@ def df(self):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -112,6 +114,7 @@ def test_first_last(self, df, method):
"int",
"float",
"string",
"object",
"category_string",
"category_int",
"datetime",
Expand Down Expand Up @@ -159,7 +162,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):

# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if method.startswith("cum") else TypeError
exception = (
(NotImplementedError, TypeError) if method.startswith("cum") else TypeError
)

if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
# The methods default to numeric_only=False and raise TypeError
Expand All @@ -170,6 +175,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -180,7 +186,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
"category type does not support sum operations",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
re.escape(f"agg function failed [how->{method},dtype->str]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand All @@ -198,7 +204,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
re.escape(f"agg function failed [how->{method},dtype->str]"),
f"dtype 'str' does not support operation '{method}'",
]
)
with pytest.raises(exception, match=msg):
Expand Down Expand Up @@ -299,7 +305,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "idxmin":
if kernel == "quantile":
msg = "dtype 'object' does not support operation 'quantile'"
elif kernel == "idxmin":
msg = "'<' not supported between instances of 'type' and 'type'"
elif kernel == "idxmax":
msg = "'>' not supported between instances of 'type' and 'type'"
Expand Down Expand Up @@ -379,7 +387,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
# that succeed should not be allowed to fail (without deprecation, at least)
if groupby_func in fails_on_numeric_object and dtype is object:
if groupby_func == "quantile":
msg = "cannot be performed against 'object' dtypes"
msg = "dtype 'object' does not support operation 'quantile'"
else:
msg = "is not supported for object dtype"
with pytest.raises(TypeError, match=msg):
Expand Down
Loading
Loading