diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d38ee7b8b589a..f0e8f79c542a8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -443,6 +443,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.cumsum`, :meth:`pandas.core.groupby.GroupBy.cumprod`, :meth:`pandas.core.groupby.GroupBy.cummin` and :meth:`pandas.core.groupby.GroupBy.cummax` with categorical column having absent categories, would return incorrect result or segfault (:issue:`16771`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where NA values in the grouping would return incorrect results (:issue:`26011`) - Bug in :meth:`pandas.core.groupby.SeriesGroupBy.transform` where transforming an empty group would raise error (:issue:`26208`) +- Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns (:issue:`26310`) Reshaping diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a19304f1a3ac5..b5cd73a81962b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -19,6 +19,8 @@ is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable, is_scalar, is_sequence, is_string_like) +from pandas._typing import ArrayLike + _POSSIBLY_CAST_DTYPES = {np.dtype(t).name for t in ['O', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64']} @@ -87,10 +89,10 @@ def ensure_categorical(arr): return arr -def ensure_int64_or_float64(arr, copy=False): +def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array: """ Ensure that an dtype array of some integer dtype - has an int64 dtype if possible + has an int64 dtype if possible. If it's not possible, potentially because of overflow, convert the array to float64 instead. @@ -107,9 +109,18 @@ def ensure_int64_or_float64(arr, copy=False): out_arr : The input array cast as int64 if possible without overflow. Otherwise the input array cast to float64. 
+ + Notes + ----- + If the array is explicitly of type uint64 the type + will remain unchanged. """ try: return arr.astype('int64', copy=copy, casting='safe') + except TypeError: + pass + try: + return arr.astype('uint64', copy=copy, casting='safe') except TypeError: return arr.astype('float64', copy=copy) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 235975a9b68a0..4c6796fbc4ac8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -17,7 +17,7 @@ from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( - ensure_float64, ensure_int64, ensure_int64_or_float64, ensure_object, + ensure_float64, ensure_int64, ensure_int_or_float, ensure_object, ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_timedelta64_dtype, needs_i8_conversion) @@ -486,7 +486,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, if (values == iNaT).any(): values = ensure_float64(values) else: - values = ensure_int64_or_float64(values) + values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 66ea5ac244398..6f54d05680698 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -313,3 +313,16 @@ def test_order_aggregate_multiple_funcs(): expected = pd.Index(['sum', 'max', 'mean', 'ohlc', 'min']) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('dtype', [np.int64, np.uint64]) +@pytest.mark.parametrize('how', ['first', 'last', 'min', + 'max', 'mean', 'median']) +def test_uint64_type_handling(dtype, how): + # GH 26310 + df = pd.DataFrame({'x': 6903052872240755750, 'y': [1, 2]}) + expected = df.groupby('y').agg({'x': how}) + df.x = 
df.x.astype(dtype) + result = df.groupby('y').agg({'x': how}) + result.x = result.x.astype(np.int64) + tm.assert_frame_equal(result, expected, check_exact=True)