Skip to content
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,7 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in reductions (sum, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`).
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).


.. ---------------------------------------------------------------------------
Expand Down
48 changes: 31 additions & 17 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
for missing values.
"""

from typing import Callable

import numpy as np

from pandas._libs import missing as libmissing
Expand All @@ -11,14 +13,19 @@
from pandas.core.nanops import check_below_min_count


def sum(
values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
def _sumprod(
func: Callable,
values: np.ndarray,
mask: np.ndarray,
skipna: bool = True,
min_count: int = 0,
):
"""
Sum for 1D masked array.
Sum or product for 1D masked array.

Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that supports the
operation).
Expand All @@ -31,23 +38,33 @@ def sum(
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
if mask.any():
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
if check_below_min_count(values.shape, None, min_count):
return libmissing.NA
return np.sum(values)
return func(values)
else:
if check_below_min_count(values.shape, mask, min_count):
return libmissing.NA

if _np_version_under1p17:
return np.sum(values[~mask])
return func(values[~mask])
else:
return np.sum(values, where=~mask)
return func(values, where=~mask)


def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
    """
    Sum for 1D masked array.

    Forwards to ``_sumprod`` using ``np.sum`` as the reduction function.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Array passed through as the mask; entries where it is True are
        left out of the reduction when ``skipna`` is True.
    skipna : bool, default True
        Whether to skip NA.
    min_count : int, default 0
        Passed through unchanged to ``_sumprod``.
    """
    return _sumprod(np.sum, values, mask, skipna=skipna, min_count=min_count)


def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
    """
    Product for 1D masked array.

    Forwards to ``_sumprod`` using ``np.prod`` as the reduction function.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray
        Array passed through as the mask; entries where it is True are
        left out of the reduction when ``skipna`` is True.
    skipna : bool, default True
        Whether to skip NA.
    min_count : int, default 0
        Passed through unchanged to ``_sumprod``.
    """
    return _sumprod(np.prod, values, mask, skipna=skipna, min_count=min_count)


def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
"""
Reduction for 1D masked array.

Expand All @@ -63,18 +80,15 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
Whether to skip NA.
"""
if not skipna:
if mask.any():
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
if values.size:
return func(values)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
return func(values)
else:
subset = values[~mask]
if subset.size:
return func(values[~mask])
return func(subset)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
Expand Down
10 changes: 2 additions & 8 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.missing import isna

from pandas.core import nanops, ops
from pandas.core.array_algos import masked_reductions
Expand Down Expand Up @@ -686,7 +686,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -700,12 +700,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if np.isnan(result):
return libmissing.NA

# if we have numeric op that would result in an int, coerce to int if possible
if name == "prod" and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
9 changes: 1 addition & 8 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

from pandas.core import nanops, ops
from pandas.core.array_algos import masked_reductions
import pandas.core.common as com
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops.common import unpack_zerodim_and_defer
Expand Down Expand Up @@ -557,7 +556,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -576,12 +575,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if name in ["any", "all"]:
pass

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name == "prod":
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/boolean/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
if op == "sum":
assert isinstance(getattr(s, op)(), np.int_)
elif op == "prod":
assert isinstance(getattr(s, op)(), np.int64)
assert isinstance(getattr(s, op)(), np.int_)
elif op in ("min", "max"):
assert isinstance(getattr(s, op)(), np.bool_)
else:
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
if op in {"sum", "min", "max"}:
if op in {"sum", "prod", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,10 @@ def check_reduce(self, s, op_name, skipna):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
if np.isnan(expected):
if not skipna and s.isna().any():
expected = pd.NA
else:
expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)


Expand Down