diff --git a/docs/conf.py b/docs/conf.py
index f4c4764..80d7f70 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -51,20 +51,23 @@ )
 
 # Try overriding type paths
 qualname_overrides = autodoc_type_aliases = {
+    "np.dtype": "numpy.dtype",
     "ArrayLike": "numpy.typing.ArrayLike",
+    "DTypeLike": "numpy.typing.DTypeLike",
+    "NDArray": "numpy.typing.NDArray",
     "CSBase": "scipy.sparse.spmatrix",
     "CupyArray": "cupy.ndarray",
     "CupySparseMatrix": "cupyx.scipy.sparse.spmatrix",
     "DaskArray": "dask.array.Array",
     "H5Dataset": "h5py.Dataset",
-    "NDArray": "numpy.typing.NDArray",
 }
 
 # If that doesn’t work, ignore them
 nitpick_ignore = {
     ("py:class", "DT_co"),
     ("py:class", "fast_array_utils.types.T_co"),
-    # sphinx bugs, should be covered by `autodoc_type_aliases` below
+    # sphinx bugs, should be covered by `autodoc_type_aliases` above
    ("py:class", "ArrayLike"),
+    ("py:class", "DTypeLike"),
     ("py:class", "NDArray"),
 }
diff --git a/docs/index.rst b/docs/index.rst
index 57c5790..697b5c6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,6 +12,13 @@
     :members:
 
 
+``fast_array_utils.stats``
+--------------------------
+
+.. automodule:: fast_array_utils.stats
+    :members:
+
+
 ``fast_array_utils.types``
 --------------------------
 
diff --git a/src/fast_array_utils/__init__.py b/src/fast_array_utils/__init__.py
index c203223..135e10e 100644
--- a/src/fast_array_utils/__init__.py
+++ b/src/fast_array_utils/__init__.py
@@ -3,9 +3,9 @@
 
 from __future__ import annotations
 
-from . import _patches, conv, types
+from . import _patches, conv, stats, types
 
-__all__ = ["conv", "types"]
+__all__ = ["conv", "stats", "types"]
 
 _patches.patch_dask()
diff --git a/src/fast_array_utils/stats/__init__.py b/src/fast_array_utils/stats/__init__.py
new file mode 100644
index 0000000..530c562
--- /dev/null
+++ b/src/fast_array_utils/stats/__init__.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: MPL-2.0
+"""Statistics utilities."""
+
+from __future__ import annotations
+
+from ._sum import sum
+
+
+__all__ = ["sum"]
diff --git a/src/fast_array_utils/stats/_sum.py b/src/fast_array_utils/stats/_sum.py
new file mode 100644
index 0000000..e4aea57
--- /dev/null
+++ b/src/fast_array_utils/stats/_sum.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: MPL-2.0
+from __future__ import annotations
+
+from functools import partial, singledispatch
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from ..types import CSBase, CSMatrix, DaskArray
+
+
+if TYPE_CHECKING:
+    from typing import Literal, TypeVar
+
+    from numpy.typing import ArrayLike, DTypeLike, NDArray
+
+    DT_co = TypeVar("DT_co", covariant=True, bound=np.generic)
+
+
+# TODO(flying-sheep): overload so axis=None returns np.floating  # noqa: TD003
+
+
+@singledispatch
+def sum(
+    x: ArrayLike,
+    *,
+    axis: Literal[0, 1, None] = None,
+    dtype: DTypeLike | np.dtype[DT_co] | None = None,
+) -> NDArray[DT_co]:
+    """Sum over both or one axis."""
+    return np.sum(x, axis=axis, dtype=dtype)  # type: ignore[no-any-return]
+
+
+@sum.register(CSBase)  # type: ignore[misc,call-overload]
+def _(
+    x: CSBase[DT_co], *, axis: Literal[0, 1, None] = None, dtype: DTypeLike | None = None
+) -> NDArray[DT_co]:
+    import scipy.sparse as sp
+
+    if isinstance(x, CSMatrix):
+        x = sp.csr_array(x) if x.format == "csr" else sp.csc_array(x)
+    return np.sum(x, axis=axis, dtype=dtype)  # type: ignore[call-overload,no-any-return]
+
+
+@sum.register(DaskArray)
+def _(
+    x: DaskArray, *, axis: Literal[0, 1, None] = None, dtype: DTypeLike | None = None
+) -> DaskArray:
+    if TYPE_CHECKING:
+        from dask.array.reductions import reduction
+    else:
+        from dask.array import reduction
+
+    if isinstance(x._meta, np.matrix):  # noqa: SLF001
+        msg = "sum does not support numpy matrices"
+        raise TypeError(msg)
+
+    def sum_drop_keepdims(
+        a: NDArray[DT_co] | CSBase[DT_co],
+        *,
+        axis: tuple[Literal[0], Literal[1]] | Literal[0, 1] | None = None,
+        dtype: np.dtype[DT_co] | None = None,
+        keepdims: bool = False,
+    ) -> NDArray[DT_co]:
+        del keepdims
+        match axis:
+            case (0 | 1 as n,):
+                axis = n
+            case (0, 1) | (1, 0):
+                axis = None
+            case tuple():
+                msg = f"`sum` can only sum over `axis=0|1|(0,1)` but got {axis} instead"
+                raise ValueError(msg)
+        rv: NDArray[DT_co] | DT_co = sum(a, axis=axis, dtype=dtype)  # type: ignore[arg-type]
+        rv = np.array(rv, ndmin=1)  # make sure rv is at least 1D
+        return rv.reshape((1, len(rv)))
+
+    if dtype is None:
+        # Explicitly use numpy result dtype (e.g. `NDArray[bool].sum().dtype == int64`)
+        dtype = np.zeros(1, dtype=x.dtype).sum().dtype
+
+    return reduction(  # type: ignore[no-any-return,no-untyped-call]
+        x,
+        sum_drop_keepdims,
+        partial(np.sum, dtype=dtype),
+        axis=axis,
+        dtype=dtype,
+        meta=np.array([], dtype=dtype),
+    )
diff --git a/src/testing/fast_array_utils/__init__.py b/src/testing/fast_array_utils/__init__.py
new file mode 100644
index 0000000..90d9adf
--- /dev/null
+++ b/src/testing/fast_array_utils/__init__.py
@@ -0,0 +1,138 @@
+# SPDX-License-Identifier: MPL-2.0
+"""Testing utilities."""
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+
+if TYPE_CHECKING:
+    from typing import Any, Generic, Literal, Protocol, SupportsFloat, TypeAlias, TypeVar
+
+    from numpy.typing import ArrayLike, NDArray
+
+    from fast_array_utils import types
+    from fast_array_utils.types import CSBase
+
+    _SCT_co = TypeVar("_SCT_co", covariant=True, bound=np.generic)
+    _SCT_contra = TypeVar("_SCT_contra", contravariant=True, bound=np.generic)
+    _SCT_float = TypeVar("_SCT_float", np.float32, np.float64)
+
+    Array: TypeAlias = (
+        NDArray[_SCT_co]
+        | types.CSBase[_SCT_co]
+        | types.CupyArray[_SCT_co]
+        | types.DaskArray
+        | types.H5Dataset
+        | types.ZarrArray
+    )
+
+    class ToArray(Protocol, Generic[_SCT_contra]):
+        """Convert to a supported array."""
+
+        def __call__(  # noqa: D102
+            self, data: ArrayLike, /, *, dtype: _SCT_contra | None = None
+        ) -> Array[_SCT_contra]: ...
+
+
+RE_ARRAY_QUAL = re.compile(r"(?P<mod>(?:\w+\.)*\w+)\.(?P<name>[^\[]+)(?:\[(?P<inner>[\w.]+)\])?")
+
+
+def get_array_cls(qualname: str) -> type[Array[Any]]:  # noqa: PLR0911
+    """Get a supported array class by qualname."""
+    m = RE_ARRAY_QUAL.fullmatch(qualname)
+    assert m
+    match m["mod"], m["name"], m["inner"]:
+        case "numpy", "ndarray", None:
+            return np.ndarray
+        case "scipy.sparse", (
+            "csr_array" | "csc_array" | "csr_matrix" | "csc_matrix"
+        ) as cls_name, None:
+            import scipy.sparse
+
+            return getattr(scipy.sparse, cls_name)  # type: ignore[no-any-return]
+        case "cupy", "ndarray", None:
+            import cupy as cp
+
+            return cp.ndarray  # type: ignore[no-any-return]
+        case "cupyx.scipy.sparse", ("csr_matrix" | "csc_matrix") as cls_name, None:
+            import cupyx.scipy.sparse as cu_sparse
+
+            return getattr(cu_sparse, cls_name)  # type: ignore[no-any-return]
+        case "dask.array", cls_name, _:
+            if TYPE_CHECKING:
+                from dask.array.core import Array as DaskArray
+            else:
+                from dask.array import Array as DaskArray
+
+            return DaskArray
+        case "h5py", "Dataset", _:
+            import h5py
+
+            return h5py.Dataset  # type: ignore[no-any-return]
+        case "zarr", "Array", _:
+            import zarr
+
+            return zarr.Array
+        case _:
+            msg = f"Unknown array class: {qualname}"
+            raise ValueError(msg)
+
+
+def random_mat(
+    shape: tuple[int, int],
+    *,
+    density: SupportsFloat = 0.01,
+    format: Literal["csr", "csc"] = "csr",  # noqa: A002
+    dtype: np.dtype[_SCT_float] | type[_SCT_float] | None = None,
+    container: Literal["array", "matrix"] = "array",
+    gen: np.random.Generator | None = None,
+) -> CSBase[_SCT_float]:
+    """Create a random matrix."""
+    from scipy.sparse import random as random_spmat
+    from scipy.sparse import random_array as random_sparr
+
+    m, n = shape
+    return (
+        random_spmat(m, n, density=density, format=format, dtype=dtype, random_state=gen)
+        if container == "matrix"
+        else random_sparr(shape, density=density, format=format, dtype=dtype, random_state=gen)
+    )
+
+
+def random_array(
+    qualname: str,
+    shape: tuple[int, int],
+    *,
+    dtype: np.dtype[_SCT_float] | type[_SCT_float] | None,
+    gen: np.random.Generator | None = None,
+) -> Array[_SCT_float]:
+    """Create a random array."""
+    gen = np.random.default_rng(gen)
+
+    m = RE_ARRAY_QUAL.fullmatch(qualname)
+    assert m
+    match m["mod"], m["name"], m["inner"]:
+        case "numpy", "ndarray", None:
+            return gen.random(shape, dtype=dtype or np.float64)
+        case "scipy.sparse", (
+            "csr_array" | "csc_array" | "csr_matrix" | "csc_matrix"
+        ) as cls_name, None:
+            fmt, container = cls_name.split("_")
+            return random_mat(shape, format=fmt, container=container, dtype=dtype)  # type: ignore[arg-type]
+        case "cupy", "ndarray", None:
+            raise NotImplementedError
+        case "cupyx.scipy.sparse", ("csr_matrix" | "csc_matrix") as cls_name, None:
+            raise NotImplementedError
+        case "dask.array", cls_name, _:
+            raise NotImplementedError
+        case "h5py", "Dataset", _:
+            raise NotImplementedError
+        case "zarr", "Array", _:
+            raise NotImplementedError
+        case _:
+            msg = f"Unknown array class: {qualname}"
+            raise ValueError(msg)
diff --git a/src/testing/fast_array_utils/__init__.pyi b/src/testing/fast_array_utils/__init__.pyi
deleted file mode 100644
index f54511a..0000000
--- a/src/testing/fast_array_utils/__init__.pyi
+++ /dev/null
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: MPL-2.0
-from typing import Generic, Protocol, TypeAlias, TypeVar
-
-import numpy as np
-from numpy.typing import ArrayLike, NDArray
-
-from fast_array_utils import types
-
-_SCT_co = TypeVar("_SCT_co", covariant=True, bound=np.generic)
-_SCT_contra = TypeVar("_SCT_contra", contravariant=True, bound=np.generic)
-
-_Array: TypeAlias = (
-    NDArray[_SCT_co]
-    | types.CSBase[_SCT_co]
-    | types.CupyArray[_SCT_co]
-    | types.DaskArray
-    | types.H5Dataset
-    | types.ZarrArray
-)
-
-class _ToArray(Protocol, Generic[_SCT_contra]):
-    def __call__(
-        self, data: ArrayLike, /, *, dtype: _SCT_contra | None = None
-    ) -> _Array[_SCT_contra]: ...
-
-__all__ = ["_Array", "_ToArray"]
diff --git a/src/testing/fast_array_utils/pytest.py b/src/testing/fast_array_utils/pytest.py
index 9720d87..100769c 100644
--- a/src/testing/fast_array_utils/pytest.py
+++ b/src/testing/fast_array_utils/pytest.py
@@ -4,7 +4,6 @@
 from __future__ import annotations
 
 import os
-import re
 from importlib.util import find_spec
 from typing import TYPE_CHECKING, cast
 
@@ -13,6 +12,8 @@
 
 from fast_array_utils import types
 
+from . import get_array_cls
+
 if TYPE_CHECKING:
     from collections.abc import Generator
 
@@ -20,9 +21,9 @@
 
     from numpy.typing import ArrayLike, DTypeLike
 
-    from testing.fast_array_utils import _ToArray
+    from testing.fast_array_utils import ToArray
 
-    from . import _Array
+    from . import Array
 
     _SCT_co = TypeVar("_SCT_co", covariant=True, bound=np.generic)
 
@@ -57,66 +58,24 @@ def array_cls_name(request: pytest.FixtureRequest) -> str:
 
 
 @pytest.fixture(scope="session")
-def array_cls(array_cls_name: str) -> type[_Array[Any]]:
+def array_cls(array_cls_name: str) -> type[Array[Any]]:
     """Fixture for a supported array class."""
     return get_array_cls(array_cls_name)
 
 
-def get_array_cls(qualname: str) -> type[_Array[Any]]:  # noqa: PLR0911
-    """Get a supported array class by qualname."""
-    m = re.fullmatch(
-        r"(?P<mod>(?:\w+\.)*\w+)\.(?P<name>[^\[]+)(?:\[(?P<inner>[\w.]+)\])?", qualname
-    )
-    assert m
-    match m["mod"], m["name"], m["inner"]:
-        case "numpy", "ndarray", None:
-            return np.ndarray
-        case "scipy.sparse", (
-            "csr_array" | "csc_array" | "csr_matrix" | "csc_matrix"
-        ) as cls_name, None:
-            import scipy.sparse
-
-            return getattr(scipy.sparse, cls_name)  # type: ignore[no-any-return]
-        case "cupy", "ndarray", None:
-            import cupy as cp
-
-            return cp.ndarray  # type: ignore[no-any-return]
-        case "cupyx.scipy.sparse", ("csr_matrix" | "csc_matrix") as cls_name, None:
-            import cupyx.scipy.sparse as cu_sparse
-
-            return getattr(cu_sparse, cls_name)  # type: ignore[no-any-return]
-        case "dask.array", cls_name, _:
-            if TYPE_CHECKING:
-                from dask.array.core import Array as DaskArray
-            else:
-                from dask.array import Array as DaskArray
-
-            return DaskArray
-        case "h5py", "Dataset", _:
-            import h5py
-
-            return h5py.Dataset  # type: ignore[no-any-return]
-        case "zarr", "Array", _:
-            import zarr
-
-            return zarr.Array
-        case _:
-            pytest.fail(f"Unknown array class: {qualname}")
-
-
 @pytest.fixture(scope="session")
 def to_array(
-    request: pytest.FixtureRequest, array_cls: type[_Array[_SCT_co]], array_cls_name: str
-) -> _ToArray[_SCT_co]:
+    request: pytest.FixtureRequest, array_cls: type[Array[_SCT_co]], array_cls_name: str
+) -> ToArray[_SCT_co]:
     """Fixture for conversion into a supported array."""
     return get_to_array(array_cls, array_cls_name, request)
 
 
 def get_to_array(
-    array_cls: type[_Array[_SCT_co]],
+    array_cls: type[Array[_SCT_co]],
     array_cls_name: str | None = None,
     request: pytest.FixtureRequest | None = None,
-) -> _ToArray[_SCT_co]:
+) -> ToArray[_SCT_co]:
     """Create a function to convert to a supported array."""
     if array_cls is np.ndarray:
         return np.asarray  # type: ignore[return-value]
@@ -144,7 +103,7 @@ def half_rounded_up(x: int) -> int:
     return tuple(half_rounded_up(x) for x in a)
 
 
-def to_dask_array(array_cls_name: str) -> _ToArray[Any]:
+def to_dask_array(array_cls_name: str) -> ToArray[Any]:
     """Convert to a dask array."""
     if TYPE_CHECKING:
         import dask.array.core as da
@@ -153,7 +112,7 @@ def to_dask_array(array_cls_name: str) -> _ToArray[Any]:
 
     inner_cls_name = array_cls_name.removeprefix("dask.array.Array[").removesuffix("]")
     inner_cls = get_array_cls(inner_cls_name)
-    to_array_fn: _ToArray[Any] = get_to_array(array_cls=inner_cls)
+    to_array_fn: ToArray[Any] = get_to_array(array_cls=inner_cls)
 
     def to_dask_array(x: ArrayLike, *, dtype: DTypeLike | None = None) -> types.DaskArray:
         x = np.asarray(x, dtype=dtype)
@@ -167,7 +126,7 @@ def to_dask_array(x: ArrayLike, *, dtype: DTypeLike | None = None) -> types.Dask
 def to_h5py_dataset(
     tmp_path_factory: pytest.TempPathFactory,
     worker_id: str = "serial",
-) -> Generator[_ToArray[Any], None, None]:
+) -> Generator[ToArray[Any], None, None]:
     """Convert to a h5py dataset."""
     import h5py
 
diff --git a/tests/test_asarray.py b/tests/test_asarray.py
index 1c855ef..ba98e68 100644
--- a/tests/test_asarray.py
+++ b/tests/test_asarray.py
@@ -13,10 +13,10 @@
 
     from numpy.typing import NDArray
 
-    from testing.fast_array_utils import _ToArray
+    from testing.fast_array_utils import ToArray
 
 
-def test_asarray(to_array: _ToArray[Any]) -> None:
+def test_asarray(to_array: ToArray[Any]) -> None:
     x = to_array([[1, 2, 3], [4, 5, 6]])
     arr: NDArray[Any] = asarray(x)  # type: ignore[arg-type]
     assert isinstance(arr, np.ndarray)
diff --git a/tests/test_sparse.py b/tests/test_sparse.py
index dd1972e..4f260ca 100644
--- a/tests/test_sparse.py
+++ b/tests/test_sparse.py
@@ -8,15 +8,14 @@
 import pytest
 
 from fast_array_utils.conv.scipy import to_dense
+from testing.fast_array_utils import random_mat
 
 
 if TYPE_CHECKING:
-    from typing import Literal, SupportsFloat, TypeVar
+    from typing import Literal, TypeVar
 
     from pytest_codspeed import BenchmarkFixture
 
-    from fast_array_utils.types import CSBase
-
     DType = TypeVar("DType", bound=np.generic)
     DType_float = TypeVar("DType_float", np.float32, np.float64)
 
@@ -39,24 +38,6 @@ def dtype(request: pytest.FixtureRequest) -> np.dtype[np.float32 | np.float64]:
     return np.dtype(request.param)
 
 
-def random_mat(
-    shape: tuple[int, int],
-    *,
-    density: SupportsFloat = 0.01,
-    format: Literal["csr", "csc"] = "csr",  # noqa: A002
-    dtype: np.dtype[DType_float] | None = None,
-    container: Literal["array", "matrix"] = "array",
-) -> CSBase[DType_float]:
-    from scipy.sparse import random, random_array
-
-    m, n = shape
-    return (
-        random(m, n, density=density, format=format, dtype=dtype)
-        if container == "matrix"
-        else random_array(shape, density=density, format=format, dtype=dtype)
-    )
-
-
 @pytest.mark.parametrize("order", ["C", "F"])
 def test_to_dense(
     order: Literal["C", "F"],
diff --git a/tests/test_stats.py b/tests/test_stats.py
new file mode 100644
index 0000000..91eb374
--- /dev/null
+++ b/tests/test_stats.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: MPL-2.0
+from __future__ import annotations
+
+from importlib.util import find_spec
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+from testing.fast_array_utils import random_array
+
+
+if TYPE_CHECKING or find_spec("scipy"):
+    from scipy.sparse import sparray, spmatrix
+else:
+    spmatrix = sparray = type("spmatrix", (), {})
+
+from fast_array_utils import stats, types
+
+
+if TYPE_CHECKING:
+    from typing import Any, Literal
+
+    from pytest_codspeed import BenchmarkFixture
+
+    from testing.fast_array_utils import Array, ToArray
+
+    DTypeIn = type[np.float32 | np.float64 | np.int32 | np.bool_]
+    DTypeOut = type[np.float32 | np.float64 | np.int64]
+
+
+@pytest.fixture(scope="session", params=[0, 1, None])
+def axis(request: pytest.FixtureRequest) -> Literal[0, 1, None]:
+    return request.param  # type: ignore[no-any-return]
+
+
+@pytest.fixture(scope="session", params=[np.float32, np.float64, np.int32, np.bool_])
+def dtype_in(request: pytest.FixtureRequest) -> DTypeIn:
+    return request.param  # type: ignore[no-any-return]
+
+
+@pytest.fixture(scope="session", params=[np.float32, np.float64, None])
+def dtype_arg(request: pytest.FixtureRequest) -> DTypeOut | None:
+    return request.param  # type: ignore[no-any-return]
+
+
+def test_sum(
+    array_cls: type[Array[Any]],
+    to_array: ToArray[Any],
+    dtype_in: DTypeIn,
+    dtype_arg: DTypeOut | None,
+    axis: Literal[0, 1, None],
+) -> None:
+    np_arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype_in)
+    arr = to_array(np_arr.copy())
+    assert arr.dtype == dtype_in
+
+    sum_: Array[Any] | np.floating = stats.sum(arr, axis=axis, dtype=dtype_arg)  # type: ignore[type-arg,arg-type]
+
+    match axis, arr:
+        case _, types.DaskArray():
+            assert isinstance(sum_, types.DaskArray), type(sum_)
+            sum_ = sum_.compute()  # type: ignore[no-untyped-call]
+        case None, _:
+            assert isinstance(sum_, np.floating | np.integer), type(sum_)
+        case 0 | 1, spmatrix() | sparray() | types.ZarrArray() | types.H5Dataset():
+            assert isinstance(sum_, np.ndarray), type(sum_)
+        case 0 | 1, _:
+            assert isinstance(sum_, array_cls), type(sum_)
+        case _:
+            pytest.fail(f"Unhandled case axis {axis} for {type(arr)}: {type(sum_)}")
+
+    assert sum_.shape == () if axis is None else arr.shape[axis], (sum_.shape, arr.shape)
+
+    if dtype_arg is not None:
+        assert sum_.dtype == dtype_arg, (sum_.dtype, dtype_arg)
+    elif dtype_in in {np.bool_, np.int32}:
+        assert sum_.dtype == np.int64
+    else:
+        assert sum_.dtype == dtype_in
+
+    np.testing.assert_array_equal(sum_, np.sum(np_arr, axis=axis, dtype=dtype_arg))  # type: ignore[arg-type]
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])  # random only supports float
+def test_sum_benchmark(
+    benchmark: BenchmarkFixture,
+    array_cls_name: str,
+    axis: Literal[0, 1, None],
+    dtype: type[np.float32 | np.float64],
+) -> None:
+    try:
+        shape = (1_000, 1_000) if "sparse" in array_cls_name else (100, 100)
+        arr = random_array(array_cls_name, shape, dtype=dtype)  # type: ignore  # noqa: PGH003
+    except NotImplementedError:
+        pytest.skip("random_array not implemented for dtype")
+
+    stats.sum(arr, axis=axis)  # type: ignore[arg-type]  # warmup: numba compile
+    benchmark(stats.sum, arr, axis=axis)
diff --git a/tests/test_test_utils.py b/tests/test_test_utils.py
index 74ceab8..cb69bcc 100644
--- a/tests/test_test_utils.py
+++ b/tests/test_test_utils.py
@@ -12,14 +12,14 @@
 if TYPE_CHECKING:
     from typing import TypeVar
 
-    from testing.fast_array_utils import _Array, _ToArray
+    from testing.fast_array_utils import Array, ToArray
 
     DType_float = TypeVar("DType_float", np.float32, np.float64)
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_conv(
-    array_cls: type[_Array[DType_float]], to_array: _ToArray[DType_float], dtype: DType_float
+    array_cls: type[Array[DType_float]], to_array: ToArray[DType_float], dtype: DType_float
 ) -> None:
     arr = to_array(np.arange(12).reshape(3, 4), dtype=dtype)
     assert isinstance(arr, array_cls)
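
Usage sketch (not part of the patch): the snippet below exercises the new public `fast_array_utils.stats.sum` entry point added above. It assumes `fast_array_utils` with this change and `scipy` are installed; the dask lines are commented out and purely illustrative.

import numpy as np
import scipy.sparse as sp

from fast_array_utils import stats

dense = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
sparse = sp.random_array((100, 50), density=0.01, format="csr", dtype=np.float64)

stats.sum(dense)                             # 21 -- axis=None reduces to a scalar
stats.sum(dense, axis=0)                     # array([5, 7, 9]) -- column sums
stats.sum(sparse, axis=1, dtype=np.float32)  # 1D ndarray of row sums, never an np.matrix

# Dask input stays lazy and needs an explicit compute (requires dask):
# import dask.array as da
# stats.sum(da.from_array(dense, chunks=1), axis=0).compute()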
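
A similar sketch for the relocated testing helpers, assuming `testing.fast_array_utils` is importable (e.g. from a development install) and scipy is available:

import numpy as np

from testing.fast_array_utils import get_array_cls, random_mat

# Resolve a supported array class from its qualified name.
csc_matrix_cls = get_array_cls("scipy.sparse.csc_matrix")

# Build a random 200x100 CSC matrix at 5% density with a seeded generator.
mat = random_mat(
    (200, 100),
    density=0.05,
    format="csc",
    container="matrix",
    gen=np.random.default_rng(0),
)
assert isinstance(mat, csc_matrix_cls)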
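
Finally, a hypothetical sketch of how a downstream suite might lean on the pytest fixtures this diff re-exports; how the plugin is registered is an assumption not shown in this patch:

# Hypothetical downstream test module; assumes the plugin is registered in a
# top-level conftest.py via: pytest_plugins = ["testing.fast_array_utils.pytest"]
from __future__ import annotations

import numpy as np

from fast_array_utils import stats


def test_row_sums(to_array) -> None:
    # `to_array` is the session fixture from the plugin; it converts array-like
    # data into whichever supported array class the test run is parametrized with.
    arr = to_array([[1.0, 2.0], [3.0, 4.0]])
    np.testing.assert_allclose(np.asarray(stats.sum(arr, axis=0)), [4.0, 6.0])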