diff --git a/docs/release-notes/1.10.0.md b/docs/release-notes/1.10.0.md index 6e0e35ec7f..604342cb73 100644 --- a/docs/release-notes/1.10.0.md +++ b/docs/release-notes/1.10.0.md @@ -10,6 +10,7 @@ * {func}`scanpy.experimental.pp.highly_variable_genes` using `flavor='pearson_residuals'` now uses numba for variance computation {pr}`2612` {smaller}`S Dicks & P Angerer` * {func}`scanpy.external.pp.harmony_integrate` now runs with 64 bit floats improving reproducibility {pr}`2655` {smaller}`S Dicks` +* Enhanced dask support for some internal utilities, paving the way for more extensive dask support {pr}`2696` {smaller}`P Angerer` ```{rubric} Docs ``` diff --git a/pyproject.toml b/pyproject.toml index 5914dcb942..fc85d3cea1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,6 +163,7 @@ xfail_strict = true nunit_attach_on = "fail" markers = [ "internet: tests which rely on internet resources (enable with `--internet-tests`)", + "gpu: tests that use a GPU (currently unused, but needs to be specified here as we import anndata.tests.helpers, which uses it)", ] [tool.coverage.run] diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py index 14bc1e3075..e1efd58f6f 100644 --- a/scanpy/_utils/__init__.py +++ b/scanpy/_utils/__init__.py @@ -3,6 +3,8 @@ This file largely consists of the old _utils.py file. Over time, these functions should be moved of this file. """ +from __future__ import annotations + import sys import inspect import warnings @@ -11,12 +13,13 @@ from pathlib import Path from weakref import WeakSet from collections import namedtuple -from functools import partial, wraps +from functools import partial, singledispatch, wraps from types import ModuleType, MethodType from typing import Union, Callable, Optional, Mapping, Any, Dict, Tuple, Literal import numpy as np from numpy import random +from numpy.typing import NDArray from scipy import sparse from anndata import AnnData, __version__ as anndata_version from textwrap import dedent @@ -24,8 +27,8 @@ from .._settings import settings from .. import logging as logg - -from .compute.is_constant import is_constant +from .._compat import DaskArray +from .compute.is_constant import is_constant # noqa: F401 class Empty(Enum): @@ -400,12 +403,12 @@ def identify_groups(ref_labels, pred_labels, return_overlaps=False): # backwards compat... remove this in the future -def sanitize_anndata(adata): +def sanitize_anndata(adata: AnnData) -> None: """Transform string annotations to categoricals.""" adata._sanitize() -def view_to_actual(adata): +def view_to_actual(adata: AnnData) -> None: if adata.is_view: warnings.warn( "Received a view of an AnnData. Making a copy.", @@ -483,8 +486,41 @@ def update_params( # -------------------------------------------------------------------------------- -def check_nonnegative_integers(X: Union[np.ndarray, sparse.spmatrix]): +_SparseMatrix = Union[sparse.csr_matrix, sparse.csc_matrix] +_MemoryArray = Union[NDArray, _SparseMatrix] +_SupportedArray = Union[_MemoryArray, DaskArray] + + +@singledispatch +def elem_mul(x: _SupportedArray, y: _SupportedArray) -> _SupportedArray: + raise NotImplementedError + + +@elem_mul.register(np.ndarray) +@elem_mul.register(sparse.spmatrix) +def _elem_mul_in_mem(x: _MemoryArray, y: _MemoryArray) -> _MemoryArray: + if isinstance(x, sparse.spmatrix): + # returns coo_matrix, so cast back to input type + return type(x)(x.multiply(y)) + return x * y + + +@elem_mul.register(DaskArray) +def _elem_mul_dask(x: DaskArray, y: DaskArray) -> DaskArray: + import dask.array as da + + return da.map_blocks(elem_mul, x, y) + + +@singledispatch +def check_nonnegative_integers(X: _SupportedArray) -> bool | DaskArray: """Checks values of X to ensure it is count data""" + raise NotImplementedError + + +@check_nonnegative_integers.register(np.ndarray) +@check_nonnegative_integers.register(sparse.spmatrix) +def _check_nonnegative_integers_in_mem(X: _MemoryArray) -> bool: from numbers import Integral data = X if isinstance(X, np.ndarray) else X.data @@ -494,13 +530,19 @@ def check_nonnegative_integers(X: Union[np.ndarray, sparse.spmatrix]): # Check all are integers elif issubclass(data.dtype.type, Integral): return True - elif np.any(~np.equal(np.mod(data, 1), 0)): - return False - else: - return True + return not np.any((data % 1) != 0) + +@check_nonnegative_integers.register(DaskArray) +def _check_nonnegative_integers_dask(X: DaskArray) -> DaskArray: + return X.map_blocks(check_nonnegative_integers, dtype=bool, drop_axis=(0, 1)) -def select_groups(adata, groups_order_subset="all", key="groups"): + +def select_groups( + adata: AnnData, + groups_order_subset: list[str] | Literal["all"] = "all", + key: str = "groups", +) -> tuple[list[str], NDArray[np.bool_]]: """Get subset of groups in adata.obs[key].""" groups_order = adata.obs[key].cat.categories if key + "_masks" in adata.uns: diff --git a/scanpy/experimental/pp/_highly_variable_genes.py b/scanpy/experimental/pp/_highly_variable_genes.py index 583de8aa03..723ea2222a 100644 --- a/scanpy/experimental/pp/_highly_variable_genes.py +++ b/scanpy/experimental/pp/_highly_variable_genes.py @@ -143,7 +143,7 @@ def _highly_variable_pearson_residuals( computed_on = layer if layer else "adata.X" # Check for raw counts - if check_values and (check_nonnegative_integers(X) is False): + if check_values and not check_nonnegative_integers(X): warnings.warn( "`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", UserWarning, diff --git a/scanpy/preprocessing/_utils.py b/scanpy/preprocessing/_utils.py index 752b75d05f..fe154919eb 100644 --- a/scanpy/preprocessing/_utils.py +++ b/scanpy/preprocessing/_utils.py @@ -1,14 +1,17 @@ +from typing import Literal import numpy as np -from scipy import sparse import numba +from scipy import sparse + +from .._utils import _SupportedArray, elem_mul -def _get_mean_var(X, *, axis=0): - if sparse.issparse(X): +def _get_mean_var(X: _SupportedArray, *, axis: Literal[0, 1] = 0) -> _SupportedArray: + if isinstance(X, sparse.spmatrix): mean, var = sparse_mean_variance_axis(X, axis=axis) else: - mean = np.mean(X, axis=axis, dtype=np.float64) - mean_sq = np.multiply(X, X).mean(axis=axis, dtype=np.float64) + mean = X.mean(axis=axis, dtype=np.float64) + mean_sq = elem_mul(X, X).mean(axis=axis, dtype=np.float64) var = mean_sq - mean**2 # enforce R convention (unbiased estimator) for variance var *= X.shape[axis] / (X.shape[axis] - 1) diff --git a/scanpy/testing/_pytest/fixtures/__init__.py b/scanpy/testing/_pytest/fixtures/__init__.py index c73ce960ad..1fed07c646 100644 --- a/scanpy/testing/_pytest/fixtures/__init__.py +++ b/scanpy/testing/_pytest/fixtures/__init__.py @@ -5,16 +5,9 @@ from __future__ import annotations from pathlib import Path -from collections.abc import Callable import pytest import numpy as np -from numpy.typing import ArrayLike -from scipy import sparse -from anndata.tests.helpers import asarray - -from ...._compat import DaskArray -from ..._pytest.marks import needs from .data import ( _pbmc3ks_parametrized_session, pbmc3k_parametrized, @@ -23,7 +16,6 @@ __all__ = [ - "array_type", "float_dtype", "doctest_env", "_pbmc3ks_parametrized_session", @@ -32,28 +24,6 @@ ] -def _as_dense_dask_array(x: ArrayLike) -> DaskArray: - import dask.array as da - - return da.from_array(asarray(x)) - - -@pytest.fixture( - params=[ - pytest.param(asarray, id="numpy-ndarray"), - pytest.param(sparse.csr_matrix, id="scipy-csr"), - pytest.param(sparse.csc_matrix, id="scipy-csc"), - # Dask doesn’t support scipy sparse matrices, so only dense here - pytest.param(_as_dense_dask_array, marks=[needs("dask")], id="dask-array"), - ] -) -def array_type( - request, -) -> Callable[[ArrayLike], DaskArray | np.ndarray | sparse.spmatrix]: - """Function which converts passed array to one of the common array types.""" - return request.param - - @pytest.fixture(params=[np.float64, np.float32]) def float_dtype(request): return request.param diff --git a/scanpy/testing/_pytest/params.py b/scanpy/testing/_pytest/params.py new file mode 100644 index 0000000000..c0748d2bd2 --- /dev/null +++ b/scanpy/testing/_pytest/params.py @@ -0,0 +1,75 @@ +"""Like fixtures, but more flexible""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import TYPE_CHECKING, Literal + +import pytest +from scipy import sparse +from anndata.tests.helpers import asarray, as_dense_dask_array, as_sparse_dask_array + +from .._pytest.marks import needs + +if TYPE_CHECKING: + from _pytest.mark.structures import ParameterSet + + +def param_with( + at: ParameterSet, + *, + marks: Iterable[pytest.Mark | pytest.MarkDecorator] = (), + id: str | None = None, +) -> ParameterSet: + return pytest.param(*at.values, marks=[*at.marks, *marks], id=id or at.id) + + +MAP_ARRAY_TYPES: dict[ + tuple[Literal["mem", "dask"], Literal["dense", "sparse"]], + tuple[ParameterSet, ...], +] = { + ("mem", "dense"): (pytest.param(asarray, id="numpy_ndarray"),), + ("mem", "sparse"): ( + pytest.param(sparse.csr_matrix, id="scipy_csr"), + pytest.param(sparse.csc_matrix, id="scipy_csc"), + ), + ("dask", "dense"): ( + pytest.param(as_dense_dask_array, marks=[needs("dask")], id="dask_array_dense"), + ), + ("dask", "sparse"): ( + pytest.param( + as_sparse_dask_array, marks=[needs("dask")], id="dask_array_sparse" + ), + # probably not necessary to also do csc + ), +} + +ARRAY_TYPES_MEM = tuple( + at for (strg, _), ats in MAP_ARRAY_TYPES.items() if strg == "mem" for at in ats +) +ARRAY_TYPES_DASK = tuple( + at for (strg, _), ats in MAP_ARRAY_TYPES.items() if strg == "dask" for at in ats +) + +ARRAY_TYPES_DENSE = tuple( + at for (_, spsty), ats in MAP_ARRAY_TYPES.items() if spsty == "dense" for at in ats +) +ARRAY_TYPES_SPARSE = tuple( + at for (_, spsty), ats in MAP_ARRAY_TYPES.items() if spsty == "dense" for at in ats +) + +ARRAY_TYPES_SUPPORTED = tuple( + ( + param_with(at, marks=[pytest.mark.xfail(reason="sparse-in-dask not supported")]) + if attrs == ("dask", "sparse") + else at + ) + for attrs, ats in MAP_ARRAY_TYPES.items() + for at in ats +) +""" +Sparse matrices in dask arrays aren’t officially supported upstream, +so add xfail to them. +""" + +ARRAY_TYPES = tuple(at for ats in MAP_ARRAY_TYPES.values() for at in ats) diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 3e182ad60d..874438dc2e 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -1,8 +1,10 @@ +from pathlib import Path + import pytest import pandas as pd import numpy as np import scanpy as sc -from pathlib import Path + from scanpy.testing._helpers import _check_check_values_warnings from scanpy.testing._helpers.data import pbmc3k, pbmc68k_reduced from scanpy.testing._pytest.marks import needs diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py index 7c6933c5a1..f7b0f2f390 100644 --- a/scanpy/tests/test_metrics.py +++ b/scanpy/tests/test_metrics.py @@ -12,6 +12,7 @@ from scanpy._compat import DaskArray from scanpy.testing._helpers.data import pbmc68k_reduced +from scanpy.testing._pytest.params import ARRAY_TYPES mark_flaky = pytest.mark.xfail( @@ -94,6 +95,7 @@ def test_correctness(metric, size, expected, assert_equal): assert metric(adata, vals=connected) == expected +@pytest.mark.parametrize("array_type", ARRAY_TYPES) def test_graph_metrics_w_constant_values(metric, array_type, assert_equal): # https://github.com/scverse/scanpy/issues/1806 pbmc = pbmc68k_reduced() diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index df9e54c7ef..31e390fb50 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -1,19 +1,11 @@ from __future__ import annotations -from collections.abc import Callable - import pytest import numpy as np from anndata import AnnData from scipy.sparse import csr_matrix from scipy import sparse - -from scanpy.testing._pytest.marks import needs - -try: - import dask.array as da -except ImportError: - da = None +from anndata.tests.helpers import assert_equal import scanpy as sc from scanpy.testing._helpers import ( @@ -21,13 +13,16 @@ check_rep_results, _check_check_values_warnings, ) -from anndata.tests.helpers import assert_equal, asarray + +# TODO: Add support for sparse-in-dask +from scanpy.testing._pytest.params import ARRAY_TYPES_SUPPORTED -X_total = [[1, 0], [3, 0], [5, 6]] -X_frac = [[1, 0, 1], [3, 0, 1], [5, 6, 1]] +X_total = np.array([[1, 0], [3, 0], [5, 6]]) +X_frac = np.array([[1, 0, 1], [3, 0, 1], [5, 6, 1]]) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) @pytest.mark.parametrize("dtype", ["float32", "int64"]) def test_normalize_total(array_type, dtype): adata = AnnData(array_type(X_total).astype(dtype)) @@ -41,6 +36,7 @@ def test_normalize_total(array_type, dtype): assert np.allclose(np.ravel(adata.X[:, 1:3].sum(axis=1)), [1.0, 1.0, 1.0]) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) @pytest.mark.parametrize("dtype", ["float32", "int64"]) def test_normalize_total_rep(array_type, dtype): # Test that layer kwarg works @@ -49,6 +45,7 @@ def test_normalize_total_rep(array_type, dtype): check_rep_results(sc.pp.normalize_total, X, fields=["layer"]) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) @pytest.mark.parametrize("dtype", ["float32", "int64"]) def test_normalize_total_layers(array_type, dtype): adata = AnnData(array_type(X_total).astype(dtype)) @@ -58,6 +55,7 @@ def test_normalize_total_layers(array_type, dtype): assert np.allclose(adata.layers["layer"].sum(axis=1), [3.0, 3.0, 3.0]) +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) @pytest.mark.parametrize("dtype", ["float32", "int64"]) def test_normalize_total_view(array_type, dtype): adata = AnnData(array_type(X_total).astype(dtype)) diff --git a/scanpy/tests/test_pca.py b/scanpy/tests/test_pca.py index 85ea82c7bf..50f7427bf1 100644 --- a/scanpy/tests/test_pca.py +++ b/scanpy/tests/test_pca.py @@ -2,21 +2,29 @@ import pytest import warnings from anndata import AnnData -from anndata.tests.helpers import as_dense_dask_array, asarray, assert_equal +from anndata.tests.helpers import ( + as_dense_dask_array, + as_sparse_dask_array, + assert_equal, + asarray, +) from scipy import sparse import scanpy as sc from scanpy.testing._helpers.data import pbmc3k_normalized from scanpy.testing._pytest.marks import needs +from scanpy.testing._pytest.params import ARRAY_TYPES_SUPPORTED, param_with -A_list = [ - [0, 0, 7, 0, 0], - [8, 5, 0, 2, 0], - [6, 0, 0, 2, 5], - [0, 0, 0, 1, 0], - [8, 8, 2, 1, 0], - [0, 0, 0, 4, 5], -] +A_list = np.array( + [ + [0, 0, 7, 0, 0], + [8, 5, 0, 2, 0], + [6, 0, 0, 2, 5], + [0, 0, 0, 1, 0], + [8, 8, 2, 1, 0], + [0, 0, 0, 4, 5], + ] +) A_pca = np.array( [ @@ -44,24 +52,21 @@ # If one uses dask for PCA it will always require dask-ml @pytest.fixture( params=[ - lambda: sparse.csr_matrix, - lambda: sparse.csc_matrix, - lambda: asarray, - pytest.param(lambda: as_dense_dask_array, marks=[needs("dask_ml")]), - ], - ids=["scipy-csr", "scipy-csc", "np-ndarray", "dask-array"], + param_with(at, marks=[needs("dask_ml")]) if "dask" in at.id else at + for at in ARRAY_TYPES_SUPPORTED + ] ) -def array_type(request): - return request.param() +def array_type(request: pytest.FixtureRequest): + return request.param @pytest.fixture(params=[None, "valid", "invalid"]) -def svd_solver_type(request): +def svd_solver_type(request: pytest.FixtureRequest): return request.param -@pytest.fixture(params=[True, False]) -def zero_center(request): +@pytest.fixture(params=[True, False], ids=["zero_center", "no_zero_center"]) +def zero_center(request: pytest.FixtureRequest): return request.param @@ -72,27 +77,30 @@ def pca_params(array_type, svd_solver_type, zero_center): expected_warning = None svd_solver = None if svd_solver_type is not None: - if array_type is as_dense_dask_array: + # TODO: are these right for sparse? + if array_type in {as_dense_dask_array, as_sparse_dask_array}: svd_solver = ( - ["auto", "full", "tsqr", "randomized"] + {"auto", "full", "tsqr", "randomized"} if zero_center - else ["tsqr", "randomized"] + else {"tsqr", "randomized"} ) - elif array_type in [sparse.csr_matrix, sparse.csc_matrix]: + elif array_type in {sparse.csr_matrix, sparse.csc_matrix}: svd_solver = ( - ["lobpcg", "arpack"] if zero_center else ["arpack", "randomized"] + {"lobpcg", "arpack"} if zero_center else {"arpack", "randomized"} ) - else: + elif array_type is asarray: svd_solver = ( - ["auto", "full", "arpack", "randomized"] + {"auto", "full", "arpack", "randomized"} if zero_center - else ["arpack", "randomized"] + else {"arpack", "randomized"} ) + else: + assert False, f"Unknown array type {array_type}" if svd_solver_type == "invalid": - svd_solver = list(all_svd_solvers.difference(svd_solver)) + svd_solver = all_svd_solvers - svd_solver expected_warning = "Ignoring" - svd_solver = np.random.choice(svd_solver) + svd_solver = np.random.choice(list(svd_solver)) # explicit check for special case if ( svd_solver == "randomized" @@ -113,9 +121,14 @@ def test_pca_warnings(array_type, zero_center, pca_params): with pytest.warns(UserWarning, match=expected_warning): sc.pp.pca(adata, svd_solver=svd_solver, zero_center=zero_center) else: - with warnings.catch_warnings(record=True) as record: + with warnings.catch_warnings(): + warnings.simplefilter("error") + warnings.filterwarnings( + "ignore", + "pkg_resources is deprecated as an API", + DeprecationWarning, + ) sc.pp.pca(adata, svd_solver=svd_solver, zero_center=zero_center) - assert len(record) == 0 # This warning test is out of the fixture because it is a special case in the logic of the function diff --git a/scanpy/tests/test_preprocessing.py b/scanpy/tests/test_preprocessing.py index af7142b90e..7c6fb1bafc 100644 --- a/scanpy/tests/test_preprocessing.py +++ b/scanpy/tests/test_preprocessing.py @@ -10,7 +10,8 @@ from anndata.tests.helpers import assert_equal, asarray from scanpy.testing._helpers import check_rep_mutation, check_rep_results -from scanpy.testing._helpers.data import pbmc68k_reduced +from scanpy.testing._helpers.data import pbmc3k, pbmc68k_reduced +from scanpy.testing._pytest.params import ARRAY_TYPES_SUPPORTED def test_log1p(tmp_path): @@ -46,6 +47,21 @@ def test_log1p_rep(count_matrix_format, base, dtype): check_rep_results(sc.pp.log1p, X, base=base) +# TODO: Add support for sparse-in-dask +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) +def test_mean_var(array_type): + pbmc = pbmc3k() + pbmc.X = array_type(pbmc.X) + + true_mean = np.mean(asarray(pbmc.X), axis=0) + true_var = np.var(asarray(pbmc.X), axis=0, dtype=np.float64, ddof=1) + + means, variances = sc.pp._utils._get_mean_var(pbmc.X) + + np.testing.assert_allclose(true_mean, means) + np.testing.assert_allclose(true_var, variances) + + def test_mean_var_sparse(): from sklearn.utils.sparsefuncs import mean_variance_axis diff --git a/scanpy/tests/test_rank_genes_groups.py b/scanpy/tests/test_rank_genes_groups.py index 1b60ef01cb..01760234fe 100644 --- a/scanpy/tests/test_rank_genes_groups.py +++ b/scanpy/tests/test_rank_genes_groups.py @@ -1,26 +1,32 @@ -import pytest +from __future__ import annotations -from packaging import version -from pathlib import Path import pickle +from pathlib import Path +from collections.abc import Callable +from typing import Any +import pytest import numpy as np import pandas as pd import scipy -from scipy import sparse as sp +from numpy.typing import NDArray +from anndata import AnnData +from packaging import version +from scipy import sparse from scipy.stats import mannwhitneyu +from anndata.tests.helpers import asarray from numpy.random import negative_binomial, binomial, seed import scanpy as sc -from anndata import AnnData from scanpy.testing._helpers.data import pbmc68k_reduced +from scanpy.testing._pytest.params import ARRAY_TYPES, ARRAY_TYPES_MEM from scanpy.tools import rank_genes_groups from scanpy.tools._rank_genes_groups import _RankGenes from scanpy.get import rank_genes_groups_df -from scanpy._utils import select_groups - +from scanpy._utils import select_groups, elem_mul -HERE = Path(__file__).parent / Path("_data/") +HERE = Path(__file__).parent +DATA_PATH = HERE / "_data" # We test results for a simple generic example @@ -29,7 +35,8 @@ # results differ (very) slightly -def get_example_data(*, sparse=False): +@pytest.mark.parametrize("array_type", ARRAY_TYPES) +def get_example_data(array_type: Callable[[np.ndarray], Any]) -> AnnData: # create test object adata = AnnData( np.multiply(binomial(1, 0.15, (100, 20)), negative_binomial(2, 0.25, (100, 20))) @@ -39,27 +46,27 @@ def get_example_data(*, sparse=False): binomial(1, 0.9, (10, 5)), negative_binomial(1, 0.5, (10, 5)) ) - # The following construction is inefficient, but makes sure that the same data is used in the sparse case - if sparse: - adata.X = sp.csr_matrix(adata.X) + adata.X = array_type(adata.X) # Create cluster according to groups adata.obs["true_groups"] = pd.Categorical( - np.concatenate( - ( - np.zeros((10,), dtype=int), - np.ones((90,), dtype=int), - ) - ) + np.concatenate((np.zeros((10,), dtype=int), np.ones((90,), dtype=int))) ) return adata -def get_true_scores(): - with Path(HERE, "objs_t_test.pkl").open("rb") as f: +def get_true_scores() -> ( + tuple[ + NDArray[np.object_], + NDArray[np.object_], + NDArray[np.floating], + NDArray[np.floating], + ] +): + with (DATA_PATH / "objs_t_test.pkl").open("rb") as f: true_scores_t_test, true_names_t_test = pickle.load(f) - with Path(HERE, "objs_wilcoxon.pkl").open("rb") as f: + with (DATA_PATH / "objs_wilcoxon.pkl").open("rb") as f: true_scores_wilcoxon, true_names_wilcoxon = pickle.load(f) return ( @@ -70,10 +77,12 @@ def get_true_scores(): ) -def test_results_dense(): +# TODO: Make dask compatible +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) +def test_results(array_type): seed(1234) - adata = get_example_data() + adata = get_example_data(array_type) assert adata.raw is None # Assumption for later checks ( @@ -113,61 +122,15 @@ def test_results_dense(): assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False -def test_results_sparse(): - seed(1234) - - adata = get_example_data(sparse=True) - - ( - true_names_t_test, - true_names_wilcoxon, - true_scores_t_test, - true_scores_wilcoxon, - ) = get_true_scores() - - rank_genes_groups(adata, "true_groups", n_genes=20, method="t-test") - - adata.uns["rank_genes_groups"]["names"] = adata.uns["rank_genes_groups"][ - "names" - ].astype(true_names_t_test.dtype) - - for name in true_scores_t_test.dtype.names: - assert np.allclose( - true_scores_t_test[name], adata.uns["rank_genes_groups"]["scores"][name] - ) - assert np.array_equal(true_names_t_test, adata.uns["rank_genes_groups"]["names"]) - assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False - - rank_genes_groups(adata, "true_groups", n_genes=20, method="wilcoxon") - - adata.uns["rank_genes_groups"]["names"] = adata.uns["rank_genes_groups"][ - "names" - ].astype(true_names_wilcoxon.dtype) - - for name in true_scores_t_test.dtype.names: - assert np.allclose( - true_scores_wilcoxon[name][:7], - adata.uns["rank_genes_groups"]["scores"][name][:7], - ) - assert np.array_equal( - true_names_wilcoxon[:7], adata.uns["rank_genes_groups"]["names"][:7] - ) - assert adata.uns["rank_genes_groups"]["params"]["use_raw"] is False - - -def test_results_layers(): +@pytest.mark.parametrize("array_type", ARRAY_TYPES_MEM) +def test_results_layers(array_type): seed(1234) - adata = get_example_data(sparse=False) + adata = get_example_data(array_type) adata.layers["to_test"] = adata.X.copy() - adata.X = adata.X * np.random.randint(0, 2, adata.shape, dtype=bool) + adata.X = elem_mul(adata.X, np.random.randint(0, 2, adata.shape, dtype=bool)) - ( - true_names_t_test, - true_names_wilcoxon, - true_scores_t_test, - true_scores_wilcoxon, - ) = get_true_scores() + _, _, true_scores_t_test, true_scores_wilcoxon = get_true_scores() # Wilcoxon rank_genes_groups( diff --git a/scanpy/tests/test_score_genes.py b/scanpy/tests/test_score_genes.py index da983ebf22..9601fe8150 100644 --- a/scanpy/tests/test_score_genes.py +++ b/scanpy/tests/test_score_genes.py @@ -64,8 +64,8 @@ def test_score_with_reference(): sc.tl.score_genes(adata, gene_list=adata.var_names[:100], score_name="Test") with Path(HERE, "score_genes_reference_paul2015.pkl").open("rb") as file: reference = pickle.load(file) - # assert np.allclose(reference, adata.obs.Test.values) - assert np.array_equal(reference, adata.obs.Test.values) + # np.testing.assert_allclose(reference, adata.obs.Test.values) + np.testing.assert_array_equal(reference, adata.obs.Test.values) def test_add_score(): diff --git a/scanpy/tests/test_utils.py b/scanpy/tests/test_utils.py index 89b79bf8c2..71ea4d9998 100644 --- a/scanpy/tests/test_utils.py +++ b/scanpy/tests/test_utils.py @@ -1,14 +1,19 @@ from types import ModuleType + import pytest -from scipy.sparse import csr_matrix import numpy as np +from scipy.sparse import csr_matrix +from anndata.tests.helpers import asarray from scanpy._utils import ( descend_classes_and_funcs, check_nonnegative_integers, + elem_mul, is_constant, ) from scanpy.testing._pytest.marks import needs +from scanpy._compat import DaskArray +from scanpy.testing._pytest.params import ARRAY_TYPES, ARRAY_TYPES_SUPPORTED def test_descend_classes_and_funcs(): @@ -28,22 +33,51 @@ def test_descend_classes_and_funcs(): assert {a.A, a.b.B} == set(descend_classes_and_funcs(a, "a")) -def test_check_nonnegative_integers(): - X = np.random.poisson(size=(100, 100)).astype(np.float64) - assert check_nonnegative_integers(X) is True - assert check_nonnegative_integers(-X) is False - - X_ = X + np.random.normal(size=(100, 100)) - assert check_nonnegative_integers(X_) is False - - X = csr_matrix(X) - assert check_nonnegative_integers(X) is True - assert check_nonnegative_integers(-X) is False - - X_ = csr_matrix(X_) - assert check_nonnegative_integers(X_) is False +# TODO: add support for dask-in-sparse +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) +def test_elem_mul(array_type): + m1 = array_type([[0, 1, 1], [1, 0, 1]]) + m2 = array_type([[2, 2, 1], [3, 2, 0]]) + expd = np.array([[0, 2, 1], [3, 0, 0]]) + res = asarray(elem_mul(m1, m2)) + np.testing.assert_array_equal(res, expd) +@pytest.mark.parametrize("array_type", ARRAY_TYPES) +@pytest.mark.parametrize( + ("array_value", "expected"), + [ + pytest.param( + np.random.poisson(size=(100, 100)).astype(np.float64), + True, + id="poisson-float64", + ), + pytest.param( + np.random.poisson(size=(100, 100)).astype(np.uint32), + True, + id="poisson-uint32", + ), + pytest.param(np.random.normal(size=(100, 100)), False, id="normal"), + pytest.param(np.array([[0, 0, 0], [0, -1, 0], [0, 0, 0]]), False, id="middle"), + ], +) +def test_check_nonnegative_integers(array_type, array_value, expected): + X = array_type(array_value) + + received = check_nonnegative_integers(X) + if isinstance(X, DaskArray): + assert isinstance(received, DaskArray) + # compute + received = received.compute() + assert not isinstance(received, DaskArray) + if isinstance(received, np.bool_): + # convert to python bool + received = received.item() + assert received is expected + + +# TODO: Make it work for sparse-in-dask +@pytest.mark.parametrize("array_type", ARRAY_TYPES_SUPPORTED) def test_is_constant(array_type): constant_inds = [1, 3] A = np.arange(20).reshape(5, 4) diff --git a/scanpy/tools/_rank_genes_groups.py b/scanpy/tools/_rank_genes_groups.py index b655cdf5f4..57e7adb60d 100644 --- a/scanpy/tools/_rank_genes_groups.py +++ b/scanpy/tools/_rank_genes_groups.py @@ -1,25 +1,27 @@ """Rank genes according to differential expression. """ +from __future__ import annotations + from math import floor -from typing import Iterable, Union, Optional, Literal +from typing import Generator, Iterable, Union, Optional, Literal, get_args import numpy as np import pandas as pd +from numpy.typing import NDArray from anndata import AnnData from scipy.sparse import issparse, vstack from .. import _utils from .. import logging as logg from ..preprocessing._simple import _get_mean_var -from ..get import _get_obs_rep from .._utils import check_nonnegative_integers -_Method = Optional[Literal["logreg", "t-test", "wilcoxon", "t-test_overestim_var"]] +_Method = Literal["logreg", "t-test", "wilcoxon", "t-test_overestim_var"] _CorrMethod = Literal["benjamini-hochberg", "bonferroni"] -def _select_top_n(scores, n_top): +def _select_top_n(scores: NDArray, n_top: int): n_from = scores.shape[0] reference_indices = np.arange(n_from, dtype=int) partition = np.argpartition(scores, -n_top)[-n_top:] @@ -80,14 +82,15 @@ def _tiecorrect(ranks): class _RankGenes: def __init__( self, - adata, - groups, - groupby, + adata: AnnData, + groups: list[str] | Literal["all"], + groupby: str, + *, reference="rest", - use_raw=True, - layer=None, - comp_pts=False, - ): + use_raw: bool = True, + layer: str | None = None, + comp_pts: bool = False, + ) -> None: if "log1p" in adata.uns_keys() and adata.uns["log1p"].get("base") is not None: self.expm1_func = lambda x: np.expm1(x * np.log(adata.uns["log1p"]["base"])) else: @@ -145,7 +148,8 @@ def __init__( self.grouping_mask = adata.obs[groupby].isin(self.groups_order) self.grouping = adata.obs.loc[self.grouping_mask, groupby] - def _basic_stats(self): + def _basic_stats(self) -> None: + """Set self.{means,vars,pts}{,_rest} depending on X.""" n_genes = self.X.shape[1] n_groups = self.groups_masks.shape[0] @@ -192,17 +196,19 @@ def _basic_stats(self): # deleting the next line causes a memory leak for some reason del X_rest - def t_test(self, method): + def t_test( + self, method: Literal["t-test", "t-test_overestim_var"] + ) -> Generator[tuple[int, NDArray[np.floating], NDArray[np.floating]], None, None]: from scipy import stats self._basic_stats() - for group_index, mask in enumerate(self.groups_masks): + for group_index, (mask, mean_group, var_group) in enumerate( + zip(self.groups_masks, self.means, self.vars) + ): if self.ireference is not None and group_index == self.ireference: continue - mean_group = self.means[group_index] - var_group = self.vars[group_index] ns_group = np.count_nonzero(mask) if self.ireference is not None: @@ -242,7 +248,9 @@ def t_test(self, method): yield group_index, scores, pvals - def wilcoxon(self, tie_correct): + def wilcoxon( + self, tie_correct: bool + ) -> Generator[tuple[int, NDArray[np.floating], NDArray[np.floating]], None, None]: from scipy import stats self._basic_stats() @@ -327,7 +335,9 @@ def wilcoxon(self, tie_correct): yield group_index, scores[group_index], pvals - def logreg(self, **kwds): + def logreg( + self, **kwds + ) -> Generator[tuple[int, NDArray[np.floating], None], None, None]: # if reference is not set, then the groups listed will be compared to the rest # if reference is set, then the groups listed will be compared only to the other groups listed from sklearn.linear_model import LogisticRegression @@ -359,13 +369,14 @@ def logreg(self, **kwds): def compute_statistics( self, - method, - corr_method="benjamini-hochberg", - n_genes_user=None, - rankby_abs=False, - tie_correct=False, + method: _Method, + *, + corr_method: _CorrMethod = "benjamini-hochberg", + n_genes_user: int | None = None, + rankby_abs: bool = False, + tie_correct: bool = False, **kwds, - ): + ) -> None: if method in {"t-test", "t-test_overestim_var"}: generate_test_results = self.t_test(method) elif method == "wilcoxon": @@ -439,7 +450,7 @@ def rank_genes_groups( pts: bool = False, key_added: Optional[str] = None, copy: bool = False, - method: _Method = None, + method: _Method | None = None, corr_method: _CorrMethod = "benjamini-hochberg", tie_correct: bool = False, layer: Optional[str] = None, @@ -552,7 +563,7 @@ def rank_genes_groups( rankby_abs = not kwds.pop("only_positive") # backwards compat start = logg.info("ranking genes") - avail_methods = {"t-test", "t-test_overestim_var", "wilcoxon", "logreg"} + avail_methods = set(get_args(_Method)) if method not in avail_methods: raise ValueError(f"Method must be one of {avail_methods}.") @@ -591,7 +602,15 @@ def rank_genes_groups( corr_method=corr_method, ) - test_obj = _RankGenes(adata, groups_order, groupby, reference, use_raw, layer, pts) + test_obj = _RankGenes( + adata, + groups_order, + groupby, + reference=reference, + use_raw=use_raw, + layer=layer, + comp_pts=pts, + ) if check_nonnegative_integers(test_obj.X) and method != "logreg": logg.warning( @@ -610,7 +629,12 @@ def rank_genes_groups( logg.debug(f"with sizes: {np.count_nonzero(test_obj.groups_masks, axis=1)}") test_obj.compute_statistics( - method, corr_method, n_genes_user, rankby_abs, tie_correct, **kwds + method, + corr_method=corr_method, + n_genes_user=n_genes_user, + rankby_abs=rankby_abs, + tie_correct=tie_correct, + **kwds, ) if test_obj.pts is not None: