Commit 2f15f79

(feat): calculate_qc_metrics with dask (#3307)
Co-authored-by: Philipp A. <[email protected]>
1 parent 5c0e89e · commit 2f15f79

5 files changed, +208 -86 lines changed


docs/release-notes/3307.feature.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Add support for {class}`dask.array.Array` to {func}`scanpy.pp.calculate_qc_metrics` {smaller}`I Gold`
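
Not part of the commit, but a minimal usage sketch of what this note describes: running QC metrics on a dask-backed AnnData. The shapes, chunking, and random data below are illustrative assumptions.

```python
# Hypothetical usage sketch: QC metrics on a dask-backed AnnData.
import dask.array as da
import numpy as np
from anndata import AnnData

import scanpy as sc

rng = np.random.default_rng(0)
X = da.from_array(rng.poisson(1.0, size=(1000, 200)), chunks=(100, 200))
adata = AnnData(X=X)

# Same call as for in-memory arrays; metrics land in .obs and .var.
sc.pp.calculate_qc_metrics(adata, percent_top=[50, 100], log1p=True, inplace=True)
print(adata.obs["n_genes_by_counts"].head())
```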

src/scanpy/_utils/__init__.py

Lines changed: 22 additions & 1 deletion
@@ -54,7 +54,7 @@
 from typing import Any, TypeVar
 
 from anndata import AnnData
-from numpy.typing import DTypeLike, NDArray
+from numpy.typing import ArrayLike, DTypeLike, NDArray
 
 from ..neighbors import NeighborsParams, RPForestDict
 
@@ -738,6 +738,27 @@ def _(
     )
 
 
+@singledispatch
+def axis_nnz(X: ArrayLike, axis: Literal[0, 1]) -> np.ndarray:
+    return np.count_nonzero(X, axis=axis)
+
+
+@axis_nnz.register(sparse.spmatrix)
+def _(X: sparse.spmatrix, axis: Literal[0, 1]) -> np.ndarray:
+    return X.getnnz(axis=axis)
+
+
+@axis_nnz.register(DaskArray)
+def _(X: DaskArray, axis: Literal[0, 1]) -> DaskArray:
+    return X.map_blocks(
+        partial(axis_nnz, axis=axis),
+        dtype=np.int64,
+        meta=np.array([], dtype=np.int64),
+        drop_axis=0,
+        chunks=len(X.to_delayed()) * (X.chunksize[int(not axis)],),
+    )
+
+
 @overload
 def axis_sum(
     X: sparse.spmatrix,
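
A toy illustration of the new `axis_nnz` helper (import path as added in this file): the dense and sparse registrations return eager per-row nonzero counts, while the `DaskArray` registration maps the count over blocks and stays lazy until computed. The data here is made up.

```python
# Small sketch of axis_nnz dispatching on input type (toy data).
import dask.array as da
import numpy as np
from scipy import sparse

from scanpy._utils import axis_nnz

dense = np.array([[0, 1, 2], [0, 0, 3]])
print(axis_nnz(dense, axis=1))                     # [2 1]
print(axis_nnz(sparse.csr_matrix(dense), axis=1))  # [2 1]

# The DaskArray registration returns a lazy array; compute() materializes it.
lazy = axis_nnz(da.from_array(dense, chunks=(1, 3)), axis=1)
print(lazy.compute())                              # [2 1]
```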

src/scanpy/preprocessing/_qc.py

Lines changed: 50 additions & 37 deletions
@@ -1,15 +1,19 @@
 from __future__ import annotations
 
+from functools import singledispatch
 from typing import TYPE_CHECKING
 from warnings import warn
 
 import numba
 import numpy as np
 import pandas as pd
-from scipy.sparse import csr_matrix, issparse, isspmatrix_coo, isspmatrix_csr
-from sklearn.utils.sparsefuncs import mean_variance_axis
+from scipy.sparse import csr_matrix, issparse, isspmatrix_coo, isspmatrix_csr, spmatrix
 
-from .._utils import _doc_params
+from scanpy.preprocessing._distributed import materialize_as_ndarray
+from scanpy.preprocessing._utils import _get_mean_var
+
+from .._compat import DaskArray
+from .._utils import _doc_params, axis_nnz, axis_sum
 from ._docs import (
     doc_adata_basic,
     doc_expr_reps,
@@ -23,7 +27,6 @@
     from collections.abc import Collection
 
     from anndata import AnnData
-    from scipy.sparse import spmatrix
 
 
 def _choose_mtx_rep(adata, *, use_raw: bool = False, layer: str | None = None):
@@ -104,15 +107,14 @@ def describe_obs(
     if issparse(X):
         X.eliminate_zeros()
     obs_metrics = pd.DataFrame(index=adata.obs_names)
-    if issparse(X):
-        obs_metrics[f"n_{var_type}_by_{expr_type}"] = X.getnnz(axis=1)
-    else:
-        obs_metrics[f"n_{var_type}_by_{expr_type}"] = np.count_nonzero(X, axis=1)
+    obs_metrics[f"n_{var_type}_by_{expr_type}"] = materialize_as_ndarray(
+        axis_nnz(X, axis=1)
+    )
     if log1p:
         obs_metrics[f"log1p_n_{var_type}_by_{expr_type}"] = np.log1p(
             obs_metrics[f"n_{var_type}_by_{expr_type}"]
         )
-    obs_metrics[f"total_{expr_type}"] = np.ravel(X.sum(axis=1))
+    obs_metrics[f"total_{expr_type}"] = np.ravel(axis_sum(X, axis=1))
     if log1p:
         obs_metrics[f"log1p_total_{expr_type}"] = np.log1p(
             obs_metrics[f"total_{expr_type}"]
@@ -126,7 +128,7 @@ def describe_obs(
     )
     for qc_var in qc_vars:
         obs_metrics[f"total_{expr_type}_{qc_var}"] = np.ravel(
-            X[:, adata.var[qc_var].values].sum(axis=1)
+            axis_sum(X[:, adata.var[qc_var].values], axis=1)
         )
         if log1p:
             obs_metrics[f"log1p_total_{expr_type}_{qc_var}"] = np.log1p(
@@ -141,6 +143,7 @@ def describe_obs(
         adata.obs[obs_metrics.columns] = obs_metrics
     else:
         return obs_metrics
+    return None
 
 
 @_doc_params(
@@ -191,21 +194,17 @@ def describe_var(
     if issparse(X):
         X.eliminate_zeros()
     var_metrics = pd.DataFrame(index=adata.var_names)
-    if issparse(X):
-        # Current memory bottleneck for csr matrices:
-        var_metrics["n_cells_by_{expr_type}"] = X.getnnz(axis=0)
-        var_metrics["mean_{expr_type}"] = mean_variance_axis(X, axis=0)[0]
-    else:
-        var_metrics["n_cells_by_{expr_type}"] = np.count_nonzero(X, axis=0)
-        var_metrics["mean_{expr_type}"] = X.mean(axis=0)
+    var_metrics["n_cells_by_{expr_type}"], var_metrics["mean_{expr_type}"] = (
+        materialize_as_ndarray((axis_nnz(X, axis=0), _get_mean_var(X, axis=0)[0]))
+    )
     if log1p:
         var_metrics["log1p_mean_{expr_type}"] = np.log1p(
             var_metrics["mean_{expr_type}"]
         )
     var_metrics["pct_dropout_by_{expr_type}"] = (
         1 - var_metrics["n_cells_by_{expr_type}"] / X.shape[0]
     ) * 100
-    var_metrics["total_{expr_type}"] = np.ravel(X.sum(axis=0))
+    var_metrics["total_{expr_type}"] = np.ravel(axis_sum(X, axis=0))
     if log1p:
         var_metrics["log1p_total_{expr_type}"] = np.log1p(
             var_metrics["total_{expr_type}"]
@@ -217,8 +216,8 @@ def describe_var(
     var_metrics.columns = new_colnames
     if inplace:
         adata.var[var_metrics.columns] = var_metrics
-    else:
-        return var_metrics
+        return None
+    return var_metrics
 
 
 @_doc_params(
@@ -387,9 +386,18 @@ def top_proportions_sparse_csr(data, indptr, n):
     return values
 
 
-def top_segment_proportions(
-    mtx: np.ndarray | spmatrix, ns: Collection[int]
-) -> np.ndarray:
+def check_ns(func):
+    def check_ns_inner(mtx: np.ndarray | spmatrix | DaskArray, ns: Collection[int]):
+        if not (max(ns) <= mtx.shape[1] and min(ns) > 0):
+            raise IndexError("Positions outside range of features.")
+        return func(mtx, ns)
+
+    return check_ns_inner
+
+
+@singledispatch
+@check_ns
+def top_segment_proportions(mtx: np.ndarray, ns: Collection[int]) -> np.ndarray:
     """
     Calculates total percentage of counts in top ns genes.
@@ -402,20 +410,6 @@ def top_segment_proportions(
         1-indexed, e.g. `ns=[50]` will calculate cumulative proportion up to
         the 50th most expressed gene.
     """
-    # Pretty much just does dispatch
-    if not (max(ns) <= mtx.shape[1] and min(ns) > 0):
-        raise IndexError("Positions outside range of features.")
-    if issparse(mtx):
-        if not isspmatrix_csr(mtx):
-            mtx = csr_matrix(mtx)
-        return top_segment_proportions_sparse_csr(mtx.data, mtx.indptr, np.array(ns))
-    else:
-        return top_segment_proportions_dense(mtx, ns)
-
-
-def top_segment_proportions_dense(
-    mtx: np.ndarray | spmatrix, ns: Collection[int]
-) -> np.ndarray:
     # Currently ns is considered to be 1 indexed
     ns = np.sort(ns)
     sums = mtx.sum(axis=1)
@@ -432,6 +426,25 @@ def top_segment_proportions_dense(
     return values / sums[:, None]
 
 
+@top_segment_proportions.register(DaskArray)
+@check_ns
+def _(mtx: DaskArray, ns: Collection[int]) -> np.ndarray:
+    if not isinstance(mtx._meta, csr_matrix | np.ndarray):
+        msg = f"DaskArray must have csr matrix or ndarray meta, got {mtx._meta}."
+        raise ValueError(msg)
+    return mtx.map_blocks(
+        lambda x: top_segment_proportions(x, ns), meta=np.array([])
+    ).compute()
+
+
+@top_segment_proportions.register(spmatrix)
+@check_ns
+def _(mtx: spmatrix, ns: Collection[int]) -> np.ndarray:
+    if not isspmatrix_csr(mtx):
+        mtx = csr_matrix(mtx)
+    return top_segment_proportions_sparse_csr(mtx.data, mtx.indptr, np.array(ns))
+
+
 @numba.njit(cache=True, parallel=True)
 def top_segment_proportions_sparse_csr(data, indptr, ns):
     # work around https://github.com/numba/numba/issues/5056
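
Worth noting about the decorator order above: the base function becomes `singledispatch(check_ns(top_segment_proportions))`, and each registered implementation re-applies `@check_ns`, so the bounds check on `ns` runs whichever backend dispatch selects. A toy call (internal module path as in this commit) gives identical results for dense and sparse inputs:

```python
# Proportion of counts in the top-1 and top-2 genes per cell (toy matrix).
import numpy as np
from scipy.sparse import csr_matrix

from scanpy.preprocessing._qc import top_segment_proportions

mtx = np.array([[3.0, 1.0, 0.0], [1.0, 1.0, 2.0]])
print(top_segment_proportions(mtx, ns=[1, 2]))
# Row 0: top-1 = 3/4 = 0.75, top-2 = 4/4 = 1.0
# Row 1: top-1 = 2/4 = 0.5,  top-2 = 3/4 = 0.75
print(top_segment_proportions(csr_matrix(mtx), ns=[1, 2]))  # same values
```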

src/testing/scanpy/_helpers/__init__.py

Lines changed: 23 additions & 1 deletion
@@ -5,8 +5,9 @@
 from __future__ import annotations
 
 import warnings
-from contextlib import AbstractContextManager
+from contextlib import AbstractContextManager, contextmanager
 from dataclasses import dataclass
+from importlib.util import find_spec
 from itertools import permutations
 from typing import TYPE_CHECKING
 
@@ -158,3 +159,24 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         for ctx in reversed(self.contexts):
             ctx.__exit__(exc_type, exc_value, traceback)
+
+
+@contextmanager
+def maybe_dask_process_context():
+    """
+    Running numba with dask's threaded scheduler causes crashes,
+    so we need to switch to single-threaded (or processes, which is slower)
+    scheduler for tests that use numba.
+    """
+    if not find_spec("dask"):
+        yield
+        return
+
+    import dask.config
+
+    prev_scheduler = dask.config.get("scheduler", "threads")
+    dask.config.set(scheduler="single-threaded")
+    try:
+        yield
+    finally:
+        dask.config.set(scheduler=prev_scheduler)
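
A hypothetical test sketch (test name and data made up) of the intended use: the body runs under dask's single-threaded scheduler so numba kernels are safe, and the previous scheduler is restored on exit.

```python
# Sketch: exercising numba-backed QC code on a dask array inside the context.
import dask.array as da
import numpy as np

from scanpy.preprocessing._qc import top_segment_proportions
from testing.scanpy._helpers import maybe_dask_process_context


def test_top_segment_proportions_dask():
    # Chunk only along rows so each block sees all columns.
    mtx = da.from_array(np.random.poisson(1.0, (40, 10)), chunks=(10, 10))
    with maybe_dask_process_context():
        props = top_segment_proportions(mtx, ns=[1, 5])
    assert props.shape == (40, 2)
```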
