From 1249c6b1dc45ca5dea6bcb6146ab152c478d4dad Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 10 Dec 2024 21:55:26 +0100 Subject: [PATCH 1/7] Cache Dask arrays to speed up loading files with multiple variables --- lib/iris/_lazy_data.py | 53 +++++++++++++++++-- .../tests/unit/lazy_data/test_as_lazy_data.py | 2 +- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index a3dfa1edb4..8095f9dea5 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -202,6 +202,7 @@ def _optimum_chunksize_internals( dim = working[0] working = working[1:] result.append(dim) + result = tuple(result) return result @@ -227,6 +228,33 @@ def _optimum_chunksize( ) +class LRUCache: + def __init__(self, maxsize: int) -> None: + self._cache: dict = {} + self.maxsize = maxsize + + def __getitem__(self, key): + value = self._cache.pop(key) + self._cache[key] = value + return value + + def __setitem__(self, key, value): + self._cache[key] = value + if len(self._cache) > self.maxsize: + self._cache.pop(next(iter(self._cache))) + + def __contains__(self, key): + return key in self._cache + + def __repr__(self): + return ( + f"<{self.__class__.__name__} maxsize={self.maxsize} cache={self._cache!r} >" + ) + + +CACHE = LRUCache(100) + + def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): """Convert the input array `data` to a :class:`dask.array.Array`. @@ -264,6 +292,8 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): but reduced by a factor if that exceeds the dask default chunksize. 
""" + from iris.fileformats.netcdf._thread_safe_nc import NetCDFDataProxy + if isinstance(data, ma.core.MaskedConstant): data = ma.masked_array(data.data, mask=data.mask) @@ -277,7 +307,7 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): if chunks is None: # No existing chunks : Make a chunk the shape of the entire input array # (but we will subdivide it if too big). - chunks = list(data.shape) + chunks = tuple(data.shape) # Adjust chunk size for better dask performance, # NOTE: but only if no shape dimension is zero, so that we can handle the @@ -291,9 +321,24 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): dims_fixed=dims_fixed, ) - if not is_lazy_data(data): - data = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta) - return data + # Define a cache key for caching arrays created from NetCDFDataProxy objects. + # Creating new Dask arrays is relatively slow, therefore caching is beneficial + # if many cubes in the same file share coordinate arrays. 
+ if isinstance(data, NetCDFDataProxy): + key = (repr(data), chunks, asarray, meta.dtype, type(meta)) + else: + key = None + + if is_lazy_data(data): + result = data + elif key in CACHE: + result = CACHE[key].copy() + else: + result = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta) + if key is not None: + CACHE[key] = result.copy() + + return result def _co_realise_lazy_arrays(arrays): diff --git a/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py b/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py index 821370ce6c..a0c13208ee 100644 --- a/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py +++ b/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py @@ -161,7 +161,7 @@ def test_default_chunks_limiting(self, mocker): as_lazy_data(data) assert limitcall_patch.call_args_list == [ mock.call( - list(test_shape), + tuple(test_shape), shape=test_shape, dtype=np.dtype("f4"), dims_fixed=None, From 0533c05972248253d0bd1d6ef64cab1999e26fa4 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 11 Dec 2024 13:11:31 +0100 Subject: [PATCH 2/7] Add benchmark for files with many cubes --- benchmarks/benchmarks/load/__init__.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/benchmarks/benchmarks/load/__init__.py b/benchmarks/benchmarks/load/__init__.py index a4dfb40d19..5c5a62a515 100644 --- a/benchmarks/benchmarks/load/__init__.py +++ b/benchmarks/benchmarks/load/__init__.py @@ -132,6 +132,58 @@ def time_many_var_load(self) -> None: _ = load(str(self.FILE_PATH)) +class ManyCubes: + FILE_PATH = BENCHMARK_DATA / "many_cube_file.nc" + + @staticmethod + def _create_file(save_path: str) -> None: + """Run externally - everything must be self-contained.""" + import numpy as np + + from iris import save + from iris.coords import AuxCoord, DimCoord + from iris.cube import Cube, CubeList + + data_len = 81920 + bnds_len = 3 + data = np.arange(data_len).astype(np.float32) + bnds_data = ( + np.arange(data_len * bnds_len) + .astype(np.float32) + 
.reshape(data_len, bnds_len) + ) + time = DimCoord(np.array([0]), standard_name="time") + lat = AuxCoord( + data, bounds=bnds_data, standard_name="latitude", units="degrees" + ) + lon = AuxCoord( + data, bounds=bnds_data, standard_name="longitude", units="degrees" + ) + cube = Cube(data.reshape(1, -1), units="unknown") + cube.add_dim_coord(time, 0) + cube.add_aux_coord(lat, 1) + cube.add_aux_coord(lon, 1) + + n_cubes = 100 + cubes = CubeList() + for i in range(n_cubes): + cube = cube.copy() + cube.long_name = f"var_{i}" + cubes.append(cube) + save(cubes, save_path) + + def setup_cache(self) -> None: + if not REUSE_DATA or not self.FILE_PATH.is_file(): + # See :mod:`benchmarks.generate_data` docstring for full explanation. + _ = run_function_elsewhere( + self._create_file, + str(self.FILE_PATH), + ) + + def time_many_cube_load(self) -> None: + _ = load(str(self.FILE_PATH)) + + class StructuredFF: """Test structured loading of a large-ish fieldsfile. From 36ce6a6500e371279a7039b8a62e0b47c9b5c6f5 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 21:42:41 +0100 Subject: [PATCH 3/7] Add whatsnew --- docs/src/whatsnew/latest.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 74d090a006..0afe57455a 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -61,7 +61,7 @@ This document explains the changes made to Iris for this release =========================== #. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from small NetCDF - files faster. (:pull:`6229`) + files faster. (:pull:`6229` and :pull:`6252`) #. 
`@fnattino`_ enabled lazy cube interpolation using the linear and nearest-neighbour interpolators (:class:`iris.analysis.Linear` and From e3e9647d01c0c566da2d1c1f040c63c9f31a4016 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 21:59:16 +0100 Subject: [PATCH 4/7] Add test --- .../tests/unit/lazy_data/test_lrucache.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 lib/iris/tests/unit/lazy_data/test_lrucache.py diff --git a/lib/iris/tests/unit/lazy_data/test_lrucache.py b/lib/iris/tests/unit/lazy_data/test_lrucache.py new file mode 100644 index 0000000000..c26640da49 --- /dev/null +++ b/lib/iris/tests/unit/lazy_data/test_lrucache.py @@ -0,0 +1,19 @@ +from iris._lazy_data import LRUCache + + +def test_lrucache(): + cache = LRUCache(2) + + cache["a"] = 1 + + assert "a" in cache + assert cache["a"] == 1 + + cache["b"] = 2 + cache["c"] = 3 + + assert "a" not in cache + assert "b" in cache + assert "c" in cache + + assert str(cache) == "<LRUCache maxsize=2 cache={'b': 2, 'c': 3} >" From 71c665c4cbf277c3b296d7a7adebd56538a2860e Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 22:11:32 +0100 Subject: [PATCH 5/7] Add license header --- lib/iris/tests/unit/lazy_data/test_lrucache.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/iris/tests/unit/lazy_data/test_lrucache.py b/lib/iris/tests/unit/lazy_data/test_lrucache.py index c26640da49..2a8abb0cde 100644 --- a/lib/iris/tests/unit/lazy_data/test_lrucache.py +++ b/lib/iris/tests/unit/lazy_data/test_lrucache.py @@ -1,3 +1,9 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Test function :func:`iris._lazy data.LRUCache`.""" + from iris._lazy_data import LRUCache From 8d75e1b135daea489bd3718e016ced2c16105d8e Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 13 Feb 2025 22:04:29 +0100 Subject: [PATCH 6/7] Use a global to set the cache size --- lib/iris/_lazy_data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 8095f9dea5..7546aaea04 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -19,6 +19,9 @@ import numpy as np import numpy.ma as ma +MAX_CACHE_SIZE = 100 +"""Maximum number of Dask arrays to cache.""" + def non_lazy(func): """Turn a lazy function into a function that returns a result immediately.""" @@ -252,7 +255,7 @@ def __repr__(self): ) -CACHE = LRUCache(100) +CACHE = LRUCache(MAX_CACHE_SIZE) def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): From a79562cb20980846319ddd5ea93f50057ef56e07 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 13 Feb 2025 22:04:49 +0100 Subject: [PATCH 7/7] Update whatsnew --- docs/src/whatsnew/latest.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 0afe57455a..68737d86cc 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -60,8 +60,8 @@ This document explains the changes made to Iris for this release 🚀 Performance Enhancements =========================== -#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from small NetCDF - files faster. (:pull:`6229` and :pull:`6252`) +#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from NetCDF files + faster. (:pull:`6229` and :pull:`6252`) #. `@fnattino`_ enabled lazy cube interpolation using the linear and nearest-neighbour interpolators (:class:`iris.analysis.Linear` and