From 1249c6b1dc45ca5dea6bcb6146ab152c478d4dad Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 10 Dec 2024 21:55:26 +0100 Subject: [PATCH 1/7] Cache Dask arrays to speed up loading files with multiple variables --- lib/iris/_lazy_data.py | 53 +++++++++++++++++-- .../tests/unit/lazy_data/test_as_lazy_data.py | 2 +- 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index a3dfa1edb4..8095f9dea5 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -202,6 +202,7 @@ def _optimum_chunksize_internals( dim = working[0] working = working[1:] result.append(dim) + result = tuple(result) return result @@ -227,6 +228,33 @@ def _optimum_chunksize( ) +class LRUCache: + def __init__(self, maxsize: int) -> None: + self._cache: dict = {} + self.maxsize = maxsize + + def __getitem__(self, key): + value = self._cache.pop(key) + self._cache[key] = value + return value + + def __setitem__(self, key, value): + self._cache[key] = value + if len(self._cache) > self.maxsize: + self._cache.pop(next(iter(self._cache))) + + def __contains__(self, key): + return key in self._cache + + def __repr__(self): + return ( + f"<{self.__class__.__name__} maxsize={self.maxsize} cache={self._cache!r} >" + ) + + +CACHE = LRUCache(100) + + def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): """Convert the input array `data` to a :class:`dask.array.Array`. @@ -264,6 +292,8 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): but reduced by a factor if that exceeds the dask default chunksize. 
""" + from iris.fileformats.netcdf._thread_safe_nc import NetCDFDataProxy + if isinstance(data, ma.core.MaskedConstant): data = ma.masked_array(data.data, mask=data.mask) @@ -277,7 +307,7 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): if chunks is None: # No existing chunks : Make a chunk the shape of the entire input array # (but we will subdivide it if too big). - chunks = list(data.shape) + chunks = tuple(data.shape) # Adjust chunk size for better dask performance, # NOTE: but only if no shape dimension is zero, so that we can handle the @@ -291,9 +321,24 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): dims_fixed=dims_fixed, ) - if not is_lazy_data(data): - data = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta) - return data + # Define a cache key for caching arrays created from NetCDFDataProxy objects. + # Creating new Dask arrays is relatively slow, therefore caching is beneficial + # if many cubes in the same file share coordinate arrays. 
+ if isinstance(data, NetCDFDataProxy): + key = (repr(data), chunks, asarray, meta.dtype, type(meta)) + else: + key = None + + if is_lazy_data(data): + result = data + elif key in CACHE: + result = CACHE[key].copy() + else: + result = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta) + if key is not None: + CACHE[key] = result.copy() + + return result def _co_realise_lazy_arrays(arrays): diff --git a/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py b/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py index 821370ce6c..a0c13208ee 100644 --- a/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py +++ b/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py @@ -161,7 +161,7 @@ def test_default_chunks_limiting(self, mocker): as_lazy_data(data) assert limitcall_patch.call_args_list == [ mock.call( - list(test_shape), + tuple(test_shape), shape=test_shape, dtype=np.dtype("f4"), dims_fixed=None, From 0533c05972248253d0bd1d6ef64cab1999e26fa4 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 11 Dec 2024 13:11:31 +0100 Subject: [PATCH 2/7] Add benchmark for files with many cubes --- benchmarks/benchmarks/load/__init__.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/benchmarks/benchmarks/load/__init__.py b/benchmarks/benchmarks/load/__init__.py index a4dfb40d19..5c5a62a515 100644 --- a/benchmarks/benchmarks/load/__init__.py +++ b/benchmarks/benchmarks/load/__init__.py @@ -132,6 +132,58 @@ def time_many_var_load(self) -> None: _ = load(str(self.FILE_PATH)) +class ManyCubes: + FILE_PATH = BENCHMARK_DATA / "many_cube_file.nc" + + @staticmethod + def _create_file(save_path: str) -> None: + """Run externally - everything must be self-contained.""" + import numpy as np + + from iris import save + from iris.coords import AuxCoord, DimCoord + from iris.cube import Cube, CubeList + + data_len = 81920 + bnds_len = 3 + data = np.arange(data_len).astype(np.float32) + bnds_data = ( + np.arange(data_len * bnds_len) + .astype(np.float32) + 
.reshape(data_len, bnds_len) + ) + time = DimCoord(np.array([0]), standard_name="time") + lat = AuxCoord( + data, bounds=bnds_data, standard_name="latitude", units="degrees" + ) + lon = AuxCoord( + data, bounds=bnds_data, standard_name="longitude", units="degrees" + ) + cube = Cube(data.reshape(1, -1), units="unknown") + cube.add_dim_coord(time, 0) + cube.add_aux_coord(lat, 1) + cube.add_aux_coord(lon, 1) + + n_cubes = 100 + cubes = CubeList() + for i in range(n_cubes): + cube = cube.copy() + cube.long_name = f"var_{i}" + cubes.append(cube) + save(cubes, save_path) + + def setup_cache(self) -> None: + if not REUSE_DATA or not self.FILE_PATH.is_file(): + # See :mod:`benchmarks.generate_data` docstring for full explanation. + _ = run_function_elsewhere( + self._create_file, + str(self.FILE_PATH), + ) + + def time_many_cube_load(self) -> None: + _ = load(str(self.FILE_PATH)) + + class StructuredFF: """Test structured loading of a large-ish fieldsfile. From 36ce6a6500e371279a7039b8a62e0b47c9b5c6f5 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 21:42:41 +0100 Subject: [PATCH 3/7] Add whatsnew --- docs/src/whatsnew/latest.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 74d090a006..0afe57455a 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -61,7 +61,7 @@ This document explains the changes made to Iris for this release =========================== #. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from small NetCDF - files faster. (:pull:`6229`) + files faster. (:pull:`6229` and :pull:`6252`) #. 
`@fnattino`_ enabled lazy cube interpolation using the linear and nearest-neighbour interpolators (:class:`iris.analysis.Linear` and From e3e9647d01c0c566da2d1c1f040c63c9f31a4016 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 21:59:16 +0100 Subject: [PATCH 4/7] Add test --- .../tests/unit/lazy_data/test_lrucache.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 lib/iris/tests/unit/lazy_data/test_lrucache.py diff --git a/lib/iris/tests/unit/lazy_data/test_lrucache.py b/lib/iris/tests/unit/lazy_data/test_lrucache.py new file mode 100644 index 0000000000..c26640da49 --- /dev/null +++ b/lib/iris/tests/unit/lazy_data/test_lrucache.py @@ -0,0 +1,19 @@ +from iris._lazy_data import LRUCache + + +def test_lrucache(): + cache = LRUCache(2) + + cache["a"] = 1 + + assert "a" in cache + assert cache["a"] == 1 + + cache["b"] = 2 + cache["c"] = 3 + + assert "a" not in cache + assert "b" in cache + assert "c" in cache + + assert str(cache) == "<LRUCache maxsize=2 cache={'b': 2, 'c': 3} >" From 71c665c4cbf277c3b296d7a7adebd56538a2860e Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 22:11:32 +0100 Subject: [PATCH 5/7] Add license header --- lib/iris/tests/unit/lazy_data/test_lrucache.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/iris/tests/unit/lazy_data/test_lrucache.py b/lib/iris/tests/unit/lazy_data/test_lrucache.py index c26640da49..2a8abb0cde 100644 --- a/lib/iris/tests/unit/lazy_data/test_lrucache.py +++ b/lib/iris/tests/unit/lazy_data/test_lrucache.py @@ -1,3 +1,9 @@ +# Copyright Iris contributors +# +# This file is part of Iris and is released under the BSD license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Test function :func:`iris._lazy data.LRUCache`.""" + from iris._lazy_data import LRUCache From 8d75e1b135daea489bd3718e016ced2c16105d8e Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 13 Feb 2025 22:04:29 +0100 Subject: [PATCH 6/7] Use a global to set the cache size --- lib/iris/_lazy_data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 8095f9dea5..7546aaea04 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -19,6 +19,9 @@ import numpy as np import numpy.ma as ma +MAX_CACHE_SIZE = 100 +"""Maximum number of Dask arrays to cache.""" + def non_lazy(func): """Turn a lazy function into a function that returns a result immediately.""" @@ -252,7 +255,7 @@ def __repr__(self): ) -CACHE = LRUCache(100) +CACHE = LRUCache(MAX_CACHE_SIZE) def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None): From a79562cb20980846319ddd5ea93f50057ef56e07 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 13 Feb 2025 22:04:49 +0100 Subject: [PATCH 7/7] Update whatsnew --- docs/src/whatsnew/latest.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 0afe57455a..68737d86cc 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -60,8 +60,8 @@ This document explains the changes made to Iris for this release 🚀 Performance Enhancements =========================== -#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from small NetCDF - files faster. (:pull:`6229` and :pull:`6252`) +#. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from NetCDF files + faster. (:pull:`6229` and :pull:`6252`) #. `@fnattino`_ enabled lazy cube interpolation using the linear and nearest-neighbour interpolators (:class:`iris.analysis.Linear` and