
Commit c61b12f

Cache Dask arrays to speed up loading files with multiple variables
1 parent 0fdedb4 commit c61b12f
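
Context for the change: when a NetCDF file contains many variables, every resulting cube carries lazy arrays for the same coordinate variables, and building a fresh Dask array for each one is comparatively slow. A hedged sketch of the workload this targets (file name hypothetical):

    import iris

    # Each cube in this file shares coordinates such as latitude/longitude.
    # Without caching, as_lazy_data rebuilds an identical dask array for every
    # shared coordinate; with the cache added below, each graph is built once
    # and cheap copies are handed out.
    cubes = iris.load("many_variables.nc")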

2 files changed (+48, -5 lines)

lib/iris/_lazy_data.py

Lines changed: 47 additions & 4 deletions
@@ -202,6 +202,7 @@ def _optimum_chunksize_internals(
             dim = working[0]
             working = working[1:]
             result.append(dim)
+    result = tuple(result)

     return result

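The list-to-tuple change here (and the matching ones in `as_lazy_data` and the test below) makes the computed chunks hashable, which they must be to take part in the cache key introduced further down: tuples of hashables can serve as dict keys, lists cannot. A quick standalone illustration:

    chunks = (1000, 1000)
    hash(("proxy-repr-placeholder", chunks))  # fine: the whole key hashes

    try:
        hash(("proxy-repr-placeholder", [1000, 1000]))
    except TypeError as err:
        print(err)  # unhashable type: 'list'
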
@@ -227,6 +228,33 @@ def _optimum_chunksize(
     )


+class LRUCache:
+    def __init__(self, maxsize: int) -> None:
+        self._cache: dict = {}
+        self.maxsize = maxsize
+
+    def __getitem__(self, key):
+        value = self._cache.pop(key)
+        self._cache[key] = value
+        return value
+
+    def __setitem__(self, key, value):
+        self._cache[key] = value
+        if len(self._cache) > self.maxsize:
+            self._cache.pop(next(iter(self._cache)))
+
+    def __contains__(self, key):
+        return key in self._cache
+
+    def __repr__(self):
+        return (
+            f"<{self.__class__.__name__} maxsize={self.maxsize} cache={self._cache!r} >"
+        )
+
+
+CACHE = LRUCache(100)
+
+
 def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None):
     """Convert the input array `data` to a :class:`dask.array.Array`.

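`LRUCache` is a minimal least-recently-used cache built on dict insertion order: `__getitem__` pops and re-inserts a key to mark it as freshly used, and `__setitem__` drops the oldest entry once `maxsize` is exceeded. A small sketch of the resulting behaviour, using the class as defined above:

    cache = LRUCache(maxsize=2)
    cache["a"] = 1
    cache["b"] = 2
    cache["a"]      # re-inserted, so "b" is now the least recently used
    cache["c"] = 3  # over maxsize: evicts "b", the first key in dict order
    assert "a" in cache and "c" in cache and "b" not in cache
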
@@ -264,6 +292,8 @@ def as_lazy_data(data, chunks=None, asarray=False, meta=None, dims_fixed=None):
     but reduced by a factor if that exceeds the dask default chunksize.

     """
+    from iris.fileformats.netcdf._thread_safe_nc import NetCDFDataProxy
+
     if isinstance(data, ma.core.MaskedConstant):
         data = ma.masked_array(data.data, mask=data.mask)

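The import is local to `as_lazy_data` rather than at module level, presumably because `iris.fileformats.netcdf` itself depends (directly or indirectly) on `iris._lazy_data`: a function-local import defers resolution to call time, which is the usual way to sidestep such a cycle. The mechanics, shown with an unrelated stdlib module:

    def convert(value):
        # Resolved each time convert() runs, not when the enclosing module is
        # imported -- the same deferral as the NetCDFDataProxy import above.
        from fractions import Fraction
        return Fraction(value)

    print(convert(2))  # Fraction(2, 1)
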
@@ -277,7 +307,7 @@
     if chunks is None:
         # No existing chunks : Make a chunk the shape of the entire input array
         # (but we will subdivide it if too big).
-        chunks = list(data.shape)
+        chunks = tuple(data.shape)

     # Adjust chunk size for better dask performance,
     # NOTE: but only if no shape dimension is zero, so that we can handle the
@@ -291,9 +321,22 @@
             dims_fixed=dims_fixed,
         )

-    if not is_lazy_data(data):
-        data = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta)
-    return data
+    # Define a cache key for caching arrays created from NetCDFDataProxy objects.
+    # Creating new Dask arrays is relatively slow, therefore caching is beneficial
+    # if many cubes in the same file share coordinate arrays.
+    if isinstance(data, NetCDFDataProxy):
+        key = (repr(data), chunks, asarray, meta.dtype, type(meta))
+    else:
+        key = None
+
+    if is_lazy_data(data):
+        result = data
+    else:
+        if key not in CACHE:
+            CACHE[key] = da.from_array(data, chunks=chunks, asarray=asarray, meta=meta)
+        result = CACHE[key].copy()
+
+    return result


 def _co_realise_lazy_arrays(arrays):
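
The key couples `repr(data)` with every argument that shapes the constructed array, so two proxies for the same variable, chunked identically, land in the same cache slot; `CACHE[key].copy()` then gives each caller a distinct `dask.array.Array` object over the shared task graph instead of paying for `da.from_array` again. An illustrative key (all values are stand-ins; the actual `NetCDFDataProxy` repr is not shown in this diff):

    import numpy as np

    key = (
        "<NetCDFDataProxy shape=(240, 120) dtype=float32 ...>",  # repr(data)
        (240, 120),           # chunks -- hashable now that it is a tuple
        False,                # asarray
        np.dtype("float32"),  # meta.dtype
        np.ma.MaskedArray,    # type(meta)
    )
    hash(key)  # usable as a key in the dict behind LRUCache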

lib/iris/tests/unit/lazy_data/test_as_lazy_data.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def test_default_chunks_limiting(self, mocker):
         as_lazy_data(data)
         assert limitcall_patch.call_args_list == [
             mock.call(
-                list(test_shape),
+                tuple(test_shape),
                 shape=test_shape,
                 dtype=np.dtype("f4"),
                 dims_fixed=None,
