From db0b8b3445c0a6a387875cb1bfa12da0b88c3fe6 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Fri, 15 Nov 2024 12:55:42 +0100 Subject: [PATCH 1/3] Use custom tokenization function for NetCDFDataProxy objects --- lib/iris/fileformats/netcdf/_thread_safe_nc.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 3a556f5447..9956f1480a 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -327,6 +327,12 @@ def ndim(self): def dask_meta(self): return np.ma.array(np.empty((0,) * self.ndim, dtype=self.dtype), mask=True) + def __dask_tokenize__(self): + # Dask uses this function to uniquely identify the "array". + # A custom function is slightly faster than general object tokenization, + # which improves the speed of loading small NetCDF files. + return f"<{self.__class__.__name__} path={self.path!r} variable_name={self.variable_name!r}>" + def __getitem__(self, keys): # Using a DatasetWrapper causes problems with invalid ID's and the # netCDF4 library, presumably because __getitem__ gets called so many From bfbd62551ba52cc09d1667f4c036420f6a12d807 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 11 Dec 2024 13:11:31 +0100 Subject: [PATCH 2/3] Add benchmark for files with many cubes --- benchmarks/benchmarks/load/__init__.py | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/benchmarks/benchmarks/load/__init__.py b/benchmarks/benchmarks/load/__init__.py index a4dfb40d19..5c5a62a515 100644 --- a/benchmarks/benchmarks/load/__init__.py +++ b/benchmarks/benchmarks/load/__init__.py @@ -132,6 +132,58 @@ def time_many_var_load(self) -> None: _ = load(str(self.FILE_PATH)) +class ManyCubes: + FILE_PATH = BENCHMARK_DATA / "many_cube_file.nc" + + @staticmethod + def _create_file(save_path: str) -> None: + """Run externally - everything must be self-contained.""" + import numpy as np + + from iris import save + from iris.coords import AuxCoord, DimCoord + from iris.cube import Cube, CubeList + + data_len = 81920 + bnds_len = 3 + data = np.arange(data_len).astype(np.float32) + bnds_data = ( + np.arange(data_len * bnds_len) + .astype(np.float32) + .reshape(data_len, bnds_len) + ) + time = DimCoord(np.array([0]), standard_name="time") + lat = AuxCoord( + data, bounds=bnds_data, standard_name="latitude", units="degrees" + ) + lon = AuxCoord( + data, bounds=bnds_data, standard_name="longitude", units="degrees" + ) + cube = Cube(data.reshape(1, -1), units="unknown") + cube.add_dim_coord(time, 0) + cube.add_aux_coord(lat, 1) + cube.add_aux_coord(lon, 1) + + n_cubes = 100 + cubes = CubeList() + for i in range(n_cubes): + cube = cube.copy() + cube.long_name = f"var_{i}" + cubes.append(cube) + save(cubes, save_path) + + def setup_cache(self) -> None: + if not REUSE_DATA or not self.FILE_PATH.is_file(): + # See :mod:`benchmarks.generate_data` docstring for full explanation. + _ = run_function_elsewhere( + self._create_file, + str(self.FILE_PATH), + ) + + def time_many_cube_load(self) -> None: + _ = load(str(self.FILE_PATH)) + + class StructuredFF: """Test structured loading of a large-ish fieldsfile. From 9a1a14dff2d4024f747d7344617c836fc71f45b2 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Feb 2025 21:39:50 +0100 Subject: [PATCH 3/3] Add whatsnew --- docs/src/whatsnew/latest.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 74d090a006..d9f0701a18 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -61,7 +61,7 @@ This document explains the changes made to Iris for this release =========================== #. `@bouweandela`_ made loading :class:`~iris.cube.Cube`s from small NetCDF - files faster. (:pull:`6229`) + files faster. (:pull:`6229` and :pull:`6231`) #. `@fnattino`_ enabled lazy cube interpolation using the linear and nearest-neighbour interpolators (:class:`iris.analysis.Linear` and