diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index 4a964a648d..38502c9306 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -4,5 +4,69 @@
 # See COPYING and COPYING.LESSER in the root of the repository for full
 # licensing details.
 """Common code for benchmarks."""
+import resource
+
+from .generate_data import BENCHMARK_DATA, run_function_elsewhere
 
 ARTIFICIAL_DIM_SIZE = int(10e3)  # For all artificial cubes, coords etc.
+
+
+def disable_repeat_between_setup(benchmark_object):
+    """
+    Decorator for benchmarks where object persistence would be inappropriate.
+
+    E.g.:
+    * Benchmarking data realisation
+    * Benchmarking Cube coord addition
+
+    Can be applied to benchmark classes/methods/functions.
+
+    https://asv.readthedocs.io/en/stable/benchmarks.html#timing-benchmarks
+
+    """
+    # Prevent repeat runs between setup() runs - object(s) will persist after 1st.
+    benchmark_object.number = 1
+    # Compensate for reduced certainty by increasing number of repeats.
+    #  (setup() is run between each repeat).
+    # Minimum 5 repeats, run up to 30 repeats / 20 secs whichever comes first.
+    benchmark_object.repeat = (5, 30, 20.0)
+    # ASV uses warmup to estimate benchmark time before planning the real run.
+    # Prevent this, since object(s) will persist after the first warmup run,
+    #  which would give ASV misleading info (warmups ignore ``number``).
+    benchmark_object.warmup_time = 0.0
+
+    return benchmark_object
+
+
+class TrackAddedMemoryAllocation:
+    """
+    Context manager which measures how much process resident memory grows
+    during execution of its enclosed code block.
+
+    Obviously limited as to what it actually measures: it relies on the
+    current process not having significant unused (de-allocated) memory when
+    the tested code block runs, and it is only reliable when the code
+    allocates a significant amount of new memory.
+
+    Example:
+        with TrackAddedMemoryAllocation() as mb:
+            initial_call()
+            other_call()
+        result = mb.addedmem_mb()
+
+    """
+
+    @staticmethod
+    def process_resident_memory_mb():
+        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0
+
+    def __enter__(self):
+        self.mb_before = self.process_resident_memory_mb()
+        return self
+
+    def __exit__(self, *_):
+        self.mb_after = self.process_resident_memory_mb()
+
+    def addedmem_mb(self):
+        """Return measured memory growth, in Mb."""
+        return self.mb_after - self.mb_before
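+
+
+# A minimal usage sketch for the two helpers above (illustrative only -
+#  ``make_lazy_cube`` is a hypothetical stand-in for real setup code):
+#
+#     @disable_repeat_between_setup
+#     class Realisation:
+#         def setup(self):
+#             self.cube = make_lazy_cube()
+#
+#         def track_addedmem_realise(self):
+#             with TrackAddedMemoryAllocation() as mb:
+#                 self.cube.data  # realise the lazy data
+#             return mb.addedmem_mb()
+#
+#     Realisation.track_addedmem_realise.unit = "Mb"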
diff --git a/benchmarks/benchmarks/aux_factory.py b/benchmarks/benchmarks/aux_factory.py
index 270119da71..45bfa1b515 100644
--- a/benchmarks/benchmarks/aux_factory.py
+++ b/benchmarks/benchmarks/aux_factory.py
@@ -10,9 +10,10 @@
 
 import numpy as np
 
-from benchmarks import ARTIFICIAL_DIM_SIZE
 from iris import aux_factory, coords
 
+from . import ARTIFICIAL_DIM_SIZE
+
 
 class FactoryCommon:
     # TODO: once https://github.com/airspeed-velocity/asv/pull/828 is released:
diff --git a/benchmarks/benchmarks/coords.py b/benchmarks/benchmarks/coords.py
index fce7318d49..5cea1e1e2e 100644
--- a/benchmarks/benchmarks/coords.py
+++ b/benchmarks/benchmarks/coords.py
@@ -10,9 +10,10 @@
 
 import numpy as np
 
-from benchmarks import ARTIFICIAL_DIM_SIZE
 from iris import coords
 
+from . import ARTIFICIAL_DIM_SIZE, disable_repeat_between_setup
+
 
 def setup():
     """General variables needed by multiple benchmark classes."""
@@ -92,6 +93,23 @@ def setup(self):
     def create(self):
         return coords.AuxCoord(**self.create_kwargs)
 
+    def time_points(self):
+        _ = self.component.points
+
+    def time_bounds(self):
+        _ = self.component.bounds
+
+
+@disable_repeat_between_setup
+class AuxCoordLazy(AuxCoord):
+    """Lazy equivalent of :class:`AuxCoord`."""
+
+    def setup(self):
+        super().setup()
+        self.create_kwargs["points"] = self.component.lazy_points()
+        self.create_kwargs["bounds"] = self.component.lazy_bounds()
+        self.setup_common()
+
 
 class CellMeasure(CoordCommon):
     def setup(self):
diff --git a/benchmarks/benchmarks/cube.py b/benchmarks/benchmarks/cube.py
index 3cfa6b248b..8a12391684 100644
--- a/benchmarks/benchmarks/cube.py
+++ b/benchmarks/benchmarks/cube.py
@@ -10,11 +10,13 @@
 
 import numpy as np
 
-from benchmarks import ARTIFICIAL_DIM_SIZE
 from iris import analysis, aux_factory, coords, cube
 
+from . import ARTIFICIAL_DIM_SIZE, disable_repeat_between_setup
+from .generate_data.stock import sample_meshcoord
 
-def setup():
+
+def setup(*params):
     """General variables needed by multiple benchmark classes."""
     global data_1d
     global data_2d
@@ -170,6 +172,44 @@ def setup(self):
         self.setup_common()
 
 
+class MeshCoord:
+    params = [
+        6,  # minimal cube-sphere
+        int(1e6),  # realistic cube-sphere size
+        ARTIFICIAL_DIM_SIZE,  # To match size in :class:`AuxCoord`
+    ]
+    param_names = ["number of faces"]
+
+    def setup(self, n_faces):
+        mesh_kwargs = dict(
+            n_nodes=n_faces + 2, n_edges=n_faces * 2, n_faces=n_faces
+        )
+
+        self.mesh_coord = sample_meshcoord(sample_mesh_kwargs=mesh_kwargs)
+        self.data = np.zeros(n_faces)
+        self.cube_blank = cube.Cube(data=self.data)
+        self.cube = self.create()
+
+    def create(self):
+        return cube.Cube(
+            data=self.data, aux_coords_and_dims=[(self.mesh_coord, 0)]
+        )
+
+    def time_create(self, n_faces):
+        _ = self.create()
+
+    @disable_repeat_between_setup
+    def time_add(self, n_faces):
+        self.cube_blank.add_aux_coord(self.mesh_coord, 0)
+
+    @disable_repeat_between_setup
+    def time_remove(self, n_faces):
+        self.cube.remove_coord(self.mesh_coord)
+
+    def time_return(self, n_faces):
+        _ = self.cube
+
+
 class Merge:
     def setup(self):
         self.cube_list = cube.CubeList()
diff --git a/benchmarks/benchmarks/experimental/__init__.py b/benchmarks/benchmarks/experimental/__init__.py
new file mode 100644
index 0000000000..f16e400bce
--- /dev/null
+++ b/benchmarks/benchmarks/experimental/__init__.py
@@ -0,0 +1,9 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the LGPL license.
+# See COPYING and COPYING.LESSER in the root of the repository for full
+# licensing details.
+"""
+Benchmark tests for the experimental module.
+
+"""
diff --git a/benchmarks/benchmarks/experimental/ugrid.py b/benchmarks/benchmarks/experimental/ugrid.py
new file mode 100644
index 0000000000..609abbe77c
--- /dev/null
+++ b/benchmarks/benchmarks/experimental/ugrid.py
@@ -0,0 +1,195 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the LGPL license.
+# See COPYING and COPYING.LESSER in the root of the repository for full
+# licensing details.
+"""
+Benchmark tests for the experimental.ugrid module.
+
+"""
+
+from copy import deepcopy
+
+import numpy as np
+
+from iris.experimental import ugrid
+
+from .. import ARTIFICIAL_DIM_SIZE, disable_repeat_between_setup
+from ..generate_data.stock import sample_mesh
+
+
+class UGridCommon:
+    """
+    A base class running a generalised suite of benchmarks for any ugrid
+    object. The target object is specified by each subclass.
+
+    ASV will run the benchmarks within this class for any subclasses.
+
+    ASV will not benchmark this class, as setup() triggers a
+    NotImplementedError. (ASV has not yet released ABC/abstractmethod
+    support - asv#838).
+
+    """
+
+    params = [
+        6,  # minimal cube-sphere
+        int(1e6),  # realistic cube-sphere size
+    ]
+    param_names = ["number of faces"]
+
+    def setup(self, *params):
+        self.object = self.create()
+
+    def create(self):
+        raise NotImplementedError
+
+    def time_create(self, *params):
+        """Create an instance of the benchmarked object. The create() method
+        is specified in the subclass."""
+        self.create()
+
+    def time_return(self, *params):
+        """Return an instance of the benchmarked object."""
+        _ = self.object
+
+
+class Connectivity(UGridCommon):
+    def setup(self, n_faces):
+        self.array = np.zeros([n_faces, 3], dtype=int)
+        super().setup(n_faces)
+
+    def create(self):
+        return ugrid.Connectivity(
+            indices=self.array, cf_role="face_node_connectivity"
+        )
+
+    def time_indices(self, n_faces):
+        _ = self.object.indices
+
+    def time_location_lengths(self, n_faces):
+        # Proofed against the Connectivity name change (633ed17).
+        if getattr(self.object, "src_lengths", False):
+            meth = self.object.src_lengths
+        else:
+            meth = self.object.location_lengths
+        _ = meth()
+
+    def time_validate_indices(self, n_faces):
+        self.object.validate_indices()
+
+
+@disable_repeat_between_setup
+class ConnectivityLazy(Connectivity):
+    """Lazy equivalent of :class:`Connectivity`."""
+
+    def setup(self, n_faces):
+        super().setup(n_faces)
+        self.array = self.object.lazy_indices()
+        self.object = self.create()
+
+
+class Mesh(UGridCommon):
+    def setup(self, n_faces, lazy=False):
+        ####
+        # Steal everything from the sample mesh for benchmarking creation of a
+        #  brand new mesh.
+        source_mesh = sample_mesh(
+            n_nodes=n_faces + 2,
+            n_edges=n_faces * 2,
+            n_faces=n_faces,
+            lazy_values=lazy,
+        )
+
+        def get_coords_and_axes(location):
+            search_kwargs = {f"include_{location}s": True}
+            return [
+                (source_mesh.coord(axis=axis, **search_kwargs), axis)
+                for axis in ("x", "y")
+            ]
+
+        self.mesh_kwargs = dict(
+            topology_dimension=source_mesh.topology_dimension,
+            node_coords_and_axes=get_coords_and_axes("node"),
+            connectivities=source_mesh.connectivities(),
+            edge_coords_and_axes=get_coords_and_axes("edge"),
+            face_coords_and_axes=get_coords_and_axes("face"),
+        )
+        ####
+
+        super().setup(n_faces)
+
+        self.face_node = self.object.face_node_connectivity
+        self.node_x = self.object.node_coords.node_x
+        # Kwargs for reuse in search and remove methods.
+        self.connectivities_kwarg = dict(cf_role="edge_node_connectivity")
+        self.coords_kwarg = dict(include_faces=True)
+
+        # TODO: an opportunity for speeding up runtime if needed, since
+        #  eq_object is not needed for all benchmarks. Just don't generate it
+        #  within a benchmark - the execution time is large enough that it
+        #  could be a significant portion of the benchmark - makes regressions
+        #  smaller and could even pick up regressions in copying instead!
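+        # A deepcopy is used (rather than re-using self.object) so that the
+        #  ``==`` benchmark below compares equal-but-distinct objects -
+        #  equality presumably cannot then short-circuit on object identity.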
+        self.eq_object = deepcopy(self.object)
+
+    def create(self):
+        return ugrid.Mesh(**self.mesh_kwargs)
+
+    def time_add_connectivities(self, n_faces):
+        self.object.add_connectivities(self.face_node)
+
+    def time_add_coords(self, n_faces):
+        self.object.add_coords(node_x=self.node_x)
+
+    def time_connectivities(self, n_faces):
+        _ = self.object.connectivities(**self.connectivities_kwarg)
+
+    def time_coords(self, n_faces):
+        _ = self.object.coords(**self.coords_kwarg)
+
+    def time_eq(self, n_faces):
+        _ = self.object == self.eq_object
+
+    def time_remove_connectivities(self, n_faces):
+        self.object.remove_connectivities(**self.connectivities_kwarg)
+
+    def time_remove_coords(self, n_faces):
+        self.object.remove_coords(**self.coords_kwarg)
+
+
+@disable_repeat_between_setup
+class MeshLazy(Mesh):
+    """Lazy equivalent of :class:`Mesh`."""
+
+    def setup(self, n_faces, lazy=True):
+        super().setup(n_faces, lazy=lazy)
+
+
+class MeshCoord(UGridCommon):
+    # Add extra parameter value to match AuxCoord benchmarking.
+    params = UGridCommon.params + [ARTIFICIAL_DIM_SIZE]
+
+    def setup(self, n_faces, lazy=False):
+        self.mesh = sample_mesh(
+            n_nodes=n_faces + 2,
+            n_edges=n_faces * 2,
+            n_faces=n_faces,
+            lazy_values=lazy,
+        )
+
+        super().setup(n_faces)
+
+    def create(self):
+        return ugrid.MeshCoord(mesh=self.mesh, location="face", axis="x")
+
+    def time_points(self, n_faces):
+        _ = self.object.points
+
+    def time_bounds(self, n_faces):
+        _ = self.object.bounds
+
+
+@disable_repeat_between_setup
+class MeshCoordLazy(MeshCoord):
+    """Lazy equivalent of :class:`MeshCoord`."""
+
+    def setup(self, n_faces, lazy=True):
+        super().setup(n_faces, lazy=lazy)
diff --git a/benchmarks/benchmarks/generate_data/__init__.py b/benchmarks/benchmarks/generate_data/__init__.py
index a56f2e4623..125e2e1b53 100644
--- a/benchmarks/benchmarks/generate_data/__init__.py
+++ b/benchmarks/benchmarks/generate_data/__init__.py
@@ -16,11 +16,18 @@
 benchmark sequence runs over two different Python versions.
 
 """
+from contextlib import contextmanager
 from inspect import getsource
 from os import environ
 from pathlib import Path
 from subprocess import CalledProcessError, check_output, run
 from textwrap import dedent
+from typing import Iterable
+
+from iris import load_cube as iris_loadcube
+from iris._lazy_data import as_concrete_data
+from iris.experimental.ugrid import PARSE_UGRID_ON_LOAD
+from iris.fileformats import netcdf
 
 #: Python executable used by :func:`run_function_elsewhere`, set via env
 #: variable of same name. Must be path of Python within an environment that
@@ -92,3 +99,99 @@ def run_function_elsewhere(func_to_run, *args, **kwargs):
         [DATA_GEN_PYTHON, "-c", python_string], capture_output=True, check=True
     )
     return result.stdout
+
+
+def generate_cube_like_2d_cubesphere(
+    n_cube: int, with_mesh: bool, output_path: str
+):
+    """
+    Construct and save to file an LFRic cubesphere-like cube for a given
+    cubesphere size, *or* a simpler structured (UM-like) cube of equivalent
+    size.
+
+    NOTE: this function is *NEVER* called from within this actual package.
+    Instead, it is to be called via benchmarks.remote_data_generation,
+    so that it can use up-to-date facilities, independent of the ASV controlled
+    environment which contains the "Iris commit under test".
+    This means:
+    * it must be completely self-contained: i.e. it includes all its
+      own imports, and saves results to an output file.
+
+    """
+    from iris import save
+    from iris.tests.stock.mesh import sample_mesh, sample_mesh_cube
+
+    n_face_nodes = n_cube * n_cube
+    n_faces = 6 * n_face_nodes
+
+    # Set n_nodes=n_faces and n_edges=2*n_faces
+    #  : Not exact, but similar to a 'real' cubesphere.
+    n_nodes = n_faces
+    n_edges = 2 * n_faces
+    if with_mesh:
+        mesh = sample_mesh(
+            n_nodes=n_nodes, n_faces=n_faces, n_edges=n_edges, lazy_values=True
+        )
+        cube = sample_mesh_cube(mesh=mesh, n_z=1)
+    else:
+        cube = sample_mesh_cube(nomesh_faces=n_faces, n_z=1)
+
+    # Strip off the 'extra' aux-coord mapping the mesh, which sample-cube adds
+    #  but which we don't want.
+    cube.remove_coord("mesh_face_aux")
+
+    # Save the result to a named file.
+    save(cube, output_path)
+
+
+def make_cube_like_2d_cubesphere(n_cube: int, with_mesh: bool):
+    """
+    Generate an LFRic cubesphere-like cube for a given cubesphere size,
+    *or* a simpler structured (UM-like) cube of equivalent size.
+
+    All the cube data, coords and mesh content are LAZY, and produced without
+    allocating large real arrays (to allow peak-memory testing).
+
+    NOTE: the actual cube generation is done in a stable Iris environment via
+    benchmarks.remote_data_generation, so it is all channelled via cached
+    netcdf files in our common testdata directory.
+
+    """
+    identifying_filename = (
+        f"cube_like_2d_cubesphere_C{n_cube}_Mesh={with_mesh}.nc"
+    )
+    filepath = BENCHMARK_DATA / identifying_filename
+    if not filepath.exists():
+        # Create the required testfile, by running the generation code
+        #  remotely in a 'fixed' python environment.
+        run_function_elsewhere(
+            generate_cube_like_2d_cubesphere,
+            n_cube,
+            with_mesh=with_mesh,
+            output_path=str(filepath),
+        )
+
+    # File now *should* definitely exist: content is simply the desired cube.
+    with PARSE_UGRID_ON_LOAD.context():
+        with load_realised():
+            cube = iris_loadcube(str(filepath))
+    return cube
+
+
+@contextmanager
+def load_realised():
+    """
+    Force NetCDF loading with realised arrays.
+
+    Data is passed between the data generation and benchmarking environments
+    via file loading, but some benchmarks are only meaningful if they start
+    with real (non-lazy) arrays.
+    """
+    from iris.fileformats.netcdf import _get_cf_var_data as pre_patched
+
+    def patched(cf_var, filename):
+        return as_concrete_data(pre_patched(cf_var, filename))
+
+    netcdf._get_cf_var_data = patched
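+    # NOTE: the patch is process-global for the duration of the context -
+    #  presumably fine for serial benchmarking, but not concurrency-safe.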
+    try:
+        yield netcdf
+    finally:
+        # Always un-patch, even if the enclosed block raises.
+        netcdf._get_cf_var_data = pre_patched
diff --git a/benchmarks/benchmarks/generate_data/stock.py b/benchmarks/benchmarks/generate_data/stock.py
new file mode 100644
index 0000000000..e352147fc8
--- /dev/null
+++ b/benchmarks/benchmarks/generate_data/stock.py
@@ -0,0 +1,126 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the LGPL license.
+# See COPYING and COPYING.LESSER in the root of the repository for full
+# licensing details.
+"""
+Wrappers for using :mod:`iris.tests.stock` methods for benchmarking.
+
+See :mod:`benchmarks.generate_data` for an explanation of this structure.
+"""
+import hashlib
+from pathlib import Path
+
+from iris.experimental.ugrid import PARSE_UGRID_ON_LOAD, load_mesh
+
+from . import BENCHMARK_DATA, REUSE_DATA, load_realised, run_function_elsewhere
+
+
+def _hash_args(args):
+    # Deterministic alternative to the builtin hash() - builtin str hashes
+    #  are salted per-process, which would generate different cache filenames
+    #  on every run, defeating REUSE_DATA.
+    return hashlib.sha256(str(args).encode()).hexdigest()[:16]
+
+
+def create_file__xios_2d_face_half_levels(
+    temp_file_dir, dataset_name, n_faces=866, n_times=1
+):
+    """
+    Wrapper for :func:`iris.tests.stock.netcdf.create_file__xios_2d_face_half_levels`.
+
+    Takes control of temp_file_dir, so that file locations are managed by
+    :mod:`benchmarks.generate_data`.
+
+    todo: is create_file__xios_2d_face_half_levels still appropriate now we
+     can properly save Mesh Cubes?
+
+    """
+
+    def _external(*args, **kwargs):
+        from iris.tests.stock.netcdf import (
+            create_file__xios_2d_face_half_levels,
+        )
+
+        print(create_file__xios_2d_face_half_levels(*args, **kwargs), end="")
+
+    args_list = [dataset_name, n_faces, n_times]
+    args_hash = _hash_args(args_list)
+    save_path = (
+        BENCHMARK_DATA / f"create_file__xios_2d_face_half_levels_{args_hash}"
+    ).with_suffix(".nc")
+    if not REUSE_DATA or not save_path.is_file():
+        # create_file__xios_2d_face_half_levels takes control of save location
+        #  so need to move to a more specific name that allows re-use.
+        actual_path = run_function_elsewhere(
+            _external, str(BENCHMARK_DATA), *args_list
+        )
+        Path(actual_path.decode()).replace(save_path)
+    return save_path
+
+
+def sample_mesh(n_nodes=None, n_faces=None, n_edges=None, lazy_values=False):
+    """Wrapper for :func:`iris.tests.stock.mesh.sample_mesh`."""
+
+    def _external(*args, **kwargs):
+        from iris.experimental.ugrid import save_mesh
+        from iris.tests.stock.mesh import sample_mesh
+
+        save_path_ = kwargs.pop("save_path")
+        # Always saving, so laziness is irrelevant. Use lazy to save time.
+        kwargs["lazy_values"] = True
+        new_mesh = sample_mesh(*args, **kwargs)
+        save_mesh(new_mesh, save_path_)
+
+    arg_list = [n_nodes, n_faces, n_edges]
+    args_hash = _hash_args(arg_list)
+    save_path = (BENCHMARK_DATA / f"sample_mesh_{args_hash}").with_suffix(
+        ".nc"
+    )
+    if not REUSE_DATA or not save_path.is_file():
+        _ = run_function_elsewhere(
+            _external, *arg_list, save_path=str(save_path)
+        )
+    with PARSE_UGRID_ON_LOAD.context():
+        if not lazy_values:
+            # Realise everything.
+            with load_realised():
+                mesh = load_mesh(str(save_path))
+        else:
+            mesh = load_mesh(str(save_path))
+    return mesh
+
+
+def sample_meshcoord(sample_mesh_kwargs=None, location="face", axis="x"):
+    """
+    Wrapper for :func:`iris.tests.stock.mesh.sample_meshcoord`.
+
+    Parameters deviate from the original, as a
+    :class:`iris.experimental.ugrid.Mesh` cannot be passed to the separate
+    Python instance - the Mesh must be generated there as well.
+
+    MeshCoords cannot be saved to file, so the _external method saves the
+    MeshCoord's Mesh, then the original Python instance loads in that Mesh and
+    regenerates the MeshCoord from there.
+    """
+
+    def _external(sample_mesh_kwargs_, save_path_):
+        from iris.experimental.ugrid import save_mesh
+        from iris.tests.stock.mesh import sample_mesh, sample_meshcoord
+
+        if sample_mesh_kwargs_:
+            input_mesh = sample_mesh(**sample_mesh_kwargs_)
+        else:
+            input_mesh = None
+        # Don't parse the location or axis arguments - only saving the Mesh
+        #  at this stage.
+        new_meshcoord = sample_meshcoord(mesh=input_mesh)
+        save_mesh(new_meshcoord.mesh, save_path_)
+
+    args_hash = _hash_args(sample_mesh_kwargs)
+    save_path = (
+        BENCHMARK_DATA / f"sample_mesh_coord_{args_hash}"
+    ).with_suffix(".nc")
+    if not REUSE_DATA or not save_path.is_file():
+        _ = run_function_elsewhere(
+            _external,
+            sample_mesh_kwargs_=sample_mesh_kwargs,
+            save_path_=str(save_path),
+        )
+    with PARSE_UGRID_ON_LOAD.context():
+        with load_realised():
+            source_mesh = load_mesh(str(save_path))
+    # Regenerate MeshCoord from its Mesh, which we saved.
+    return source_mesh.to_MeshCoord(location=location, axis=axis)
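+
+
+# Example use (illustrative only - the sizes are arbitrary):
+#
+#     mesh = sample_mesh(n_nodes=8, n_faces=6, n_edges=12)
+#     mesh_coord = sample_meshcoord(
+#         sample_mesh_kwargs=dict(n_nodes=8, n_faces=6, n_edges=12)
+#     )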
diff --git a/benchmarks/benchmarks/iterate.py b/benchmarks/benchmarks/iterate.py
index 20422750ef..0a5415ac2b 100644
--- a/benchmarks/benchmarks/iterate.py
+++ b/benchmarks/benchmarks/iterate.py
@@ -9,9 +9,10 @@
 """
 import numpy as np
 
-from benchmarks import ARTIFICIAL_DIM_SIZE
 from iris import coords, cube, iterate
 
+from . import ARTIFICIAL_DIM_SIZE
+
 
 def setup():
     """General variables needed by multiple benchmark classes."""
diff --git a/benchmarks/benchmarks/mixin.py b/benchmarks/benchmarks/mixin.py
index e78b150438..bec5518eee 100644
--- a/benchmarks/benchmarks/mixin.py
+++ b/benchmarks/benchmarks/mixin.py
@@ -10,10 +10,11 @@
 
 import numpy as np
 
-from benchmarks import ARTIFICIAL_DIM_SIZE
 from iris import coords
 from iris.common.metadata import AncillaryVariableMetadata
 
+from . import ARTIFICIAL_DIM_SIZE
+
 LONG_NAME = "air temperature"
 STANDARD_NAME = "air_temperature"
 VAR_NAME = "air_temp"
diff --git a/benchmarks/benchmarks/netcdf_save.py b/benchmarks/benchmarks/netcdf_save.py
new file mode 100644
index 0000000000..c7580b3c63
--- /dev/null
+++ b/benchmarks/benchmarks/netcdf_save.py
@@ -0,0 +1,61 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the LGPL license.
+# See COPYING and COPYING.LESSER in the root of the repository for full
+# licensing details.
+"""
+Cubesphere-like netcdf saving benchmarks.
+
+Where possible benchmarks should be parameterised for two sizes of input data:
+  * minimal: enables detection of regressions in parts of the run-time that do
+             NOT scale with data size.
+  * large: large enough to exclusively detect regressions in parts of the
+           run-time that scale with data size. Aim for benchmark time ~20x
+           that of the minimal benchmark.
+
+"""
+from iris import save
+from iris.experimental.ugrid import save_mesh
+
+from . import TrackAddedMemoryAllocation
+from .generate_data import make_cube_like_2d_cubesphere
+
+
+class NetcdfSave:
+    params = [[1, 600], [False, True]]
+    param_names = ["cubesphere-N", "is_unstructured"]
+
+    def setup(self, n_cubesphere, is_unstructured):
+        self.cube = make_cube_like_2d_cubesphere(
+            n_cube=n_cubesphere, with_mesh=is_unstructured
+        )
+
+    def _save_data(self, cube, do_copy=True):
+        if do_copy:
+            # Copy the cube to avoid distorting the results by modifying it,
+            #  because we know that older Iris code realises lazy coords.
+            cube = cube.copy()
+        save(cube, "tmp.nc")
+
+    def _save_mesh(self, cube):
+        # In this case, we are happy that the mesh is *not* modified.
+        save_mesh(cube.mesh, "mesh.nc")
+
+    def time_netcdf_save_cube(self, n_cubesphere, is_unstructured):
+        self._save_data(self.cube)
+
+    def time_netcdf_save_mesh(self, n_cubesphere, is_unstructured):
+        if is_unstructured:
+            self._save_mesh(self.cube)
+
+    def track_addedmem_netcdf_save(self, n_cubesphere, is_unstructured):
+        cube = self.cube.copy()  # Do this outside the tracked block.
+        with TrackAddedMemoryAllocation() as mb:
+            self._save_data(cube, do_copy=False)
+        return mb.addedmem_mb()
+
+
+# Declare a 'Mb' unit for all 'track_addedmem_..' type benchmarks
+for attr in dir(NetcdfSave):
+    if attr.startswith("track_addedmem_"):
+        getattr(NetcdfSave, attr).unit = "Mb"
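+# ('track_*' benchmarks are ASV's generic "tracking" type, which has no
+#  implicit unit - ASV reads the ``unit`` attribute when reporting results.)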
diff --git a/benchmarks/benchmarks/plot.py b/benchmarks/benchmarks/plot.py
index 24899776dc..75195c86e9 100644
--- a/benchmarks/benchmarks/plot.py
+++ b/benchmarks/benchmarks/plot.py
@@ -10,9 +10,10 @@
 import matplotlib
 import numpy as np
 
-from benchmarks import ARTIFICIAL_DIM_SIZE
 from iris import coords, cube, plot
 
+from . import ARTIFICIAL_DIM_SIZE
+
 matplotlib.use("agg")
diff --git a/benchmarks/benchmarks/regions_combine.py b/benchmarks/benchmarks/regions_combine.py
new file mode 100644
index 0000000000..a99dc57263
--- /dev/null
+++ b/benchmarks/benchmarks/regions_combine.py
@@ -0,0 +1,268 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the LGPL license.
+# See COPYING and COPYING.LESSER in the root of the repository for full
+# licensing details.
+"""
+Benchmarks stages of operation of the function
+:func:`iris.experimental.ugrid.utils.recombine_submeshes`.
+
+Where possible benchmarks should be parameterised for two sizes of input data:
+  * minimal: enables detection of regressions in parts of the run-time that do
+             NOT scale with data size.
+  * large: large enough to exclusively detect regressions in parts of the
+           run-time that scale with data size. Aim for benchmark time ~20x
+           that of the minimal benchmark.
+
+"""
+import os
+
+import dask.array as da
+import numpy as np
+
+from iris import load, load_cube, save
+from iris.experimental.ugrid import PARSE_UGRID_ON_LOAD
+from iris.experimental.ugrid.utils import recombine_submeshes
+
+from . import TrackAddedMemoryAllocation
+from .generate_data import make_cube_like_2d_cubesphere
+
+
+class MixinCombineRegions:
+    # Characterise time taken + memory-allocated, for various stages of
+    #  combine operations on cubesphere-like test data.
+    params = [4, 500]
+    param_names = ["cubesphere-N"]
+
+    def _parametrised_cache_filename(self, n_cubesphere, content_name):
+        return f"cube_C{n_cubesphere}_{content_name}.nc"
+
+    def _make_region_cubes(self, full_mesh_cube):
+        """Make a fixed number of region cubes from a full meshcube."""
+        # Divide the cube into regions.
+        n_faces = full_mesh_cube.shape[-1]
+        # Start with a simple list of face indices
+        # first round up to a multiple of 5
+        n_faces_5s = 5 * ((n_faces + 4) // 5)
+        i_faces = np.arange(n_faces_5s, dtype=int)
+        # reshape (5N,) to (N, 5)
+        i_faces = i_faces.reshape((n_faces_5s // 5, 5))
+        # reorder [2, 3, 4, 0, 1] within each block of 5
+        i_faces = np.concatenate([i_faces[:, 2:], i_faces[:, :2]], axis=1)
+        # flatten to get [2 3 4 0 1 (-) 8 9 10 6 7 (-) 13 14 15 11 12 ...]
+        i_faces = i_faces.flatten()
+        # reduce back to original length, wrap any overflows into valid range
+        i_faces = i_faces[:n_faces] % n_faces
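+        # For example, with n_faces=10 the steps above give:
+        #   [0 1 2 3 4 5 6 7 8 9]
+        #   -> blocks [[0 1 2 3 4], [5 6 7 8 9]]
+        #   -> reordered [[2 3 4 0 1], [7 8 9 5 6]]
+        #   -> flattened + trimmed [2 3 4 0 1 7 8 9 5 6]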
+
+        # Divide into regions - always slightly uneven, since 7 does not
+        #  divide the face count evenly.
+        n_regions = 7
+        n_facesperregion = n_faces // n_regions
+        i_face_regions = (i_faces // n_facesperregion) % n_regions
+        region_inds = [
+            np.where(i_face_regions == i_region)[0]
+            for i_region in range(n_regions)
+        ]
+        # NOTE: this produces 7 regions, with near-adjacent value ranges but
+        #  with some points "moved" to an adjacent region.
+        # Also, region-0 is bigger (because n_faces does not divide evenly
+        #  by 7).
+
+        # Finally, make region cubes with these indices.
+        region_cubes = [full_mesh_cube[..., inds] for inds in region_inds]
+        return region_cubes
+
+    def setup_cache(self):
+        """Cache all the necessary source data on disk."""
+
+        # Control dask, to minimise memory usage + allow largest data.
+        self.fix_dask_settings()
+
+        for n_cubesphere in self.params:
+            # Do for each parameter, since "setup_cache" is NOT parametrised
+            mesh_cube = make_cube_like_2d_cubesphere(
+                n_cube=n_cubesphere, with_mesh=True
+            )
+            # Save to files which include the parameter in the names.
+            save(
+                mesh_cube,
+                self._parametrised_cache_filename(n_cubesphere, "meshcube"),
+            )
+            region_cubes = self._make_region_cubes(mesh_cube)
+            save(
+                region_cubes,
+                self._parametrised_cache_filename(n_cubesphere, "regioncubes"),
+            )
+
+    def setup(
+        self, n_cubesphere, imaginary_data=True, create_result_cube=True
+    ):
+        """
+        The combine-tests "standard" setup operation.
+
+        Load the source cubes (full-mesh + region) from disk.
+        These are specific to the cubesphere-N parameter.
+        The data is cached on disk rather than calculated, to avoid
+        pre-allocating process memory before the benchmark runs.
+
+        If 'imaginary_data' is set (default), the region cubes' data is
+        replaced with lazy data in the form of a da.zeros().  Otherwise, the
+        region data is lazy data from the files.
+
+        If 'create_result_cube' is set, create "self.recombined_cube"
+        containing the (still lazy) result.
+
+        NOTE: various test classes override + extend this.
+
+        """
+
+        # Load source cubes (full-mesh and regions)
+        with PARSE_UGRID_ON_LOAD.context():
+            self.full_mesh_cube = load_cube(
+                self._parametrised_cache_filename(n_cubesphere, "meshcube")
+            )
+            self.region_cubes = load(
+                self._parametrised_cache_filename(n_cubesphere, "regioncubes")
+            )
+
+        # Remove all var-names from loaded cubes, which can otherwise cause
+        #  problems.  Also implement 'imaginary' data.
+        for cube in self.region_cubes + [self.full_mesh_cube]:
+            cube.var_name = None
+            for coord in cube.coords():
+                coord.var_name = None
+            if imaginary_data:
+                # Replace cube data (lazy file data) with 'imaginary' data.
+                #  This has the same lazy-array attributes, but is allocated
+                #  by creating chunks on demand instead of loading from file.
+                data = cube.lazy_data()
+                data = da.zeros(
+                    data.shape, dtype=data.dtype, chunks=data.chunksize
+                )
+                cube.data = data
+
+        if create_result_cube:
+            self.recombined_cube = self.recombine()
+
+        # Fix dask usage mode for all the subsequent performance tests.
+        self.fix_dask_settings()
+
+    def fix_dask_settings(self):
+        """
+        Fix "standard" dask behaviour for time+space testing.
+
+        Currently this is single-threaded mode, with a known chunksize,
+        which is optimised to minimise memory usage so we can test the
+        largest data.
+
+        """
+
+        import dask.config as dcfg
+
+        # Use single-threaded, to avoid process-switching costs and minimise
+        #  memory usage.
+        # N.B. generally may be slower, but use less memory ?
+        dcfg.set(scheduler="single-threaded")
+        # Configure iris._lazy_data.as_lazy_data to aim for 128 MiB chunks.
+        dcfg.set({"array.chunk-size": "128MiB"})
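+        # NOTE: dask.config.set is deliberately used directly, not as a
+        #  context manager, so these settings persist for the whole process -
+        #  i.e. for all the subsequent benchmark timings.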
+
+    def recombine(self):
+        # A handy general shorthand for the main "combine" operation.
+        result = recombine_submeshes(
+            self.full_mesh_cube,
+            self.region_cubes,
+            index_coord_name="i_mesh_face",
+        )
+        return result
+
+
+class CombineRegionsCreateCube(MixinCombineRegions):
+    """
+    Time+memory costs of creating a combined-regions cube.
+
+    The result is lazy, and we don't do the actual calculation.
+
+    """
+
+    def setup(self, n_cubesphere):
+        # In this case only, do *not* create the result cube.
+        #  That is the operation we want to test.
+        super().setup(n_cubesphere, create_result_cube=False)
+
+    def time_create_combined_cube(self, n_cubesphere):
+        self.recombine()
+
+    def track_addedmem_create_combined_cube(self, n_cubesphere):
+        with TrackAddedMemoryAllocation() as mb:
+            self.recombine()
+        return mb.addedmem_mb()
+
+
+CombineRegionsCreateCube.track_addedmem_create_combined_cube.unit = "Mb"
+
+
+class CombineRegionsComputeRealData(MixinCombineRegions):
+    """
+    Time+memory costs of computing combined-regions data.
+
+    """
+
+    def time_compute_data(self, n_cubesphere):
+        self.recombined_cube.data
+
+    def track_addedmem_compute_data(self, n_cubesphere):
+        with TrackAddedMemoryAllocation() as mb:
+            self.recombined_cube.data
+        return mb.addedmem_mb()
+
+
+CombineRegionsComputeRealData.track_addedmem_compute_data.unit = "Mb"
+
+
+class CombineRegionsSaveData(MixinCombineRegions):
+    """
+    Test saving *only*, having replaced the input cube data with 'imaginary'
+    array data, so that input data is not loaded from disk during the save
+    operation.
+
+    """
+
+    def time_save(self, n_cubesphere):
+        # Save to disk, which must compute data + stream it to file.
+        save(self.recombined_cube, "tmp.nc")
+
+    def track_addedmem_save(self, n_cubesphere):
+        with TrackAddedMemoryAllocation() as mb:
+            save(self.recombined_cube, "tmp.nc")
+        return mb.addedmem_mb()
+
+    def track_filesize_saved(self, n_cubesphere):
+        save(self.recombined_cube, "tmp.nc")
+        return os.path.getsize("tmp.nc") * 1.0e-6
+
+
+CombineRegionsSaveData.track_addedmem_save.unit = "Mb"
+CombineRegionsSaveData.track_filesize_saved.unit = "Mb"
+
+
+class CombineRegionsFileStreamedCalc(MixinCombineRegions):
+    """
+    Test the whole cost of file-to-file streaming.
+
+    Uses the combined cube which is based on lazy data loading from the
+    region cubes on disk.
+    """
+
+    def setup(self, n_cubesphere):
+        # In this case only, do *not* replace the loaded regions data with
+        #  'imaginary' data, as we want to test file-to-file calculation+save.
+        super().setup(n_cubesphere, imaginary_data=False)
+
+    def time_stream_file2file(self, n_cubesphere):
+        # Save to disk, which must compute data + stream it to file.
+        save(self.recombined_cube, "tmp.nc")
+
+    def track_addedmem_stream_file2file(self, n_cubesphere):
+        with TrackAddedMemoryAllocation() as mb:
+            save(self.recombined_cube, "tmp.nc")
+        return mb.addedmem_mb()
+
+
+CombineRegionsFileStreamedCalc.track_addedmem_stream_file2file.unit = "Mb"
diff --git a/benchmarks/benchmarks/regridding.py b/benchmarks/benchmarks/regridding.py
index 6db33aa192..c315119c11 100644
--- a/benchmarks/benchmarks/regridding.py
+++ b/benchmarks/benchmarks/regridding.py
@@ -25,16 +25,31 @@ def setup(self) -> None:
         )
         self.cube = iris.load_cube(cube_file_path)
 
+        # Prepare a tougher cube and chunk it
+        chunked_cube_file_path = tests.get_data_path(
+            ["NetCDF", "regrid", "regrid_xyt.nc"]
+        )
+        self.chunked_cube = iris.load_cube(chunked_cube_file_path)
+        # (Assuming the same (t, y, x) layout as self.cube below.)
+        self.chunked_cube.data = self.chunked_cube.lazy_data().rechunk(
+            (1, -1, -1)
+        )
+
+        # Chunked data makes the regridder run repeatedly
+        self.cube.data = self.cube.lazy_data().rechunk((1, -1, -1))
+
         template_file_path = tests.get_data_path(
             ["NetCDF", "regrid", "regrid_template_global_latlon.nc"]
         )
         self.template_cube = iris.load_cube(template_file_path)
 
-        # Chunked data makes the regridder run repeatedly
-        self.cube.data = self.cube.lazy_data().rechunk((1, -1, -1))
+        # Prepare a regridding scheme for re-use across benchmarks.
+        self.scheme_area_w = AreaWeighted()
 
     def time_regrid_area_w(self) -> None:
         # Regrid the cube onto the template.
-        out = self.cube.regrid(self.template_cube, AreaWeighted())
+        out = self.cube.regrid(self.template_cube, self.scheme_area_w)
         # Realise the data
         out.data
+
+    def time_regrid_area_w_new_grid(self) -> None:
+        # Regrid the chunked cube
+        out = self.chunked_cube.regrid(self.template_cube, self.scheme_area_w)
+        # Realise the data
+        out.data
diff --git a/benchmarks/benchmarks/ugrid_load.py b/benchmarks/benchmarks/ugrid_load.py
new file mode 100644
index 0000000000..352450dcec
--- /dev/null
+++ b/benchmarks/benchmarks/ugrid_load.py
@@ -0,0 +1,128 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the LGPL license.
+# See COPYING and COPYING.LESSER in the root of the repository for full
+# licensing details.
+"""
+Mesh data loading benchmark tests.
+
+Where possible benchmarks should be parameterised for two sizes of input data:
+  * minimal: enables detection of regressions in parts of the run-time that do
+             NOT scale with data size.
+  * large: large enough to exclusively detect regressions in parts of the
+           run-time that scale with data size. Aim for benchmark time ~20x
+           that of the minimal benchmark.
+
+"""
+
+from iris import load_cube as iris_load_cube
+from iris.experimental.ugrid import PARSE_UGRID_ON_LOAD
+from iris.experimental.ugrid import load_mesh as iris_load_mesh
+
+from .generate_data.stock import create_file__xios_2d_face_half_levels
+
+
+def synthetic_data(**kwargs):
+    # Ensure all uses of the synthetic data function use the common directory.
+    # File location is controlled by :mod:`generate_data`, hence
+    #  temp_file_dir=None.
+    return create_file__xios_2d_face_half_levels(temp_file_dir=None, **kwargs)
+
+
+def load_cube(*args, **kwargs):
+    with PARSE_UGRID_ON_LOAD.context():
+        return iris_load_cube(*args, **kwargs)
+
+
+def load_mesh(*args, **kwargs):
+    with PARSE_UGRID_ON_LOAD.context():
+        return iris_load_mesh(*args, **kwargs)
+
+
+class BasicLoading:
+    params = [1, int(4.1e6)]
+    param_names = ["number of faces"]
+
+    def setup_common(self, **kwargs):
+        self.data_path = synthetic_data(**kwargs)
+
+    def setup(self, *args):
+        self.setup_common(dataset_name="Loading", n_faces=args[0])
+
+    def time_load_file(self, *args):
+        _ = load_cube(str(self.data_path))
+
+    def time_load_mesh(self, *args):
+        _ = load_mesh(str(self.data_path))
+
+
+class BasicLoadingTime(BasicLoading):
+    """Same as BasicLoading, but scaling over a time series - an unlimited dimension."""
+
+    param_names = ["number of time steps"]
+
+    def setup(self, *args):
+        self.setup_common(dataset_name="Loading", n_faces=1, n_times=args[0])
+
+
+class DataRealisation:
+    # Prevent repeat runs between setup() runs - data won't be lazy after 1st.
+    number = 1
+    # Compensate for reduced certainty by increasing number of repeats.
+    repeat = (10, 10, 10.0)
+    # Prevent ASV running its warmup, which ignores `number` and would
+    #  therefore get a false idea of typical run time since the data would
+    #  stop being lazy.
+    warmup_time = 0.0
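+    # (The three attributes above mirror what the
+    #  benchmarks.disable_repeat_between_setup decorator applies, just with a
+    #  higher repeat count.)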
+    timeout = 300.0
+
+    params = [1, int(4e6)]
+    param_names = ["number of faces"]
+
+    def setup_common(self, **kwargs):
+        data_path = synthetic_data(**kwargs)
+        self.cube = load_cube(str(data_path))
+
+    def setup(self, *args):
+        self.setup_common(dataset_name="Realisation", n_faces=args[0])
+
+    def time_realise_data(self, *args):
+        assert self.cube.has_lazy_data()
+        _ = self.cube.data[0]
+
+
+class DataRealisationTime(DataRealisation):
+    """Same as DataRealisation, but scaling over a time series - an unlimited dimension."""
+
+    param_names = ["number of time steps"]
+
+    def setup(self, *args):
+        self.setup_common(
+            dataset_name="Realisation", n_faces=1, n_times=args[0]
+        )
+
+
+class Callback:
+    params = [1, int(4.5e6)]
+    param_names = ["number of faces"]
+
+    def setup_common(self, **kwargs):
+        def callback(cube, field, filename):
+            return cube[::2]
+
+        self.data_path = synthetic_data(**kwargs)
+        self.callback = callback
+
+    def setup(self, *args):
+        self.setup_common(dataset_name="Loading", n_faces=args[0])
+
+    def time_load_file_callback(self, *args):
+        _ = load_cube(str(self.data_path), callback=self.callback)
+
+
+class CallbackTime(Callback):
+    """Same as Callback, but scaling over a time series - an unlimited dimension."""
+
+    param_names = ["number of time steps"]
+
+    def setup(self, *args):
+        self.setup_common(dataset_name="Loading", n_faces=1, n_times=args[0])