From aa0d52a3f09f42617723b082459aa6b9908bc2c3 Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Mon, 16 May 2022 18:06:24 +0100
Subject: [PATCH 1/9] CSPerf fixes.

---
 noxfile.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/noxfile.py b/noxfile.py
index 00a866f814..1bc108d328 100755
--- a/noxfile.py
+++ b/noxfile.py
@@ -9,6 +9,7 @@
 import hashlib
 import os
 from pathlib import Path
+import re
 from tempfile import NamedTemporaryFile
 from typing import Literal

@@ -314,7 +315,7 @@ def benchmarks(
     ----------
     session: object
         A `nox.sessions.Session` object.
-    run_type: {"overnight", "branch", "custom"}
+    run_type: {"overnight", "branch", "cperf", "sperf", "custom"}
         * ``overnight``: benchmarks all commits between the input **first
           commit** to ``HEAD``, comparing each to its parent for performance
           shifts. If a commit causes shifts, the output is saved to a file:
@@ -501,6 +502,11 @@ def asv_compare(*commits):
         asv_command = (
             asv_harness.format(posargs=commit_range) + f" --bench={run_type}"
         )
+        # C/SPerf benchmarks are much bigger than the CI ones:
+        # Don't fail the whole run if memory blows up on one benchmark.
+        asv_command = asv_command.replace(" --strict", "")
+        # Only do a single round.
+        asv_command = re.sub(r"rounds=\d+", "rounds=1", asv_command)
         session.run(*asv_command.split(" "), *asv_args)

         asv_command = f"asv publish {commit_range} --html-dir={publish_subdir}"
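The two string edits above are easy to get subtly wrong: `--strict` must be removed together with its leading space, and the `rounds` attribute is rewritten with a regex (`\d+` rather than `\d`, so that multi-digit round counts such as `rounds=10` also collapse to a single round). A minimal sketch of the same manipulation, using a hypothetical command string in place of the real `asv_harness` template:

    import re

    # Hypothetical ASV command string - the real one comes from asv_harness.
    asv_command = "asv run 1a2b3c..HEAD --attribute rounds=4 --strict --bench=sperf"

    # Don't fail the whole run when a single large benchmark fails.
    asv_command = asv_command.replace(" --strict", "")
    # Collapse any round count, however many digits, to a single round.
    asv_command = re.sub(r"rounds=\d+", "rounds=1", asv_command)

    print(asv_command)
    # -> asv run 1a2b3c..HEAD --attribute rounds=1 --bench=sperf
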
From 81e61602465fb35d1da6300cdf5e1deec9fd2b22 Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Tue, 17 May 2022 13:49:10 +0100
Subject: [PATCH 2/9] Correct argument hashing in file names.

---
 benchmarks/benchmarks/generate_data/stock.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmarks/generate_data/stock.py b/benchmarks/benchmarks/generate_data/stock.py
index bbc7dc0a63..eaf46bb405 100644
--- a/benchmarks/benchmarks/generate_data/stock.py
+++ b/benchmarks/benchmarks/generate_data/stock.py
@@ -9,6 +9,8 @@
 See :mod:`benchmarks.generate_data` for an explanation of this structure.
 """

+from hashlib import sha256
+import json
 from pathlib import Path

 from iris.experimental.ugrid import PARSE_UGRID_ON_LOAD, load_mesh
@@ -16,6 +18,14 @@
 from . import BENCHMARK_DATA, REUSE_DATA, load_realised, run_function_elsewhere


+def hash_args(*args, **kwargs):
+    """Convert arguments into a short hash - for preserving args in filenames."""
+    arg_string = str(args)
+    kwarg_string = json.dumps(kwargs)
+    full_string = arg_string + kwarg_string
+    return sha256(full_string.encode()).hexdigest()[:10]
+
+
 def _create_file__xios_common(func_name, **kwargs):
     def _external(func_name_, temp_file_dir, **kwargs_):
         from iris.tests.stock import netcdf
@@ -23,7 +33,7 @@ def _external(func_name_, temp_file_dir, **kwargs_):
         func = getattr(netcdf, func_name_)
         print(func(temp_file_dir, **kwargs_), end="")

-    args_hash = hash(str(kwargs))
+    args_hash = hash_args(**kwargs)
     save_path = (BENCHMARK_DATA / f"{func_name}_{args_hash}").with_suffix(
         ".nc"
     )
@@ -95,7 +105,7 @@ def _external(*args, **kwargs):
         save_mesh(new_mesh, save_path_)

     arg_list = [n_nodes, n_faces, n_edges]
-    args_hash = hash(str(arg_list))
+    args_hash = hash_args(*arg_list)
     save_path = (BENCHMARK_DATA / f"sample_mesh_{args_hash}").with_suffix(
         ".nc"
     )
@@ -139,7 +149,7 @@ def _external(sample_mesh_kwargs_, save_path_):
         new_meshcoord = sample_meshcoord(mesh=input_mesh)
         save_mesh(new_meshcoord.mesh, save_path_)

-    args_hash = hash(str(sample_mesh_kwargs))
+    args_hash = hash_args(**sample_mesh_kwargs)
     save_path = (
         BENCHMARK_DATA / f"sample_mesh_coord_{args_hash}"
     ).with_suffix(".nc")

From 09bfbec8e22511f539abf9e3e73a14eabff25fcb Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Tue, 17 May 2022 15:08:13 +0100
Subject: [PATCH 3/9] Warn if BENCHMARK_DATA is not set.

---
 benchmarks/benchmarks/generate_data/__init__.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/benchmarks/benchmarks/generate_data/__init__.py b/benchmarks/benchmarks/generate_data/__init__.py
index 8874a2c214..78b971d9de 100644
--- a/benchmarks/benchmarks/generate_data/__init__.py
+++ b/benchmarks/benchmarks/generate_data/__init__.py
@@ -22,6 +22,7 @@
 from pathlib import Path
 from subprocess import CalledProcessError, check_output, run
 from textwrap import dedent
+from warnings import warn

 from iris._lazy_data import as_concrete_data
 from iris.fileformats import netcdf
@@ -47,6 +48,11 @@
 BENCHMARK_DATA = Path(environ.get("BENCHMARK_DATA", default_data_dir))
 if BENCHMARK_DATA == default_data_dir:
     BENCHMARK_DATA.mkdir(exist_ok=True)
+    message = (
+        f"No BENCHMARK_DATA env var, defaulting to {BENCHMARK_DATA}. "
+        "Note that some benchmark files are several GB in size."
+    )
+    warn(message)
 elif not BENCHMARK_DATA.is_dir():
     message = f"Not a directory: {BENCHMARK_DATA} ."
     raise ValueError(message)
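The switch away from the built-in `hash` in patch 2 is worth dwelling on: Python salts string hashing per interpreter process (`PYTHONHASHSEED`), so `hash(str(kwargs))` yields a different filename on every benchmark run and previously cached data files are never found again; `sha256` is process-independent. A short demonstration, reusing the function exactly as added above (the arguments are hypothetical):

    from hashlib import sha256
    import json

    def hash_args(*args, **kwargs):
        """Convert arguments into a short hash - for preserving args in filenames."""
        arg_string = str(args)
        kwarg_string = json.dumps(kwargs)
        full_string = arg_string + kwarg_string
        return sha256(full_string.encode()).hexdigest()[:10]

    # The same arguments give the same 10-character digest in every process,
    # so a file saved on one run can be found and reused on the next.
    assert hash_args(100, 200, n_levels=3) == hash_args(100, 200, n_levels=3)
    print(hash_args(100, 200, n_levels=3))
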
From 546219b7c535c706595aa7f72efad062ab4c1c7c Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Tue, 17 May 2022 16:17:23 +0100
Subject: [PATCH 4/9] SPerf combine_regions better file handling.

---
 .../benchmarks/sperf/combine_regions.py | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/benchmarks/benchmarks/sperf/combine_regions.py b/benchmarks/benchmarks/sperf/combine_regions.py
index fd2c95c7fc..8c9b480312 100644
--- a/benchmarks/benchmarks/sperf/combine_regions.py
+++ b/benchmarks/benchmarks/sperf/combine_regions.py
@@ -16,20 +16,21 @@
 from iris.experimental.ugrid.utils import recombine_submeshes

 from .. import TrackAddedMemoryAllocation, on_demand_benchmark
-from ..generate_data.ugrid import make_cube_like_2d_cubesphere
+from ..generate_data.ugrid import BENCHMARK_DATA, make_cube_like_2d_cubesphere


 class Mixin:
     # Characterise time taken + memory-allocated, for various stages of combine
     # operations on cubesphere-like test data.
-    timeout = 180.0
+    timeout = 300.0
     params = [100, 200, 300, 500, 1000, 1668]
     param_names = ["cubesphere_C"]
     # Fix result units for the tracking benchmarks.
     unit = "Mb"
+    temp_save_path = BENCHMARK_DATA / "tmp.nc"

     def _parametrised_cache_filename(self, n_cubesphere, content_name):
-        return f"cube_C{n_cubesphere}_{content_name}.nc"
+        return BENCHMARK_DATA / f"cube_C{n_cubesphere}_{content_name}.nc"

     def _make_region_cubes(self, full_mesh_cube):
         """Make a fixed number of region cubes from a full meshcube."""
@@ -139,6 +140,9 @@ def setup(
         # Fix dask usage mode for all the subsequent performance tests.
         self.fix_dask_settings()

+    def teardown(self, _):
+        self.temp_save_path.unlink(missing_ok=True)
+
     def fix_dask_settings(self):
         """
         Fix "standard" dask behaviour for time+space testing.
@@ -165,6 +169,9 @@ def recombine(self):
         )
         return result

+    def save(self):
+        save(self.recombined_cube, self.temp_save_path)
+

 @on_demand_benchmark
 class CreateCube(Mixin):
@@ -215,15 +222,15 @@ class SaveData(Mixin):

     def time_save(self, n_cubesphere):
         # Save to disk, which must compute data + stream it to file.
-        save(self.recombined_cube, "tmp.nc")
+        self.save()

     @TrackAddedMemoryAllocation.decorator()
     def track_addedmem_save(self, n_cubesphere):
-        save(self.recombined_cube, "tmp.nc")
+        self.save()

     def track_filesize_saved(self, n_cubesphere):
-        save(self.recombined_cube, "tmp.nc")
-        return os.path.getsize("tmp.nc") * 1.0e-6
+        self.save()
+        return self.temp_save_path.stat().st_size * 1.0e-6


 @on_demand_benchmark
@@ -243,8 +250,8 @@ def setup(

     def time_stream_file2file(self, n_cubesphere):
         # Save to disk, which must compute data + stream it to file.
-        save(self.recombined_cube, "tmp.nc")
+        self.save()

     @TrackAddedMemoryAllocation.decorator()
     def track_addedmem_stream_file2file(self, n_cubesphere):
-        save(self.recombined_cube, "tmp.nc")
+        self.save()

From 232bec2f3e3174c2216ed8ea564066fd076a6dee Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Tue, 17 May 2022 18:25:57 +0100
Subject: [PATCH 5/9] SPerf don't store FileMixin files long term - too much
 space.

---
 benchmarks/benchmarks/sperf/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/benchmarks/sperf/__init__.py b/benchmarks/benchmarks/sperf/__init__.py
index 696c8ef4df..3ba72cb61a 100644
--- a/benchmarks/benchmarks/sperf/__init__.py
+++ b/benchmarks/benchmarks/sperf/__init__.py
@@ -34,6 +34,11 @@ def setup(self, c_size, n_levels, n_times):
             c_size=c_size, n_levels=n_levels, n_times=n_times
         )

+    def teardown(self, _):
+        # All the params together could fill >4TB.
+        # Better to take longer to set up and use less file space.
+        self.file_path.unlink(missing_ok=True)
+
     def load_cube(self):
         with PARSE_UGRID_ON_LOAD.context():
             return load_cube(str(self.file_path))
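Patches 4 and 5 both lean on ASV's fixture protocol: for each combination in `params`, ASV calls `setup` before and `teardown` after the benchmark, passing both the same parameter values, which makes `teardown` the natural place to delete per-run scratch files. A minimal sketch of the pattern, with illustrative names rather than the real suite's classes:

    from pathlib import Path

    class FileCleanupSketch:
        # ASV benchmarks this class once per entry in params, calling
        # setup(param) beforehand and teardown(param) afterwards.
        params = [1, 3, 10]
        param_names = ["n_time_steps"]

        def setup(self, n_time_steps):
            # Hypothetical scratch file; the real suite writes under BENCHMARK_DATA.
            self.file_path = Path(f"scratch_{n_time_steps}.nc")
            self.file_path.touch()

        def teardown(self, _):
            # Delete the scratch file so large parametrisations don't
            # accumulate on disk between runs.
            self.file_path.unlink(missing_ok=True)

        def time_file_check(self, n_time_steps):
            self.file_path.exists()
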
From 1bb2228da78c1cd6c60659baf784d735dc30652c Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Thu, 19 May 2022 11:47:05 +0100
Subject: [PATCH 6/9] More realistic SPerf file sizes, allow larger-than-memory
 files.

---
 benchmarks/README.md                    |  4 +++-
 benchmarks/benchmarks/sperf/__init__.py | 10 ++++------
 benchmarks/benchmarks/sperf/load.py     |  3 ---
 lib/iris/tests/stock/netcdf.py          |  8 ++++++--
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 6ea53c3ae8..8dffd473f3 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -30,7 +30,9 @@ used to generate benchmark test objects/files; see
 but will defer to any value already set in the shell.
 * `BENCHMARK_DATA` - optional - path to a directory for benchmark synthetic
 test data, which the benchmark scripts will create if it doesn't already
-exist. Defaults to `/benchmarks/.data/` if not set.
+exist. Defaults to `/benchmarks/.data/` if not set. Note that some of
+the generated files, especially in the 'SPerf' suite, are many GB in size, so
+plan disk space accordingly.
 * `ON_DEMAND_BENCHMARKS` - optional - when set (to any value): benchmarks
 decorated with `@on_demand_benchmark` are included in the ASV run. Usually
 coupled with the ASV `--bench` argument to only run the benchmark(s) of

diff --git a/benchmarks/benchmarks/sperf/__init__.py b/benchmarks/benchmarks/sperf/__init__.py
index 3ba72cb61a..77ee25b1ca 100644
--- a/benchmarks/benchmarks/sperf/__init__.py
+++ b/benchmarks/benchmarks/sperf/__init__.py
@@ -20,10 +20,13 @@
 class FileMixin:
     """For use in any benchmark classes that work on a file."""

+    # Allows time for large file generation.
+    timeout = 3600.0
+    # Largest file with these params: ~90GB.
     params = [
         [12, 384, 640, 960, 1280, 1668],
         [1, 36, 72],
-        [1, 3, 36, 72],
+        [1, 3, 10],
     ]
     param_names = ["cubesphere_C", "N levels", "N time steps"]
     # cubesphere_C: notation refers to faces per panel.
@@ -34,11 +37,6 @@ def setup(self, c_size, n_levels, n_times):
         self.file_path = make_cubesphere_testfile(
             c_size=c_size, n_levels=n_levels, n_times=n_times
         )

-    def teardown(self, _):
-        # All the params together could fill >4TB.
-        # Better to take longer to set up and use less file space.
-        self.file_path.unlink(missing_ok=True)
-
     def load_cube(self):
         with PARSE_UGRID_ON_LOAD.context():
             return load_cube(str(self.file_path))

diff --git a/benchmarks/benchmarks/sperf/load.py b/benchmarks/benchmarks/sperf/load.py
index c1d1db43a9..6a60355976 100644
--- a/benchmarks/benchmarks/sperf/load.py
+++ b/benchmarks/benchmarks/sperf/load.py
@@ -18,9 +18,6 @@ def time_load_cube(self, _, __, ___):

 @on_demand_benchmark
 class Realise(FileMixin):
-    # The larger files take a long time to realise.
-    timeout = 600.0
-
     def setup(self, c_size, n_levels, n_times):
         super().setup(c_size, n_levels, n_times)
         self.loaded_cube = self.load_cube()

diff --git a/lib/iris/tests/stock/netcdf.py b/lib/iris/tests/stock/netcdf.py
index 030e90a0f3..4e12850ef1 100644
--- a/lib/iris/tests/stock/netcdf.py
+++ b/lib/iris/tests/stock/netcdf.py
@@ -9,6 +9,8 @@
 from string import Template
 import subprocess

+import dask
+from dask import array as da
 import netCDF4
 import numpy as np

@@ -79,11 +81,13 @@ def _add_standard_data(nc_path, unlimited_dim_size=0):
             # so it can be a dim-coord.
             data_size = np.prod(shape)
             data = np.arange(1, data_size + 1, dtype=var.dtype).reshape(shape)
+            var[:] = data
         else:
             # Fill with a plain value. But avoid zeros, so we can simulate
             # valid ugrid connectivities even when start_index=1.
-            data = np.ones(shape, dtype=var.dtype)  # Do not use zero
-        var[:] = data
+            with dask.config.set({"array.chunk-size": "2048MiB"}):
+                data = da.ones(shape, dtype=var.dtype)  # Do not use zero
+            da.store(data, var)

     ds.close()
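The `da.ones` + `da.store` change is what actually allows larger-than-memory files: `np.ones` materialises the whole array in RAM before the assignment, whereas dask builds a lazy array and `da.store` computes and writes it chunk by chunk. A self-contained sketch of the pattern, with small hypothetical dimensions so it runs quickly:

    import dask
    from dask import array as da
    import netCDF4
    import numpy as np

    ds = netCDF4.Dataset("example.nc", "w")  # hypothetical scratch file
    ds.createDimension("x", 10_000)
    ds.createDimension("y", 1_000)
    var = ds.createVariable("data", np.float64, ("x", "y"))

    with dask.config.set({"array.chunk-size": "2048MiB"}):
        data = da.ones(var.shape, dtype=var.dtype)  # lazy - nothing allocated yet
    da.store(data, var)  # streams into the netCDF variable one chunk at a time

    ds.close()
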
From 8d43a63e82a785779bf4cc61a77b4d70ac6b0fea Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Thu, 19 May 2022 11:50:07 +0100
Subject: [PATCH 7/9] Add SPerf comment about total disk space.

---
 benchmarks/benchmarks/sperf/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/benchmarks/sperf/__init__.py b/benchmarks/benchmarks/sperf/__init__.py
index 77ee25b1ca..eccad56f6f 100644
--- a/benchmarks/benchmarks/sperf/__init__.py
+++ b/benchmarks/benchmarks/sperf/__init__.py
@@ -23,6 +23,7 @@ class FileMixin:
     # Allows time for large file generation.
     timeout = 3600.0
     # Largest file with these params: ~90GB.
+    # Total disk space: ~410GB.
     params = [
         [12, 384, 640, 960, 1280, 1668],
         [1, 36, 72],

From dc870b11217b02e9a55f3c5bf6ac260acbf583e1 Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Thu, 19 May 2022 13:34:10 +0100
Subject: [PATCH 8/9] Corrected CSPerf directories printout.

---
 noxfile.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/noxfile.py b/noxfile.py
index 1bc108d328..7919dfdfde 100755
--- a/noxfile.py
+++ b/noxfile.py
@@ -517,7 +517,6 @@ def asv_compare(*commits):
         print(
             f'New ASV results for "{run_type}".\n'
             f'See "{publish_subdir}",'
-            f'\n html in "{location / "html"}".'
             f'\n or JSON files under "{location / "results"}".'
         )

From ab467b5f5bd0ab9fbdef39c14d46dba6ff73d60e Mon Sep 17 00:00:00 2001
From: Martin Yeo
Date: Mon, 23 May 2022 17:03:56 +0100
Subject: [PATCH 9/9] Remove namespace conflict in SPerf combine_regions.

---
 benchmarks/benchmarks/sperf/combine_regions.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/benchmarks/sperf/combine_regions.py b/benchmarks/benchmarks/sperf/combine_regions.py
index 8c9b480312..e68382c06e 100644
--- a/benchmarks/benchmarks/sperf/combine_regions.py
+++ b/benchmarks/benchmarks/sperf/combine_regions.py
@@ -169,7 +169,7 @@ def recombine(self):
         )
         return result

-    def save(self):
+    def save_recombined_cube(self):
         save(self.recombined_cube, self.temp_save_path)

@@ -222,14 +222,14 @@ class SaveData(Mixin):

     def time_save(self, n_cubesphere):
         # Save to disk, which must compute data + stream it to file.
-        self.save()
+        self.save_recombined_cube()

     @TrackAddedMemoryAllocation.decorator()
     def track_addedmem_save(self, n_cubesphere):
-        self.save()
+        self.save_recombined_cube()

     def track_filesize_saved(self, n_cubesphere):
-        self.save()
+        self.save_recombined_cube()
         return self.temp_save_path.stat().st_size * 1.0e-6

@@ -250,8 +250,8 @@

     def time_stream_file2file(self, n_cubesphere):
         # Save to disk, which must compute data + stream it to file.
-        self.save()
+        self.save_recombined_cube()

     @TrackAddedMemoryAllocation.decorator()
     def track_addedmem_stream_file2file(self, n_cubesphere):
-        self.save()
+        self.save_recombined_cube()
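For reference, `track_filesize_saved` (renamed above to call `save_recombined_cube`) follows ASV's `track_` convention: any method whose name starts with `track_` is a benchmark whose return value is recorded as the result, and the class-level `unit` attribute labels that value in the published output. A stripped-down sketch, with a stand-in write instead of the real cube save:

    from pathlib import Path

    class FileSizeSketch:
        # ASV records whatever a track_* method returns; unit labels it.
        unit = "Mb"

        def setup(self):
            # Hypothetical stand-in for saving the recombined cube.
            self.temp_save_path = Path("tmp_example.nc")
            self.temp_save_path.write_bytes(b"\0" * 1_000_000)

        def track_filesize_saved(self):
            return self.temp_save_path.stat().st_size * 1.0e-6

        def teardown(self):
            self.temp_save_path.unlink(missing_ok=True)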