From c0546d962c7dc008ab474cbdb3861366cb2d8555 Mon Sep 17 00:00:00 2001
From: Patrick Peglar
Date: Wed, 19 Feb 2020 17:44:01 +0000
Subject: [PATCH 1/3] Stop PPDataProxy accessing the file when no data is
 needed. (#3659)

---
 .../bugfix_2020-Feb-14_pp_emptyslices.txt     |   5 +
 lib/iris/fileformats/pp.py                    |  34 +++--
 .../unit/fileformats/pp/test_PPDataProxy.py   | 126 +++++++++++++++++-
 lib/iris/util.py                              |  61 +++++++++
 4 files changed, 211 insertions(+), 15 deletions(-)
 create mode 100644 docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt

diff --git a/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt b/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt
new file mode 100644
index 0000000000..389209ae7e
--- /dev/null
+++ b/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt
@@ -0,0 +1,5 @@
+* Fixed a problem which was causing file loads to fetch *all* field data
+  whenever UM files (PP or Fieldsfiles) were loaded.
+  With large source files, initial file loads were slow, with large memory
+  usage before any cube data was even fetched, and large enough files could
+  cause a crash. The problem occurs only with Dask versions >= 2.0.
diff --git a/lib/iris/fileformats/pp.py b/lib/iris/fileformats/pp.py
index a57d5b5116..f1bd51f645 100644
--- a/lib/iris/fileformats/pp.py
+++ b/lib/iris/fileformats/pp.py
@@ -38,7 +38,7 @@
 )
 import iris.fileformats.rules
 import iris.coord_systems
-
+from iris.util import _array_slice_ifempty
 
 try:
     import mo_pack
@@ -594,19 +594,25 @@ def ndim(self):
         return len(self.shape)
 
     def __getitem__(self, keys):
-        with open(self.path, "rb") as pp_file:
-            pp_file.seek(self.offset, os.SEEK_SET)
-            data_bytes = pp_file.read(self.data_len)
-            data = _data_bytes_to_shaped_array(
-                data_bytes,
-                self.lbpack,
-                self.boundary_packing,
-                self.shape,
-                self.src_dtype,
-                self.mdi,
-            )
-            data = data.__getitem__(keys)
-            return np.asanyarray(data, dtype=self.dtype)
+        # Check for 'empty' slicings, in which case don't fetch the data:
+        # since Dask v2, 'dask.array.from_array' performs an empty slicing
+        # to snapshot array metadata, and must not trigger a file access.
+        result = _array_slice_ifempty(keys, self.shape, self.dtype)
+        if result is None:
+            with open(self.path, "rb") as pp_file:
+                pp_file.seek(self.offset, os.SEEK_SET)
+                data_bytes = pp_file.read(self.data_len)
+                data = _data_bytes_to_shaped_array(
+                    data_bytes,
+                    self.lbpack,
+                    self.boundary_packing,
+                    self.shape,
+                    self.src_dtype,
+                    self.mdi,
+                )
+                result = data.__getitem__(keys)
+
+        return np.asanyarray(result, dtype=self.dtype)
 
     def __repr__(self):
         fmt = (
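The Dask behaviour motivating this change can be demonstrated in isolation.
The following is a minimal sketch, not part of the patch : the RecordingProxy
class is invented for illustration. It shows that, since Dask 2.0,
'dask.array.from_array' probes a wrapped array-like with an all-empty
slicing, purely to capture result metadata:

    import dask.array as da
    import numpy as np

    class RecordingProxy:
        """An array-like which records the keys passed to __getitem__."""

        def __init__(self, shape, dtype):
            self.shape = shape
            self.dtype = np.dtype(dtype)
            self.ndim = len(shape)
            self.requested_keys = []

        def __getitem__(self, keys):
            self.requested_keys.append(keys)
            # Serve the request from an in-memory array of zeros, standing
            # in for an expensive file read.
            return np.zeros(self.shape, dtype=self.dtype)[keys]

    proxy = RecordingProxy((3, 4), np.float32)
    lazy = da.from_array(proxy, chunks=(3, 4), name=False)
    # Before any compute, the proxy has already been indexed once, with
    # (slice(0, 0, None), slice(0, 0, None)) -- the metadata 'snapshot'.
    print(proxy.requested_keys)
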
diff --git a/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py b/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
index 53fcc08b95..8a22da061c 100644
--- a/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
+++ b/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
@@ -10,6 +10,7 @@
 import iris.tests as tests
 
 from unittest import mock
+import numpy as np
 
 from iris.fileformats.pp import PPDataProxy, SplittableInt
 
@@ -21,7 +22,7 @@ def test_lbpack_SplittableInt(self):
         self.assertEqual(proxy.lbpack, lbpack)
         self.assertIs(proxy.lbpack, lbpack)
 
-    def test_lnpack_raw(self):
+    def test_lbpack_raw(self):
         lbpack = 4321
         proxy = PPDataProxy(None, None, None, None, None, lbpack, None, None)
         self.assertEqual(proxy.lbpack, lbpack)
@@ -33,5 +34,128 @@ def test_lnpack_raw(self):
         self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)
 
 
+class SliceTranslator:
+    """
+    Class to translate an array-indexing expression into a tuple of keys.
+
+    An instance just returns the argument of its __getitem__ call.
+
+    """
+
+    def __getitem__(self, keys):
+        return keys
+
+
+# A multidimensional-indexable object that returns its index keys, so we can
+# use multidimensional-indexing notation to specify a slicing expression.
+Slices = SliceTranslator()
+
+
+class Test__getitem__slicing(tests.IrisTest):
+    def _check_slicing(
+        self, test_shape, indices, result_shape, data_was_fetched=True
+    ):
+        # Check behaviour of the getitem call with specific slicings.
+        # Especially: check cases where a fetch does *not* read from the file.
+        # This is necessary because, since Dask 2.0, the "from_array" function
+        # takes a zero-length slice of its array argument, to capture array
+        # metadata, and in those cases we want to avoid file access.
+        test_dtype = np.dtype(np.float32)
+        proxy = PPDataProxy(
+            shape=test_shape,
+            src_dtype=test_dtype,
+            path=None,
+            offset=None,
+            data_len=None,
+            lbpack=0,  # Note: a 'real' value is needed.
+            boundary_packing=None,
+            mdi=None,
+        )
+
+        # Mock out the file-open call, to see if the file would be read.
+        builtin_open_func_name = "builtins.open"
+        mock_fileopen = self.patch(builtin_open_func_name)
+
+        # Also mock out the '_data_bytes_to_shaped_array' call, to fake
+        # minimal operation in the cases where file-open *does* get called.
+        fake_data = np.zeros(test_shape, dtype=test_dtype)
+        self.patch(
+            "iris.fileformats.pp._data_bytes_to_shaped_array",
+            mock.MagicMock(return_value=fake_data),
+        )
+
+        # Test the requested indexing operation.
+        result = proxy.__getitem__(indices)
+
+        # Check the behaviour and results were as expected.
+        self.assertEqual(mock_fileopen.called, data_was_fetched)
+        self.assertIsInstance(result, np.ndarray)
+        self.assertEqual(result.dtype, test_dtype)
+        self.assertEqual(result.shape, result_shape)
+
+    def test_slicing_1d_normal(self):
+        # A 'normal' 1d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[1:10],
+            result_shape=(2,),
+            data_was_fetched=True,
+        )
+
+    def test_slicing_1d_empty(self):
+        # A 1d testcase with an empty slicing.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[0:0],
+            result_shape=(0,),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_2d_normal(self):
+        # A 2d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[2, :3],
+            result_shape=(3,),
+            data_was_fetched=True,
+        )
+
+    def test_slicing_2d_allempty(self):
+        # A 2d testcase with all empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0, 0:0],
+            result_shape=(0, 0),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_2d_empty_dim0(self):
+        # A 2d testcase with an empty slice on the first dimension.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0],
+            result_shape=(0, 4),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_2d_empty_dim1(self):
+        # A 2d testcase with an empty slice, and an integer index.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[1, 0:0],
+            result_shape=(0,),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_complex(self):
+        # Multiple dimensions with multiple empty slices.
+        self._check_slicing(
+            test_shape=(3, 4, 2, 5, 6, 3, 7),
+            indices=Slices[1:3, 2, 0:0, :, 1:1, :100],
+            result_shape=(2, 0, 5, 0, 3, 7),
+            data_was_fetched=False,
+        )
+
+
 if __name__ == "__main__":
     tests.main()
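The expected result shapes in these tests can be cross-checked against plain
numpy indexing, since the proxy must reproduce ordinary numpy slicing
semantics. For example, for the 'test_slicing_complex' case above:

    import numpy as np

    array = np.zeros((3, 4, 2, 5, 6, 3, 7))
    # The integer index drops a dimension; '0:0' and '1:1' give 0-length dims.
    print(array[1:3, 2, 0:0, :, 1:1, :100].shape)  # --> (2, 0, 5, 0, 3, 7)
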
diff --git a/lib/iris/util.py b/lib/iris/util.py
index 3bda110a07..3212eba4a5 100644
--- a/lib/iris/util.py
+++ b/lib/iris/util.py
@@ -959,6 +959,67 @@ def __lt__(self, other):
         return NotImplemented
 
 
+def _array_slice_ifempty(keys, shape, dtype):
+    """
+    Detect cases where an array slice will contain no data, as it contains a
+    zero-length dimension, and produce an equivalent result for those cases.
+
+    The function indicates 'empty' slicing cases, by returning an array
+    equal to the slice result, without needing any actual data.
+
+    Args:
+
+    * keys (indexing key, or tuple of keys):
+        The argument from an array __getitem__ call.
+        Only tuples of integers and slices are supported, in particular no
+        newaxis, ellipsis or array keys.
+        These are the types of array access usage we expect from Dask.
+    * shape (tuple of int):
+        The shape of the array being indexed.
+    * dtype (numpy.dtype):
+        The dtype of the array being indexed.
+
+    Returns:
+        result (np.ndarray or None):
+            If 'keys' contains a slice(0, 0), this is an ndarray of the
+            correct resulting shape and provided dtype.
+            Otherwise it is None.
+
+    .. note::
+
+        This is used to prevent DataProxy arraylike objects from fetching
+        their file data when wrapped as Dask arrays.
+        This is because, for Dask >= 2.0, the "dask.array.from_array" call
+        performs a fetch like [0:0, 0:0, ...], to 'snapshot' array metadata.
+        This function enables us to avoid triggering a file data fetch in
+        those cases : this is consistent, because the result will not
+        contain any actual data content.
+
+    """
+    # Convert a single key into a 1-tuple, so we always have a tuple of keys.
+    if isinstance(keys, tuple):
+        keys_tuple = keys
+    else:
+        keys_tuple = (keys,)
+
+    if any(key == slice(0, 0) for key in keys_tuple):
+        # An 'empty' slice is present : return a 'fake' array instead.
+        target_shape = list(shape)
+        for i_dim, key in enumerate(keys_tuple):
+            if key == slice(0, 0):
+                # Reduce dims with empty slicing to length 0.
+                target_shape[i_dim] = 0
+        # Create a prototype result : no memory usage, as some dims are 0.
+        result = np.zeros(target_shape, dtype=dtype)
+        # Index with the original keys to produce the desired result shape.
+        # Note : also ok in 0-length dims, as the slice is always '0:0'.
+        result = result[keys]
+    else:
+        result = None
+
+    return result
+
+
 def create_temp_filename(suffix=""):
     """Return a temporary file name.
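The return convention of the new helper can also be seen directly at an
interactive prompt. A quick sketch (the shapes and dtype are arbitrary
example values):

    >>> import numpy as np
    >>> from iris.util import _array_slice_ifempty
    >>> _array_slice_ifempty((slice(0, 0), slice(None)), (3, 4), np.dtype("f4")).shape
    (0, 4)
    >>> # No empty slice present : returns None, meaning 'fetch data as usual'.
    >>> _array_slice_ifempty((slice(1, 3), 2), (3, 4), np.dtype("f4")) is None
    True
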
From a2f075b3204da37df15b2292a9e0310ecf586ebb Mon Sep 17 00:00:00 2001
From: Martin Yeo <40734014+trexfeathers@users.noreply.github.com>
Date: Thu, 20 Feb 2020 16:07:53 +0000
Subject: [PATCH 2/3] Prep for 2.4 release. (#3654)

---
 docs/iris/src/whatsnew/2.4.rst                | 59 +++++++++++++++++++
 .../bugfix_2020-Feb-14_pp_emptyslices.txt     |  5 --
 2 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 docs/iris/src/whatsnew/2.4.rst
 delete mode 100644 docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt

diff --git a/docs/iris/src/whatsnew/2.4.rst b/docs/iris/src/whatsnew/2.4.rst
new file mode 100644
index 0000000000..2facb97a7a
--- /dev/null
+++ b/docs/iris/src/whatsnew/2.4.rst
@@ -0,0 +1,59 @@
+What's New in Iris 2.4.0
+************************
+
+:Release: 2.4.0
+:Date: 2020-02-20
+
+This document explains the new/changed features of Iris in version 2.4.0
+(:doc:`View all changes <index>`.)
+
+
+Iris 2.4.0 Features
+===================
+
+.. admonition:: Last Python 2 version of Iris
+
+   Iris 2.4 is a final extra release of Iris 2, which back-ports specific desired features from
+   Iris 3 (not yet released).
+
+   The purpose of this is both to support early adoption of certain newer features,
+   and to provide a final release for Python 2.
+
+   The next release of Iris will be version 3.0 : a major-version release which
+   introduces breaking API and behavioural changes, and only supports Python 3.
+
+* :class:`iris.coord_systems.Geostationary` can now accept creation arguments of
+  `false_easting=None` or `false_northing=None`, equivalent to values of 0.
+  Previously these kwargs could be omitted, but could not be set to `None`.
+  This also enables loading of netcdf data on a Geostationary grid, where either of these
+  keys is not present as a grid-mapping variable property : previously, loading any
+  such data caused an exception.
+* The area weights used when performing area weighted regridding with :class:`iris.analysis.AreaWeighted`
+  are now cached.
+  This allows a significant speedup when regridding multiple similar cubes, by repeatedly using
+  a `'regridder' object <../iris/iris/analysis.html?highlight=regridder#iris.analysis.AreaWeighted.regridder>`_
+  which you created first (see the sketch below).
+* Name constraint matching against cubes during loading or extracting has been relaxed from strictly matching
+  against the :meth:`~iris.cube.Cube.name`, to matching against either the
+  ``standard_name``, ``long_name``, NetCDF ``var_name``, or ``STASH`` attributes metadata of a cube.
+* Cubes and coordinates now have a new ``names`` property that contains a tuple of the
+  ``standard_name``, ``long_name``, NetCDF ``var_name``, and ``STASH`` attributes metadata.
+* The :class:`~iris.NameConstraint` provides richer name constraint matching when loading or extracting
+  against cubes, by supporting a constraint against any combination of
+  ``standard_name``, ``long_name``, NetCDF ``var_name`` and ``STASH``
+  from the attributes dictionary of a :class:`~iris.cube.Cube`.
+
+
+Iris 2.4.0 Dependency Updates
+=============================
+* Iris is now able to use the latest version of matplotlib.
+
+
+Bugs Fixed
+==========
+* Fixed a problem which was causing file loads to fetch *all* field data
+  whenever UM files (PP or Fieldsfiles) were loaded.
+  With large source files, initial file loads were slow, with large memory
+  usage before any cube data was even fetched, and large enough files could
+  cause a crash. The problem occurs only with Dask versions >= 2.0.
+
diff --git a/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt b/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt
deleted file mode 100644
index 389209ae7e..0000000000
--- a/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-* Fixed a problem which was causing file loads to fetch *all* field data
-  whenever UM files (PP or Fieldsfiles) were loaded.
-  With large source files, initial file loads were slow, with large memory
-  usage before any cube data was even fetched, and large enough files could
-  cause a crash. The problem occurs only with Dask versions >= 2.0.
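As a usage note for the regridder caching mentioned in the release notes
above, re-use works as sketched below. This is only an illustration : the
file names and cube variables (src_cube, grid_cube, more_cubes) are
invented, and the cubes passed to the regridder must share the source
cube's horizontal grid.

    import iris
    from iris.analysis import AreaWeighted

    # Hypothetical inputs : any cubes with suitable horizontal grids will do.
    src_cube = iris.load_cube("input_fields.nc")
    grid_cube = iris.load_cube("target_grid.nc")
    more_cubes = iris.load("more_input_fields.nc")

    # Creating the regridder once computes (and caches) the area weights.
    regridder = AreaWeighted(mdtol=0.5).regridder(src_cube, grid_cube)

    # Re-using it on similar cubes avoids recomputing those weights.
    results = [regridder(cube) for cube in more_cubes]
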
From 172f109fb3bb159beba2ad6ed32866ddc58b0726 Mon Sep 17 00:00:00 2001
From: Patrick Peglar
Date: Fri, 21 Feb 2020 11:32:44 +0000
Subject: [PATCH 3/3] Add 2.4 whatsnew into full whatsnew list.

---
 docs/iris/src/whatsnew/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/iris/src/whatsnew/index.rst b/docs/iris/src/whatsnew/index.rst
index 179216ccb5..03834a43a7 100644
--- a/docs/iris/src/whatsnew/index.rst
+++ b/docs/iris/src/whatsnew/index.rst
@@ -11,6 +11,7 @@ Iris versions.
 
    latest.rst
    3.0.rst
+   2.4.rst
    2.3.rst
    2.2.rst
    2.1.rst
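Taken together, these patches mean that wrapping a PPDataProxy as a Dask
array no longer touches the source file. A minimal end-to-end check, as a
sketch rather than a shipped test : it borrows the mocking approach from
the unit tests above, and "dummy.pp" is a placeholder path which is never
actually opened.

    import dask.array as da
    import numpy as np
    from unittest import mock

    from iris.fileformats.pp import PPDataProxy

    proxy = PPDataProxy(
        shape=(3, 4), src_dtype=np.dtype("f4"), path="dummy.pp",
        offset=0, data_len=None, lbpack=0, boundary_packing=None, mdi=None,
    )
    with mock.patch("builtins.open") as mock_open:
        lazy = da.from_array(proxy, chunks=(3, 4), name=False)
    # The metadata 'snapshot' slicing was served without any file access.
    print(mock_open.called)  # --> False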