Commit 7f6e5e2

pp-mo and trexfeathers authored

2v4 mergeback picks (#3668)

* Stop PPDataProxy accessing the file when no data is needed. (#3659)
* Add 2.4 whatsnew into full whatsnew list.

Co-authored-by: Martin Yeo <[email protected]>

1 parent ecfbcf2 commit 7f6e5e2

File tree (5 files changed: +266 −15 lines)

* docs/iris/src/whatsnew/2.4.rst
* docs/iris/src/whatsnew/index.rst
* lib/iris/fileformats/pp.py
* lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
* lib/iris/util.py

docs/iris/src/whatsnew/2.4.rst

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+What's New in Iris 2.4.0
+************************
+
+:Release: 2.4.0
+:Date: 2020-02-20
+
+This document explains the new/changed features of Iris in version 2.4.0
+(:doc:`View all changes <index>`.)
+
+
+Iris 2.4.0 Features
+===================
+
+.. admonition:: Last Python 2 version of Iris
+
+    Iris 2.4 is a final extra release of Iris 2, which back-ports specific desired features from
+    Iris 3 (not yet released).
+
+    The purpose of this is both to support early adoption of certain newer features,
+    and to provide a final release for Python 2.
+
+    The next release of Iris will be version 3.0: a major-version release which
+    introduces breaking API and behavioural changes, and only supports Python 3.
+
+* :class:`iris.coord_systems.Geostationary` can now accept creation arguments of
+  `false_easting=None` or `false_northing=None`, equivalent to values of 0.
+  Previously these kwargs could be omitted, but could not be set to `None`.
+  This also enables loading of NetCDF data on a Geostationary grid where either of these
+  keys is not present as a grid-mapping variable property: previously, loading any
+  such data caused an exception.
+* The area weights used when performing area weighted regridding with :class:`iris.analysis.AreaWeighted`
+  are now cached.
+  This allows a significant speedup when regridding multiple similar cubes, by repeatedly using
+  a `'regridder' object <../iris/iris/analysis.html?highlight=regridder#iris.analysis.AreaWeighted.regridder>`_
+  which you created first.
+* Name constraint matching against cubes during loading or extracting has been relaxed from strictly matching
+  against the :meth:`~iris.cube.Cube.name`, to matching against either the
+  ``standard_name``, ``long_name``, NetCDF ``var_name``, or ``STASH`` attributes metadata of a cube.
+* Cubes and coordinates now have a new ``names`` property that contains a tuple of the
+  ``standard_name``, ``long_name``, NetCDF ``var_name``, and ``STASH`` attributes metadata.
+* The :class:`~iris.NameConstraint` provides richer name constraint matching when loading or extracting
+  against cubes, by supporting a constraint against any combination of
+  ``standard_name``, ``long_name``, NetCDF ``var_name`` and ``STASH``
+  from the attributes dictionary of a :class:`~iris.cube.Cube`.
+
+
+Iris 2.4.0 Dependency Updates
+=============================
+* Iris is now able to use the latest version of matplotlib.
+
+
+Bugs Fixed
+==========
+* Fixed a problem which was causing file loads to fetch *all* field data
+  whenever UM files (PP or Fieldsfiles) were loaded.
+  With large source files, initial file loads are slow, with large memory usage
+  before any cube data is even fetched. Large enough files will cause a crash.
+  The problem occurs only with Dask versions >= 2.0.
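The AreaWeighted caching feature above is easiest to see as a regridder-reuse pattern. The following is a minimal, self-contained sketch, not code from this commit: the `make_grid_cube` helper is invented for illustration, and the small lat/lon grids are arbitrary.

import numpy as np

import iris.coord_systems
from iris.analysis import AreaWeighted
from iris.coords import DimCoord
from iris.cube import Cube


def make_grid_cube(nlat, nlon, max_lat):
    # Hypothetical helper: build a small rectilinear lat/lon cube with
    # bounded coordinates, as area-weighted regridding requires bounds.
    cs = iris.coord_systems.GeogCS(6371229.0)
    lat = DimCoord(
        np.linspace(-max_lat, max_lat, nlat),
        standard_name="latitude",
        units="degrees",
        coord_system=cs,
    )
    lon = DimCoord(
        np.linspace(0.0, 360.0, nlon, endpoint=False),
        standard_name="longitude",
        units="degrees",
        coord_system=cs,
        circular=True,
    )
    lat.guess_bounds()
    lon.guess_bounds()
    cube = Cube(np.zeros((nlat, nlon), dtype=np.float32))
    cube.add_dim_coord(lat, 0)
    cube.add_dim_coord(lon, 1)
    return cube


src_cubes = [make_grid_cube(20, 30, 80.0) for _ in range(4)]  # shared grid
target = make_grid_cube(10, 15, 60.0)  # coarser target, inside source extent

# Build the regridder once; per the whatsnew entry above, the area weights
# it computes are now cached inside it.
regridder = AreaWeighted().regridder(src_cubes[0], target)

# Re-applying the same regridder to similar cubes reuses the cached weights,
# rather than recomputing them for every cube.
results = [regridder(cube) for cube in src_cubes]
print(results[0].shape)  # (10, 15)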

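Likewise, the relaxed name matching, the new ``names`` property, and :class:`~iris.NameConstraint` can be sketched with in-memory cubes. This is illustrative only, assuming Iris >= 2.4; the cube names are arbitrary.

import numpy as np

from iris import NameConstraint
from iris.cube import Cube, CubeList

# Two simple in-memory cubes, to avoid needing a real data file.
cubes = CubeList(
    [
        Cube(np.zeros(3), standard_name="air_temperature", var_name="tas"),
        Cube(np.zeros(3), standard_name="air_pressure", var_name="pres"),
    ]
)

# Constrain on the NetCDF variable name alone ...
(match,) = cubes.extract(NameConstraint(var_name="tas"))
print(match.name())  # air_temperature

# ... or on any combination of the name attributes.
(match,) = cubes.extract(
    NameConstraint(standard_name="air_temperature", var_name="tas")
)

# The new 'names' property gives all the name metadata at once, as a tuple
# of the standard_name, long_name, var_name and STASH values.
print(match.names)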
docs/iris/src/whatsnew/index.rst

Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@ Iris versions.
 
    latest.rst
    3.0.rst
+   2.4.rst
    2.3.rst
    2.2.rst
    2.1.rst

lib/iris/fileformats/pp.py

Lines changed: 20 additions & 14 deletions

@@ -38,7 +38,7 @@
 )
 import iris.fileformats.rules
 import iris.coord_systems
-
+from iris.util import _array_slice_ifempty
 
 try:
     import mo_pack
@@ -594,19 +594,25 @@ def ndim(self):
         return len(self.shape)
 
     def __getitem__(self, keys):
-        with open(self.path, "rb") as pp_file:
-            pp_file.seek(self.offset, os.SEEK_SET)
-            data_bytes = pp_file.read(self.data_len)
-            data = _data_bytes_to_shaped_array(
-                data_bytes,
-                self.lbpack,
-                self.boundary_packing,
-                self.shape,
-                self.src_dtype,
-                self.mdi,
-            )
-            data = data.__getitem__(keys)
-            return np.asanyarray(data, dtype=self.dtype)
+        # Check for 'empty' slicings, in which case don't fetch the data.
+        # Because, since Dask v2, 'dask.array.from_array' performs an empty
+        # slicing and we must not fetch the data at that time.
+        result = _array_slice_ifempty(keys, self.shape, self.dtype)
+        if result is None:
+            with open(self.path, "rb") as pp_file:
+                pp_file.seek(self.offset, os.SEEK_SET)
+                data_bytes = pp_file.read(self.data_len)
+                data = _data_bytes_to_shaped_array(
+                    data_bytes,
+                    self.lbpack,
+                    self.boundary_packing,
+                    self.shape,
+                    self.src_dtype,
+                    self.mdi,
+                )
+                result = data.__getitem__(keys)
+
+        return np.asanyarray(result, dtype=self.dtype)
 
     def __repr__(self):
         fmt = (
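To see why the new guard in `__getitem__` matters, here is a stand-alone sketch (not Iris code) of the Dask behaviour described in the comments above: since Dask 2.0, `dask.array.from_array` probes the wrapped object with an all-empty slice to capture array metadata. `RecordingProxy` is a hypothetical stand-in for `PPDataProxy`.

import dask.array as da
import numpy as np


class RecordingProxy:
    # Hypothetical stand-in for PPDataProxy: records whether __getitem__
    # was ever invoked, instead of reading a file.
    shape = (3, 4)
    dtype = np.dtype("float32")
    ndim = 2

    def __init__(self):
        self.accessed = False

    def __getitem__(self, keys):
        # In the real PPDataProxy, this is where the file would be opened.
        self.accessed = True
        return np.zeros(self.shape, dtype=self.dtype)[keys]


proxy = RecordingProxy()
lazy = da.from_array(proxy, chunks=proxy.shape)
# On Dask >= 2.0 the metadata probe has already sliced proxy[0:0, 0:0]:
print(proxy.accessed)  # True -- hence the _array_slice_ifempty short-circuit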

lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py

Lines changed: 125 additions & 1 deletion

@@ -10,6 +10,7 @@
 import iris.tests as tests
 
 from unittest import mock
+import numpy as np
 
 from iris.fileformats.pp import PPDataProxy, SplittableInt
 
@@ -21,7 +22,7 @@ def test_lbpack_SplittableInt(self):
         self.assertEqual(proxy.lbpack, lbpack)
         self.assertIs(proxy.lbpack, lbpack)
 
-    def test_lnpack_raw(self):
+    def test_lbpack_raw(self):
         lbpack = 4321
         proxy = PPDataProxy(None, None, None, None, None, lbpack, None, None)
         self.assertEqual(proxy.lbpack, lbpack)
@@ -33,5 +34,128 @@ def test_lnpack_raw(self):
         self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)
 
 
+class SliceTranslator:
+    """
+    Class to translate an array-indexing expression into a tuple of keys.
+
+    An instance just returns the argument of its __getitem__ call.
+
+    """
+
+    def __getitem__(self, keys):
+        return keys
+
+
+# A multidimensional-indexable object that returns its index keys, so we can
+# use multidimensional-indexing notation to specify a slicing expression.
+Slices = SliceTranslator()
+
+
+class Test__getitem__slicing(tests.IrisTest):
+    def _check_slicing(
+        self, test_shape, indices, result_shape, data_was_fetched=True
+    ):
+        # Check behaviour of the getitem call with specific slicings.
+        # Especially: check cases where a fetch does *not* read from the file.
+        # This is necessary because, since Dask 2.0, the "from_array" function
+        # takes a zero-length slice of its array argument, to capture array
+        # metadata, and in those cases we want to avoid file access.
+        test_dtype = np.dtype(np.float32)
+        proxy = PPDataProxy(
+            shape=test_shape,
+            src_dtype=test_dtype,
+            path=None,
+            offset=None,
+            data_len=None,
+            lbpack=0,  # Note: a 'real' value is needed.
+            boundary_packing=None,
+            mdi=None,
+        )
+
+        # Mock out the file-open call, to see if the file would be read.
+        builtin_open_func_name = "builtins.open"
+        mock_fileopen = self.patch(builtin_open_func_name)
+
+        # Also mock out the 'databytes_to_shaped_array' call, to fake minimal
+        # operation in the cases where file-open *does* get called.
+        fake_data = np.zeros(test_shape, dtype=test_dtype)
+        self.patch(
+            "iris.fileformats.pp._data_bytes_to_shaped_array",
+            mock.MagicMock(return_value=fake_data),
+        )
+
+        # Test the requested indexing operation.
+        result = proxy.__getitem__(indices)
+
+        # Check the behaviour and results were as expected.
+        self.assertEqual(mock_fileopen.called, data_was_fetched)
+        self.assertIsInstance(result, np.ndarray)
+        self.assertEqual(result.dtype, test_dtype)
+        self.assertEqual(result.shape, result_shape)
+
+    def test_slicing_1d_normal(self):
+        # A 'normal' 1d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[1:10],
+            result_shape=(2,),
+            data_was_fetched=True,
+        )
+
+    def test_slicing_1d_empty(self):
+        # A 1d testcase with an empty slicing.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[0:0],
+            result_shape=(0,),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_2d_normal(self):
+        # A 2d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[2, :3],
+            result_shape=(3,),
+            data_was_fetched=True,
+        )
+
+    def test_slicing_2d_allempty(self):
+        # A 2d testcase with all empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0, 0:0],
+            result_shape=(0, 0),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_2d_empty_dim0(self):
+        # A 2d testcase with an empty slice.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0],
+            result_shape=(0, 4),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_2d_empty_dim1(self):
+        # A 2d testcase with an empty slice, and an integer index.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[1, 0:0],
+            result_shape=(0,),
+            data_was_fetched=False,
+        )
+
+    def test_slicing_complex(self):
+        # Multiple dimensions with multiple empty slices.
+        self._check_slicing(
+            test_shape=(3, 4, 2, 5, 6, 3, 7),
+            indices=Slices[1:3, 2, 0:0, :, 1:1, :100],
+            result_shape=(2, 0, 5, 0, 3, 7),
+            data_was_fetched=False,
+        )
+
+
 if __name__ == "__main__":
     tests.main()
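The `SliceTranslator` idiom above deserves a quick stand-alone illustration: indexing an instance simply hands back the keys, so the tests can spell out slicings in ordinary subscript notation.

class SliceTranslator:
    def __getitem__(self, keys):
        return keys


Slices = SliceTranslator()
print(Slices[1:3, 2, 0:0])  # (slice(1, 3, None), 2, slice(0, 0, None))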

lib/iris/util.py

Lines changed: 61 additions & 0 deletions

@@ -959,6 +959,67 @@ def __lt__(self, other):
         return NotImplemented
 
 
+def _array_slice_ifempty(keys, shape, dtype):
+    """
+    Detect cases where an array slice will contain no data, as it contains a
+    zero-length dimension, and produce an equivalent result for those cases.
+
+    The function indicates 'empty' slicing cases, by returning an array equal
+    to the slice result in those cases.
+
+    Args:
+
+    * keys (indexing key, or tuple of keys):
+        The argument from an array __getitem__ call.
+        Only tuples of integers and slices are supported, in particular no
+        newaxis, ellipsis or array keys.
+        These are the types of array access usage we expect from Dask.
+    * shape (tuple of int):
+        The shape of the array being indexed.
+    * dtype (numpy.dtype):
+        The dtype of the array being indexed.
+
+    Returns:
+        result (np.ndarray or None):
+            If 'keys' contains a slice(0, 0), this is an ndarray of the correct
+            resulting shape and provided dtype.
+            Otherwise it is None.
+
+    .. note::
+
+        This is used to prevent DataProxy arraylike objects from fetching their
+        file data when wrapped as Dask arrays.
+        This is because, for Dask >= 2.0, the "dask.array.from_array" call
+        performs a fetch like [0:0, 0:0, ...], to 'snapshot' array metadata.
+        This function enables us to avoid triggering a file data fetch in those
+        cases : This is consistent because the result will not contain any
+        actual data content.
+
+    """
+    # Convert a single key into a 1-tuple, so we always have a tuple of keys.
+    if isinstance(keys, tuple):
+        keys_tuple = keys
+    else:
+        keys_tuple = (keys,)
+
+    if any(key == slice(0, 0) for key in keys_tuple):
+        # An 'empty' slice is present : Return a 'fake' array instead.
+        target_shape = list(shape)
+        for i_dim, key in enumerate(keys_tuple):
+            if key == slice(0, 0):
+                # Reduce dims with empty slicing to length 0.
+                target_shape[i_dim] = 0
+        # Create a prototype result : no memory usage, as some dims are 0.
+        result = np.zeros(target_shape, dtype=dtype)
+        # Index with original keys to produce the desired result shape.
+        # Note : also ok in 0-length dims, as the slice is always '0:0'.
+        result = result[keys]
+    else:
+        result = None
+
+    return result
+
+
 def create_temp_filename(suffix=""):
     """Return a temporary file name.
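The helper's contract can be demonstrated directly. Here is a small sketch grounded in the docstring above; note that `_array_slice_ifempty` is a private helper, so this is illustration rather than public API.

import numpy as np

from iris.util import _array_slice_ifempty

# An 'empty' slicing yields a correctly-shaped zero-size array, so the
# caller can skip any file access:
result = _array_slice_ifempty(
    (slice(0, 0), slice(None)), shape=(3, 4), dtype=np.dtype("float32")
)
print(result.shape, result.dtype)  # (0, 4) float32

# A 'normal' slicing returns None, telling the caller to fetch real data:
assert _array_slice_ifempty((slice(1, 3),), (3, 4), np.dtype("float32")) is None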
