From edd551c2e49b0b3d60ff72da359a02a7ec22cb79 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 19 May 2022 14:56:16 +0100 Subject: [PATCH 1/9] Permit fast percentile with mdtol=0 --- lib/iris/analysis/__init__.py | 46 ++++++++++++------- .../tests/unit/analysis/test_PERCENTILE.py | 31 ++++++++++++- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/lib/iris/analysis/__init__.py b/lib/iris/analysis/__init__.py index 60994fb6c2..1af00fc351 100644 --- a/lib/iris/analysis/__init__.py +++ b/lib/iris/analysis/__init__.py @@ -720,6 +720,25 @@ def __init__(self, units_func=None, **kwargs): **kwargs, ) + def _base_aggregate(self, data, axis, lazy, **kwargs): + """ + Method to avoid duplication of checks in aggregate and lazy_aggregate. + """ + msg = "{} aggregator requires the mandatory keyword argument {!r}." + for arg in self._args: + if arg not in kwargs: + raise ValueError(msg.format(self.name(), arg)) + + if kwargs.get("fast_percentile_method", False) and ( + kwargs.get("mdtol", 1) != 0 + ): + kwargs["error_on_masked"] = True + + if lazy: + return _Aggregator.lazy_aggregate(self, data, axis, **kwargs) + else: + return _Aggregator.aggregate(self, data, axis, **kwargs) + def aggregate(self, data, axis, **kwargs): """ Perform the percentile aggregation over the given data. @@ -755,12 +774,7 @@ def aggregate(self, data, axis, **kwargs): """ - msg = "{} aggregator requires the mandatory keyword argument {!r}." - for arg in self._args: - if arg not in kwargs: - raise ValueError(msg.format(self.name(), arg)) - - return _Aggregator.aggregate(self, data, axis, **kwargs) + return self._base_aggregate(data, axis, lazy=False, **kwargs) def lazy_aggregate(self, data, axis, **kwargs): """ @@ -794,12 +808,7 @@ def lazy_aggregate(self, data, axis, **kwargs): """ - msg = "{} aggregator requires the mandatory keyword argument {!r}." - for arg in self._args: - if arg not in kwargs: - raise ValueError(msg.format(self.name(), arg)) - - return _Aggregator.lazy_aggregate(self, data, axis, **kwargs) + return self._base_aggregate(data, axis, lazy=True, **kwargs) def post_process(self, collapsed_cube, data_result, coords, **kwargs): """ @@ -1281,9 +1290,13 @@ def _calc_percentile(data, percent, fast_percentile_method=False, **kwargs): """ if fast_percentile_method: - msg = "Cannot use fast np.percentile method with masked array." - if ma.is_masked(data): - raise TypeError(msg) + if kwargs.pop("error_on_masked", False): + msg = ( + "Cannot use fast np.percentile method with masked array unless" + " mdtol is 0." + ) + if ma.is_masked(data): + raise TypeError(msg) result = np.percentile(data, percent, axis=-1) result = result.T else: @@ -1965,7 +1978,8 @@ def interp_order(length): * fast_percentile_method (boolean): When set to True, uses :func:`numpy.percentile` method as a faster alternative to the :func:`scipy.stats.mstats.mquantiles` method. alphap and - betap are ignored. An exception is raised if the data are masked. + betap are ignored. An exception is raised if the data are masked and the + missing data tolerance is not 0. Defaults to False. **For example**: diff --git a/lib/iris/tests/unit/analysis/test_PERCENTILE.py b/lib/iris/tests/unit/analysis/test_PERCENTILE.py index d56d1ffb1f..85123f5715 100644 --- a/lib/iris/tests/unit/analysis/test_PERCENTILE.py +++ b/lib/iris/tests/unit/analysis/test_PERCENTILE.py @@ -208,12 +208,25 @@ def test_masked(self): shape = (2, 11) data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked - emsg = "Cannot use fast np.percentile method with masked array." + emsg = ( + "Cannot use fast np.percentile method with masked array unless " + "mdtol is 0." + ) with self.assertRaisesRegex(TypeError, emsg): PERCENTILE.aggregate( data, axis=0, percent=50, fast_percentile_method=True ) + def test_masked_mdtol_0(self): + shape = (2, 11) + axis = 0 + percent = 50 + data = ma.arange(np.prod(shape)).reshape(shape) + data[0, ::2] = ma.masked + expected = np.arange(shape[-1]) + 5.5 + expected[0] = ma.masked + self.check_percentile_calc(data, axis, percent, expected, mdtol=0) + @mock.patch("numpy.percentile") def test_numpy_percentile_called(self, mocked_percentile): # Basic check that numpy.percentile is called. @@ -286,10 +299,24 @@ def test_masked(self): actual = PERCENTILE.lazy_aggregate( data, axis=0, percent=50, fast_percentile_method=True ) - emsg = "Cannot use fast np.percentile method with masked array." + emsg = ( + "Cannot use fast np.percentile method with masked array unless " + "mdtol is 0." + ) with self.assertRaisesRegex(TypeError, emsg): as_concrete_data(actual) + def test_masked_mdtol_0(self): + shape = (2, 11) + axis = 0 + percent = 50 + data = ma.arange(np.prod(shape)).reshape(shape) + data[0, ::2] = ma.masked + data = as_lazy_data(data) + expected = np.arange(shape[-1]) + 5.5 + expected[0] = ma.masked + self.check_percentile_calc(data, axis, percent, expected, mdtol=0) + @mock.patch("numpy.percentile", return_value=np.array([2, 4])) def test_numpy_percentile_called(self, mocked_percentile): # Basic check that numpy.percentile is called. From 7865c2c0bf1a81e82701e7544e488963136718ca Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 19 May 2022 15:40:03 +0100 Subject: [PATCH 2/9] What's new --- docs/src/whatsnew/latest.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 14d5d0c924..c0dba82108 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -121,6 +121,7 @@ This document explains the changes made to Iris for this release #. `@wjbenfold`_ added caching to the calculation of the points array in a :class:`~iris.coords.DimCoord` created using :meth:`~iris.coords.DimCoord.from_regular`. (:pull:`4698`) + #. `@wjbenfold`_ introduced caching in :func:`_lazy_data._optimum_chunksize` and :func:`iris.fileformats.pp_load_rules._epoch_date_hours` to reduce time spent repeating calculations. (:pull:`4716`) @@ -128,6 +129,9 @@ This document explains the changes made to Iris for this release #. `@pp-mo`_ made :meth:`~iris.cube.Cube.add_aux_factory` faster. (:pull:`4718`) +#. `@wjbenfold`_ permitted the fast percentile aggregation method to be used on + masked data when the missing data tolerance is set to 0. (:issue:`4735`, + :pull:`4755`) 🔥 Deprecations From 376f8df0f25b964369a083ba014a881514b512a6 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Fri, 20 May 2022 15:27:51 +0100 Subject: [PATCH 3/9] Check arrays like they're masked, and make the expected arrays masked --- lib/iris/tests/unit/analysis/test_PERCENTILE.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/iris/tests/unit/analysis/test_PERCENTILE.py b/lib/iris/tests/unit/analysis/test_PERCENTILE.py index 85123f5715..8a1fba9ffd 100644 --- a/lib/iris/tests/unit/analysis/test_PERCENTILE.py +++ b/lib/iris/tests/unit/analysis/test_PERCENTILE.py @@ -52,9 +52,9 @@ def check_percentile_calc( self.assertFalse(is_lazy) if approx: - self.assertArrayAlmostEqual(actual, expected) + self.assertMaskedArrayAlmostEqual(actual, expected) else: - self.assertArrayEqual(actual, expected) + self.assertMaskedArrayEqual(actual, expected) def test_1d_single(self): data = np.arange(11) @@ -223,7 +223,7 @@ def test_masked_mdtol_0(self): percent = 50 data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked - expected = np.arange(shape[-1]) + 5.5 + expected = ma.arange(shape[-1]) + 5.5 expected[0] = ma.masked self.check_percentile_calc(data, axis, percent, expected, mdtol=0) @@ -313,7 +313,7 @@ def test_masked_mdtol_0(self): data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked data = as_lazy_data(data) - expected = np.arange(shape[-1]) + 5.5 + expected = ma.arange(shape[-1]) + 5.5 expected[0] = ma.masked self.check_percentile_calc(data, axis, percent, expected, mdtol=0) From f409f449b0e1705427c7bc052e57a69c9350e58b Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Fri, 20 May 2022 16:19:44 +0100 Subject: [PATCH 4/9] Bigger arrays to avoid np quirk, bug fix, check masking in 2d_multi --- .../tests/unit/analysis/test_PERCENTILE.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/iris/tests/unit/analysis/test_PERCENTILE.py b/lib/iris/tests/unit/analysis/test_PERCENTILE.py index 8a1fba9ffd..9e3fecb3e3 100644 --- a/lib/iris/tests/unit/analysis/test_PERCENTILE.py +++ b/lib/iris/tests/unit/analysis/test_PERCENTILE.py @@ -32,7 +32,7 @@ def check_percentile_calc( if self.lazy: data = as_lazy_data(data) - expected = np.array(expected) + expected = ma.array(expected) actual = self.agg_method( data, @@ -131,7 +131,7 @@ def test_masked_2d_single(self): def test_masked_2d_multi(self): shape = (3, 10) data = ma.arange(np.prod(shape)).reshape(shape) - data[1] = ma.masked + data[1, ::2] = ma.masked percent = np.array([10, 50, 70, 80]) axis = 0 mdtol = 0.1 @@ -140,10 +140,11 @@ def test_masked_2d_multi(self): # linear interpolation. expected = percent / 100 * 20 # Other columns are first column plus column number. - expected = ( + expected = ma.array( np.broadcast_to(expected, (shape[-1], percent.size)) + np.arange(shape[-1])[:, np.newaxis] ) + expected[::2] = ma.masked self.check_percentile_calc( data, axis, percent, expected, mdtol=mdtol, approx=True @@ -218,13 +219,13 @@ def test_masked(self): ) def test_masked_mdtol_0(self): - shape = (2, 11) + shape = (3, 11) axis = 0 percent = 50 data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked - expected = ma.arange(shape[-1]) + 5.5 - expected[0] = ma.masked + expected = ma.arange(shape[-1]) + 11 + expected[::2] = ma.masked self.check_percentile_calc(data, axis, percent, expected, mdtol=0) @mock.patch("numpy.percentile") @@ -307,14 +308,14 @@ def test_masked(self): as_concrete_data(actual) def test_masked_mdtol_0(self): - shape = (2, 11) + shape = (3, 11) axis = 0 percent = 50 data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked data = as_lazy_data(data) - expected = ma.arange(shape[-1]) + 5.5 - expected[0] = ma.masked + expected = ma.arange(shape[-1]) + 11 + expected[::2] = ma.masked self.check_percentile_calc(data, axis, percent, expected, mdtol=0) @mock.patch("numpy.percentile", return_value=np.array([2, 4])) From 1a0a051169520ec6a4c1f8d2e33805872a912787 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Fri, 20 May 2022 16:30:20 +0100 Subject: [PATCH 5/9] Suppress numpy masked percentile warning --- lib/iris/analysis/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/iris/analysis/__init__.py b/lib/iris/analysis/__init__.py index 1af00fc351..15c25134a0 100644 --- a/lib/iris/analysis/__init__.py +++ b/lib/iris/analysis/__init__.py @@ -39,6 +39,7 @@ from collections.abc import Iterable import functools from functools import wraps +import warnings import dask.array as da import numpy as np @@ -1297,7 +1298,12 @@ def _calc_percentile(data, percent, fast_percentile_method=False, **kwargs): ) if ma.is_masked(data): raise TypeError(msg) - result = np.percentile(data, percent, axis=-1) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Warning: 'partition' will ignore the 'mask' of the MaskedArray.", + ) + result = np.percentile(data, percent, axis=-1) result = result.T else: quantiles = percent / 100.0 From 34719085840786f74976468bec790bd6c5f76437 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Mon, 23 May 2022 09:39:33 +0100 Subject: [PATCH 6/9] Use a larger test array that numpy handles how it handles most of them (the worse case) --- lib/iris/tests/unit/analysis/test_PERCENTILE.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/iris/tests/unit/analysis/test_PERCENTILE.py b/lib/iris/tests/unit/analysis/test_PERCENTILE.py index 9e3fecb3e3..4a529002f4 100644 --- a/lib/iris/tests/unit/analysis/test_PERCENTILE.py +++ b/lib/iris/tests/unit/analysis/test_PERCENTILE.py @@ -206,7 +206,9 @@ def setUp(self): self.agg_method = PERCENTILE.aggregate def test_masked(self): - shape = (2, 11) + # Using (3,11) because np.percentile returns a masked array anyway with + # (2, 11) + shape = (3, 11) data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked emsg = ( @@ -219,6 +221,8 @@ def test_masked(self): ) def test_masked_mdtol_0(self): + # Using (3,11) because np.percentile returns a masked array anyway with + # (2, 11) shape = (3, 11) axis = 0 percent = 50 @@ -293,6 +297,8 @@ def setUp(self): self.agg_method = PERCENTILE.lazy_aggregate def test_masked(self): + # Using (3,11) because np.percentile returns a masked array anyway with + # (2, 11) shape = (2, 11) data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked From 2a42c1891c05313ab07f705317dd30b1c12301fa Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Mon, 23 May 2022 09:47:41 +0100 Subject: [PATCH 7/9] Broadcast the mask in the non-lazy case with added dim --- lib/iris/analysis/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/iris/analysis/__init__.py b/lib/iris/analysis/__init__.py index 15c25134a0..3475cf915a 100644 --- a/lib/iris/analysis/__init__.py +++ b/lib/iris/analysis/__init__.py @@ -593,6 +593,12 @@ def aggregate(self, data, axis, **kwargs): ): fraction_not_missing = data.count(axis=axis) / data.shape[axis] mask_update = 1 - mdtol > fraction_not_missing + if result.ndim > mask_update.ndim: + # call_func created trailing dimension. + mask_update = np.broadcast_to( + mask_update.reshape(mask_update.shape + (1,)), + result.shape, + ) if ma.isMaskedArray(result): result.mask = result.mask | mask_update else: From f0a6a7bbeec575d804e3e4fa2896ddb99e65d5c6 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Mon, 23 May 2022 10:29:32 +0100 Subject: [PATCH 8/9] Bug fix on ndim test --- lib/iris/analysis/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/iris/analysis/__init__.py b/lib/iris/analysis/__init__.py index 3475cf915a..7621cf9916 100644 --- a/lib/iris/analysis/__init__.py +++ b/lib/iris/analysis/__init__.py @@ -593,11 +593,11 @@ def aggregate(self, data, axis, **kwargs): ): fraction_not_missing = data.count(axis=axis) / data.shape[axis] mask_update = 1 - mdtol > fraction_not_missing - if result.ndim > mask_update.ndim: + if np.array(result).ndim > np.array(mask_update).ndim: # call_func created trailing dimension. mask_update = np.broadcast_to( mask_update.reshape(mask_update.shape + (1,)), - result.shape, + np.array(result).shape, ) if ma.isMaskedArray(result): result.mask = result.mask | mask_update From 7af47a14fce0bad6269c234ece83c43b5f0d359f Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Mon, 23 May 2022 15:51:32 +0100 Subject: [PATCH 9/9] Review fixes --- docs/src/whatsnew/latest.rst | 3 +++ lib/iris/analysis/__init__.py | 4 ++-- lib/iris/tests/unit/analysis/test_PERCENTILE.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index c0dba82108..1f9d4d35b4 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -108,6 +108,9 @@ This document explains the changes made to Iris for this release #. `@wjbenfold`_ fixed plotting of circular coordinates to extend kwarg arrays as well as the data. (:issue:`466`, :pull:`4649`) +#. `@wjbenfold`_ corrected the axis on which masking is applied when an + aggregator adds a trailing dimension. (:pull:`4755`) + 💣 Incompatible Changes ======================= diff --git a/lib/iris/analysis/__init__.py b/lib/iris/analysis/__init__.py index 7621cf9916..6d2ee599c8 100644 --- a/lib/iris/analysis/__init__.py +++ b/lib/iris/analysis/__init__.py @@ -592,8 +592,8 @@ def aggregate(self, data, axis, **kwargs): and result is not ma.masked ): fraction_not_missing = data.count(axis=axis) / data.shape[axis] - mask_update = 1 - mdtol > fraction_not_missing - if np.array(result).ndim > np.array(mask_update).ndim: + mask_update = np.array(1 - mdtol > fraction_not_missing) + if np.array(result).ndim > mask_update.ndim: # call_func created trailing dimension. mask_update = np.broadcast_to( mask_update.reshape(mask_update.shape + (1,)), diff --git a/lib/iris/tests/unit/analysis/test_PERCENTILE.py b/lib/iris/tests/unit/analysis/test_PERCENTILE.py index 4a529002f4..a74dd891ba 100644 --- a/lib/iris/tests/unit/analysis/test_PERCENTILE.py +++ b/lib/iris/tests/unit/analysis/test_PERCENTILE.py @@ -297,8 +297,6 @@ def setUp(self): self.agg_method = PERCENTILE.lazy_aggregate def test_masked(self): - # Using (3,11) because np.percentile returns a masked array anyway with - # (2, 11) shape = (2, 11) data = ma.arange(np.prod(shape)).reshape(shape) data[0, ::2] = ma.masked @@ -314,6 +312,8 @@ def test_masked(self): as_concrete_data(actual) def test_masked_mdtol_0(self): + # Using (3,11) because np.percentile returns a masked array anyway with + # (2, 11) shape = (3, 11) axis = 0 percent = 50