Skip to content

Commit 6b1d392

Browse files
authored
Permit fast percentile with mdtol=0 (#4755)
* Permit fast percentile with mdtol=0 * What's new * Check arrays like they're masked, and make the expected arrays masked * Bigger arrays to avoid np quirk, bug fix, check masking in 2d_multi * Suppress numpy masked percentile warning * Use a larger test array that numpy handles how it handles most of them (the worse case) * Broadcast the mask in the non-lazy case with added dim * Bug fix on ndim test * Review fixes
1 parent 508aa2b commit 6b1d392

File tree

3 files changed

+93
-26
lines changed

3 files changed

+93
-26
lines changed

docs/src/whatsnew/latest.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ This document explains the changes made to Iris for this release
108108
#. `@wjbenfold`_ fixed plotting of circular coordinates to extend kwarg arrays
109109
as well as the data. (:issue:`466`, :pull:`4649`)
110110

111+
#. `@wjbenfold`_ corrected the axis on which masking is applied when an
112+
aggregator adds a trailing dimension. (:pull:`4755`)
113+
111114

112115
💣 Incompatible Changes
113116
=======================
@@ -121,13 +124,17 @@ This document explains the changes made to Iris for this release
121124
#. `@wjbenfold`_ added caching to the calculation of the points array in a
122125
:class:`~iris.coords.DimCoord` created using
123126
:meth:`~iris.coords.DimCoord.from_regular`. (:pull:`4698`)
127+
124128
#. `@wjbenfold`_ introduced caching in :func:`_lazy_data._optimum_chunksize` and
125129
:func:`iris.fileformats.pp_load_rules._epoch_date_hours` to reduce time spent
126130
repeating calculations. (:pull:`4716`)
127131

128132
#. `@pp-mo`_ made :meth:`~iris.cube.Cube.add_aux_factory` faster.
129133
(:pull:`4718`)
130134

135+
#. `@wjbenfold`_ permitted the fast percentile aggregation method to be used on
136+
masked data when the missing data tolerance is set to 0. (:issue:`4735`,
137+
:pull:`4755`)
131138

132139

133140
🔥 Deprecations

lib/iris/analysis/__init__.py

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from collections.abc import Iterable
4040
import functools
4141
from functools import wraps
42+
import warnings
4243

4344
import dask.array as da
4445
import numpy as np
@@ -591,7 +592,13 @@ def aggregate(self, data, axis, **kwargs):
591592
and result is not ma.masked
592593
):
593594
fraction_not_missing = data.count(axis=axis) / data.shape[axis]
594-
mask_update = 1 - mdtol > fraction_not_missing
595+
mask_update = np.array(1 - mdtol > fraction_not_missing)
596+
if np.array(result).ndim > mask_update.ndim:
597+
# call_func created trailing dimension.
598+
mask_update = np.broadcast_to(
599+
mask_update.reshape(mask_update.shape + (1,)),
600+
np.array(result).shape,
601+
)
595602
if ma.isMaskedArray(result):
596603
result.mask = result.mask | mask_update
597604
else:
@@ -720,6 +727,25 @@ def __init__(self, units_func=None, **kwargs):
720727
**kwargs,
721728
)
722729

730+
def _base_aggregate(self, data, axis, lazy, **kwargs):
731+
"""
732+
Method to avoid duplication of checks in aggregate and lazy_aggregate.
733+
"""
734+
msg = "{} aggregator requires the mandatory keyword argument {!r}."
735+
for arg in self._args:
736+
if arg not in kwargs:
737+
raise ValueError(msg.format(self.name(), arg))
738+
739+
if kwargs.get("fast_percentile_method", False) and (
740+
kwargs.get("mdtol", 1) != 0
741+
):
742+
kwargs["error_on_masked"] = True
743+
744+
if lazy:
745+
return _Aggregator.lazy_aggregate(self, data, axis, **kwargs)
746+
else:
747+
return _Aggregator.aggregate(self, data, axis, **kwargs)
748+
723749
def aggregate(self, data, axis, **kwargs):
724750
"""
725751
Perform the percentile aggregation over the given data.
@@ -755,12 +781,7 @@ def aggregate(self, data, axis, **kwargs):
755781
756782
"""
757783

758-
msg = "{} aggregator requires the mandatory keyword argument {!r}."
759-
for arg in self._args:
760-
if arg not in kwargs:
761-
raise ValueError(msg.format(self.name(), arg))
762-
763-
return _Aggregator.aggregate(self, data, axis, **kwargs)
784+
return self._base_aggregate(data, axis, lazy=False, **kwargs)
764785

765786
def lazy_aggregate(self, data, axis, **kwargs):
766787
"""
@@ -794,12 +815,7 @@ def lazy_aggregate(self, data, axis, **kwargs):
794815
795816
"""
796817

797-
msg = "{} aggregator requires the mandatory keyword argument {!r}."
798-
for arg in self._args:
799-
if arg not in kwargs:
800-
raise ValueError(msg.format(self.name(), arg))
801-
802-
return _Aggregator.lazy_aggregate(self, data, axis, **kwargs)
818+
return self._base_aggregate(data, axis, lazy=True, **kwargs)
803819

804820
def post_process(self, collapsed_cube, data_result, coords, **kwargs):
805821
"""
@@ -1281,10 +1297,19 @@ def _calc_percentile(data, percent, fast_percentile_method=False, **kwargs):
12811297
12821298
"""
12831299
if fast_percentile_method:
1284-
msg = "Cannot use fast np.percentile method with masked array."
1285-
if ma.is_masked(data):
1286-
raise TypeError(msg)
1287-
result = np.percentile(data, percent, axis=-1)
1300+
if kwargs.pop("error_on_masked", False):
1301+
msg = (
1302+
"Cannot use fast np.percentile method with masked array unless"
1303+
" mdtol is 0."
1304+
)
1305+
if ma.is_masked(data):
1306+
raise TypeError(msg)
1307+
with warnings.catch_warnings():
1308+
warnings.filterwarnings(
1309+
"ignore",
1310+
"Warning: 'partition' will ignore the 'mask' of the MaskedArray.",
1311+
)
1312+
result = np.percentile(data, percent, axis=-1)
12881313
result = result.T
12891314
else:
12901315
quantiles = percent / 100.0
@@ -1965,7 +1990,8 @@ def interp_order(length):
19651990
* fast_percentile_method (boolean):
19661991
When set to True, uses :func:`numpy.percentile` method as a faster
19671992
alternative to the :func:`scipy.stats.mstats.mquantiles` method. alphap and
1968-
betap are ignored. An exception is raised if the data are masked.
1993+
betap are ignored. An exception is raised if the data are masked and the
1994+
missing data tolerance is not 0.
19691995
Defaults to False.
19701996
19711997
**For example**:

lib/iris/tests/unit/analysis/test_PERCENTILE.py

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def check_percentile_calc(
3232
if self.lazy:
3333
data = as_lazy_data(data)
3434

35-
expected = np.array(expected)
35+
expected = ma.array(expected)
3636

3737
actual = self.agg_method(
3838
data,
@@ -52,9 +52,9 @@ def check_percentile_calc(
5252
self.assertFalse(is_lazy)
5353

5454
if approx:
55-
self.assertArrayAlmostEqual(actual, expected)
55+
self.assertMaskedArrayAlmostEqual(actual, expected)
5656
else:
57-
self.assertArrayEqual(actual, expected)
57+
self.assertMaskedArrayEqual(actual, expected)
5858

5959
def test_1d_single(self):
6060
data = np.arange(11)
@@ -131,7 +131,7 @@ def test_masked_2d_single(self):
131131
def test_masked_2d_multi(self):
132132
shape = (3, 10)
133133
data = ma.arange(np.prod(shape)).reshape(shape)
134-
data[1] = ma.masked
134+
data[1, ::2] = ma.masked
135135
percent = np.array([10, 50, 70, 80])
136136
axis = 0
137137
mdtol = 0.1
@@ -140,10 +140,11 @@ def test_masked_2d_multi(self):
140140
# linear interpolation.
141141
expected = percent / 100 * 20
142142
# Other columns are first column plus column number.
143-
expected = (
143+
expected = ma.array(
144144
np.broadcast_to(expected, (shape[-1], percent.size))
145145
+ np.arange(shape[-1])[:, np.newaxis]
146146
)
147+
expected[::2] = ma.masked
147148

148149
self.check_percentile_calc(
149150
data, axis, percent, expected, mdtol=mdtol, approx=True
@@ -205,15 +206,32 @@ def setUp(self):
205206
self.agg_method = PERCENTILE.aggregate
206207

207208
def test_masked(self):
208-
shape = (2, 11)
209+
# Using (3,11) because np.percentile returns a masked array anyway with
210+
# (2, 11)
211+
shape = (3, 11)
209212
data = ma.arange(np.prod(shape)).reshape(shape)
210213
data[0, ::2] = ma.masked
211-
emsg = "Cannot use fast np.percentile method with masked array."
214+
emsg = (
215+
"Cannot use fast np.percentile method with masked array unless "
216+
"mdtol is 0."
217+
)
212218
with self.assertRaisesRegex(TypeError, emsg):
213219
PERCENTILE.aggregate(
214220
data, axis=0, percent=50, fast_percentile_method=True
215221
)
216222

223+
def test_masked_mdtol_0(self):
224+
# Using (3,11) because np.percentile returns a masked array anyway with
225+
# (2, 11)
226+
shape = (3, 11)
227+
axis = 0
228+
percent = 50
229+
data = ma.arange(np.prod(shape)).reshape(shape)
230+
data[0, ::2] = ma.masked
231+
expected = ma.arange(shape[-1]) + 11
232+
expected[::2] = ma.masked
233+
self.check_percentile_calc(data, axis, percent, expected, mdtol=0)
234+
217235
@mock.patch("numpy.percentile")
218236
def test_numpy_percentile_called(self, mocked_percentile):
219237
# Basic check that numpy.percentile is called.
@@ -286,10 +304,26 @@ def test_masked(self):
286304
actual = PERCENTILE.lazy_aggregate(
287305
data, axis=0, percent=50, fast_percentile_method=True
288306
)
289-
emsg = "Cannot use fast np.percentile method with masked array."
307+
emsg = (
308+
"Cannot use fast np.percentile method with masked array unless "
309+
"mdtol is 0."
310+
)
290311
with self.assertRaisesRegex(TypeError, emsg):
291312
as_concrete_data(actual)
292313

314+
def test_masked_mdtol_0(self):
315+
# Using (3,11) because np.percentile returns a masked array anyway with
316+
# (2, 11)
317+
shape = (3, 11)
318+
axis = 0
319+
percent = 50
320+
data = ma.arange(np.prod(shape)).reshape(shape)
321+
data[0, ::2] = ma.masked
322+
data = as_lazy_data(data)
323+
expected = ma.arange(shape[-1]) + 11
324+
expected[::2] = ma.masked
325+
self.check_percentile_calc(data, axis, percent, expected, mdtol=0)
326+
293327
@mock.patch("numpy.percentile", return_value=np.array([2, 4]))
294328
def test_numpy_percentile_called(self, mocked_percentile):
295329
# Basic check that numpy.percentile is called.

0 commit comments

Comments
 (0)