def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]:
    """
    Split a CF cell_methods attribute string into a list of zero or more cell
    methods, each of which is then parsed with a regex to return a list of match
    objects.

    Args:

    * nc_cell_methods: The value of the cell methods attribute to be split.

    Returns:

    * nc_cell_methods_matches: A list of the re.Match objects associated with
      each parsed cell method. The list is empty when no cell method name is
      found outside of brackets (for example, for an empty string).

    Splitting is done based on words followed by colons outside of any brackets.
    Validation of anything other than being laid out in the expected format is
    left to the calling function.

    A UserWarning is issued when a closing bracket has no matching opening
    bracket, and when a split-out substring does not match the cell method
    pattern; such substrings are omitted from the result.

    """
    # Name candidates: every position where a run of "word:" tokens begins.
    # A set gives constant-time membership tests in the scan below.
    candidate_starts = {
        m.start() for m in _CM_PARSE_NAME.finditer(nc_cell_methods)
    }

    # Keep only the candidates that lie outside brackets. Candidates start on
    # a word character, never on a bracket, so the depth seen at a candidate
    # index is the depth of the text surrounding it.
    name_start_inds = []
    bracket_depth = 0
    for ind, cha in enumerate(nc_cell_methods):
        if cha == "(":
            bracket_depth += 1
        elif cha == ")":
            bracket_depth -= 1
            if bracket_depth < 0:
                msg = (
                    "Cell methods may be incorrectly parsed due to mismatched "
                    "brackets"
                )
                warnings.warn(msg, UserWarning, stacklevel=2)
        if bracket_depth <= 0 and ind in candidate_starts:
            name_start_inds.append(ind)

    # Each cell method runs from its own name up to the next name, with the
    # last one running to the end of the string. Pairing via zip yields
    # nothing when there are no names at all, so an empty or name-less string
    # gives an empty result instead of an IndexError.
    end_inds = name_start_inds[1:] + [len(nc_cell_methods)]

    # Index the string and match against each substring.
    nc_cell_methods_matches = []
    for start_ind, end_ind in zip(name_start_inds, end_inds):
        nc_cell_method_str = nc_cell_methods[start_ind:end_ind]
        nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str.strip())
        if not nc_cell_method_match:
            msg = (
                f"Failed to fully parse cell method string: {nc_cell_methods}"
            )
            warnings.warn(msg, UserWarning, stacklevel=2)
            continue
        nc_cell_methods_matches.append(nc_cell_method_match)

    return nc_cell_methods_matches
def test_multiple_axes(self):
    # Several axis names may precede one method, with or without whitespace
    # before each colon; all spellings must give the same single CellMethod.
    cell_method_strings = [
        "lat: lon: standard_deviation",
        "lat: lon : standard_deviation",
        "lat : lon: standard_deviation",
        "lat : lon : standard_deviation",
    ]
    expected = (
        CellMethod(method="standard_deviation", coords=["lat", "lon"]),
    )
    for cell_method_str in cell_method_strings:
        res = parse_cell_methods(cell_method_str)
        self.assertEqual(res, expected)

def test_comment_brackets(self):
    # A comment may itself contain balanced brackets.
    cell_method_strings = [
        "time: minimum within days (comment: 18h(day-1)-18h)",
        "time : minimum within days (comment: 18h(day-1)-18h)",
    ]
    expected = (
        CellMethod(
            method="minimum within days",
            coords="time",
            intervals=None,
            comments="18h(day-1)-18h",
        ),
    )
    for cell_method_str in cell_method_strings:
        res = parse_cell_methods(cell_method_str)
        self.assertEqual(res, expected)

def test_comment_bracket_mismatch_warning(self):
    # An unmatched closing bracket must trigger the mismatched-brackets
    # warning.
    cell_method_strings = [
        "time: minimum within days (comment: 18h day-1)-18h)",
        "time : minimum within days (comment: 18h day-1)-18h)",
    ]
    # assertWarnsRegex checks the warning text itself; the ``msg`` keyword
    # of assertWarns is only the failure message and checks nothing.
    expected_msg = (
        "Cell methods may be incorrectly parsed due to mismatched brackets"
    )
    for cell_method_str in cell_method_strings:
        with self.assertWarnsRegex(UserWarning, expected_msg):
            _ = parse_cell_methods(cell_method_str)

def test_badly_formatted_warning(self):
    # Imported locally so this test does not rely on the module's imports.
    import re

    # NOTE(review): an input with an unclosed bracket was previously
    # commented out of this list; presumably it does not trigger this
    # warning - confirm before adding such a case.
    cell_method_strings = [
        "time: (interval: 1 hr comment: first bit) "
        "time: mean (interval: 1 day comment: second bit)",
        "time: maximum (interval: 1 hr comment: first bit) "
        "time: (interval: 1 day comment: second bit)",
    ]
    for cell_method_str in cell_method_strings:
        # The message embeds the input, which contains brackets, so it must
        # be escaped before being used as a regular expression.
        expected_msg = re.escape(
            f"Failed to fully parse cell method string: {cell_method_str}"
        )
        with self.assertWarnsRegex(UserWarning, expected_msg):
            _ = parse_cell_methods(cell_method_str)