From f0c7a25b4ee01830db9e2b4c6a552ae385dc2a20 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 24 Nov 2021 12:05:55 +0000 Subject: [PATCH 1/8] Add failing test for brackets within comments --- .../netcdf/test_parse_cell_methods.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py index 9c4fbf622b..4ed0634869 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py @@ -85,6 +85,22 @@ def test_comment(self): res = parse_cell_methods(cell_method_str) self.assertEqual(res, expected) + def test_comment_brackets(self): + cell_method_strings = [ + "time: minimum within days (comment: 18h(day-1)-18h)" + ] + expected = ( + CellMethod( + method="minimum within days", + coords="time", + intervals="", + comments="18h(day-1)-18h", + ), + ) + for cell_method_str in cell_method_strings: + res = parse_cell_methods(cell_method_str) + self.assertEqual(res, expected) + def test_portions_of_cells(self): cell_method_strings = [ "area: mean where sea_ice over sea", From dcd12ce4e73798659a5136513c1eff488aeef8e6 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 24 Nov 2021 12:08:26 +0000 Subject: [PATCH 2/8] Fix test --- .../tests/unit/fileformats/netcdf/test_parse_cell_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py index 4ed0634869..aefdc8d835 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py @@ -93,7 +93,7 @@ def test_comment_brackets(self): CellMethod( method="minimum within days", coords="time", - intervals="", + intervals=None, comments="18h(day-1)-18h", ), ) From 09bd39917738bcd3467295b8fb44460eacfee169 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 24 Nov 2021 13:37:37 +0000 Subject: [PATCH 3/8] Separate private function now splits the cell method before it's parsed --- lib/iris/fileformats/netcdf.py | 69 +++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 7cab939417..52ceca278d 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -19,6 +19,7 @@ import os.path import re import string +from typing import List import warnings import cf_units @@ -191,7 +192,7 @@ (?P[\w_\s]+(?![\w_]*\s*?:))\s* (?: \(\s* - (?P[^\)]+) + (?P.+) \)\s* )? """, @@ -203,6 +204,70 @@ class UnknownCellMethodWarning(Warning): pass +def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: + """ + Split a CF cell_methods attribute string into a list of zero or more cell + methods, each of which is then parsed with a regex to return a list of match + objects. + + Args: + + * nc_cell_methods: The value of the cell methods attribute to be split. + + Returns: + + * nc_cell_methods_matches: The original string split into strings that each + describe one cell method + + Splitting is done based on colons outside of any brackets. Validation past + being laid out in the expected format is left to the calling function. + """ + + # Find indices of spaces that precede a name: method pair + break_indices = [] + bracket_depth = 0 + for ind, cha in enumerate(nc_cell_methods): + if cha == "(": + bracket_depth += 1 + elif cha == ")": + bracket_depth -= 1 + if bracket_depth < 0: + pass + # TODO: Raise meaningful warning or error to indicate badly formatted cell method + elif cha == ":" and bracket_depth == 0: + candidate_ind = ind - 1 + while nc_cell_methods[candidate_ind] == " " and candidate_ind > 0: + candidate_ind -= 1 + while nc_cell_methods[candidate_ind] != " " and candidate_ind > 0: + candidate_ind -= 1 + if candidate_ind != 0: + break_indices.append(candidate_ind) + + # List tuples of indices of starts and ends of the cell methods in the string + method_indices = [] + if not break_indices: + method_indices.append((0, len(nc_cell_methods))) + else: + method_indices.append((0, break_indices[0])) + for ii in range(len(break_indices) - 1): + method_indices.append( + (break_indices[ii] + 1, break_indices[ii + 1]) + ) + method_indices.append((break_indices[-1] + 1, len(nc_cell_methods))) + + # Index the string and match against each substring + nc_cell_methods_matches = [] + for start_ind, end_ind in method_indices: + nc_cell_method_str = nc_cell_methods[start_ind:end_ind] + nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str) + if not nc_cell_method_match: + pass + # TODO: Raise meaningful warning or error to indicate badly formatted cell method + nc_cell_methods_matches.append(nc_cell_method_match) + + return nc_cell_methods_matches + + def parse_cell_methods(nc_cell_methods): """ Parse a CF cell_methods attribute string into a tuple of zero or @@ -226,7 +291,7 @@ def parse_cell_methods(nc_cell_methods): cell_methods = [] if nc_cell_methods is not None: - for m in _CM_PARSE.finditer(nc_cell_methods): + for m in _split_cell_methods(nc_cell_methods): d = m.groupdict() method = d[_CM_METHOD] method = method.strip() From a0e08608dea0bb10181ffbe37ac8015b2753ab66 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 24 Nov 2021 14:40:32 +0000 Subject: [PATCH 4/8] Add additional failing test for multiple axis methods --- .../netcdf/test_parse_cell_methods.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py index aefdc8d835..c03b9a992c 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py @@ -41,6 +41,20 @@ def test_with_interval(self): res = parse_cell_methods(cell_method_str) self.assertEqual(res, expected) + def test_multiple_axes(self): + cell_method_strings = [ + "lat: lon: standard_deviation", + "lat: lon : standard_deviation", + "lat : lon: standard_deviation", + "lat : lon : standard_deviation", + ] + expected = ( + CellMethod(method="standard_deviation", coords=["lat", "lon"]), + ) + for cell_method_str in cell_method_strings: + res = parse_cell_methods(cell_method_str) + self.assertEqual(res, expected) + def test_multiple(self): cell_method_strings = [ "time: maximum (interval: 1 hr) time: mean (interval: 1 day)", @@ -87,7 +101,8 @@ def test_comment(self): def test_comment_brackets(self): cell_method_strings = [ - "time: minimum within days (comment: 18h(day-1)-18h)" + "time: minimum within days (comment: 18h(day-1)-18h)", + "time : minimum within days (comment: 18h(day-1)-18h)", ] expected = ( CellMethod( From d58ba6be9569c5d153c1ec9bae3272054cd21e55 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 24 Nov 2021 15:08:07 +0000 Subject: [PATCH 5/8] Handle multiple axis cell methods --- lib/iris/fileformats/netcdf.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 52ceca278d..73068963b3 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -186,6 +186,7 @@ _CM_INTERVAL = "interval" _CM_METHOD = "method" _CM_NAME = "name" +_CM_PARSE_NAME = re.compile(r"([\w_]+\s*?:\s+)+") _CM_PARSE = re.compile( r""" (?P([\w_]+\s*?:\s+)+) @@ -223,8 +224,12 @@ def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: being laid out in the expected format is left to the calling function. """ - # Find indices of spaces that precede a name: method pair - break_indices = [] + # Find name candidates + name_start_inds = [] + for m in _CM_PARSE_NAME.finditer(nc_cell_methods): + name_start_inds.append(m.start()) + + # Remove those that fall inside brackets bracket_depth = 0 for ind, cha in enumerate(nc_cell_methods): if cha == "(": @@ -234,32 +239,20 @@ def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: if bracket_depth < 0: pass # TODO: Raise meaningful warning or error to indicate badly formatted cell method - elif cha == ":" and bracket_depth == 0: - candidate_ind = ind - 1 - while nc_cell_methods[candidate_ind] == " " and candidate_ind > 0: - candidate_ind -= 1 - while nc_cell_methods[candidate_ind] != " " and candidate_ind > 0: - candidate_ind -= 1 - if candidate_ind != 0: - break_indices.append(candidate_ind) + if bracket_depth > 0 and ind in name_start_inds: + name_start_inds.remove(ind) # List tuples of indices of starts and ends of the cell methods in the string method_indices = [] - if not break_indices: - method_indices.append((0, len(nc_cell_methods))) - else: - method_indices.append((0, break_indices[0])) - for ii in range(len(break_indices) - 1): - method_indices.append( - (break_indices[ii] + 1, break_indices[ii + 1]) - ) - method_indices.append((break_indices[-1] + 1, len(nc_cell_methods))) + for ii in range(len(name_start_inds) - 1): + method_indices.append((name_start_inds[ii], name_start_inds[ii + 1])) + method_indices.append((name_start_inds[-1], len(nc_cell_methods))) # Index the string and match against each substring nc_cell_methods_matches = [] for start_ind, end_ind in method_indices: nc_cell_method_str = nc_cell_methods[start_ind:end_ind] - nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str) + nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str.strip()) if not nc_cell_method_match: pass # TODO: Raise meaningful warning or error to indicate badly formatted cell method From 53b54b8434d80c903bcfa2be3f8f2ba3b5e09fb6 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 24 Nov 2021 15:16:06 +0000 Subject: [PATCH 6/8] Update docstring to reflect eventual choices --- lib/iris/fileformats/netcdf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 73068963b3..bb72c4475a 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -217,11 +217,12 @@ def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: Returns: - * nc_cell_methods_matches: The original string split into strings that each - describe one cell method + * nc_cell_methods_matches: A list of the re.Match objects associated with + each parsed cell method - Splitting is done based on colons outside of any brackets. Validation past - being laid out in the expected format is left to the calling function. + Splitting is done based on words followed by colons outside of any brackets. + Validation of anything other than being laid out in the expected format is + left to the calling function. """ # Find name candidates From 3877ed7da3a28c20edc82178b4fa6319a920242f Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Mon, 29 Nov 2021 10:21:42 +0000 Subject: [PATCH 7/8] Add warnings to function, and test them --- lib/iris/fileformats/netcdf.py | 14 ++++++--- .../netcdf/test_parse_cell_methods.py | 30 +++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index bb72c4475a..7a0e4e655d 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -238,8 +238,11 @@ def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: elif cha == ")": bracket_depth -= 1 if bracket_depth < 0: - pass - # TODO: Raise meaningful warning or error to indicate badly formatted cell method + msg = ( + "Cell methods may be incorrectly parsed due to mismatched " + "brackets" + ) + warnings.warn(msg, UserWarning, stacklevel=2) if bracket_depth > 0 and ind in name_start_inds: name_start_inds.remove(ind) @@ -255,8 +258,11 @@ def _split_cell_methods(nc_cell_methods: str) -> List[re.Match]: nc_cell_method_str = nc_cell_methods[start_ind:end_ind] nc_cell_method_match = _CM_PARSE.match(nc_cell_method_str.strip()) if not nc_cell_method_match: - pass - # TODO: Raise meaningful warning or error to indicate badly formatted cell method + msg = ( + f"Failed to fully parse cell method string: {nc_cell_methods}" + ) + warnings.warn(msg, UserWarning, stacklevel=2) + continue nc_cell_methods_matches.append(nc_cell_method_match) return nc_cell_methods_matches diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py index c03b9a992c..ff34b303e0 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py @@ -116,6 +116,36 @@ def test_comment_brackets(self): res = parse_cell_methods(cell_method_str) self.assertEqual(res, expected) + def test_comment_bracket_mismatch_warning(self): + cell_method_strings = [ + "time: minimum within days (comment: 18h day-1)-18h)", + "time : minimum within days (comment: 18h day-1)-18h)", + ] + for cell_method_str in cell_method_strings: + with mock.patch("warnings.warn") as warn: + _ = parse_cell_methods(cell_method_str) + self.assertIn( + "Cell methods may be incorrectly parsed due to mismatched brackets", + warn.call_args[0][0], + ) + + def test_badly_formatted_warning(self): + cell_method_strings = [ + # "time: maximum (interval: 1 hr comment: first bit " + # "time: mean (interval: 1 day comment: second bit)", + "time: (interval: 1 hr comment: first bit) " + "time: mean (interval: 1 day comment: second bit)", + "time: maximum (interval: 1 hr comment: first bit) " + "time: (interval: 1 day comment: second bit)", + ] + for cell_method_str in cell_method_strings: + with mock.patch("warnings.warn") as warn: + _ = parse_cell_methods(cell_method_str) + self.assertIn( + f"Failed to fully parse cell method string: {cell_method_str}", + warn.call_args[0][0], + ) + def test_portions_of_cells(self): cell_method_strings = [ "area: mean where sea_ice over sea", From d70bdae747304212538bfde362f2f09095db9d97 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Fri, 3 Dec 2021 11:44:59 +0000 Subject: [PATCH 8/8] Significantly simpler warning checks --- .../netcdf/test_parse_cell_methods.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py index ff34b303e0..bbde2d0a2d 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_parse_cell_methods.py @@ -122,12 +122,11 @@ def test_comment_bracket_mismatch_warning(self): "time : minimum within days (comment: 18h day-1)-18h)", ] for cell_method_str in cell_method_strings: - with mock.patch("warnings.warn") as warn: + with self.assertWarns( + UserWarning, + msg="Cell methods may be incorrectly parsed due to mismatched brackets", + ): _ = parse_cell_methods(cell_method_str) - self.assertIn( - "Cell methods may be incorrectly parsed due to mismatched brackets", - warn.call_args[0][0], - ) def test_badly_formatted_warning(self): cell_method_strings = [ @@ -139,12 +138,11 @@ def test_badly_formatted_warning(self): "time: (interval: 1 day comment: second bit)", ] for cell_method_str in cell_method_strings: - with mock.patch("warnings.warn") as warn: + with self.assertWarns( + UserWarning, + msg=f"Failed to fully parse cell method string: {cell_method_str}", + ): _ = parse_cell_methods(cell_method_str) - self.assertIn( - f"Failed to fully parse cell method string: {cell_method_str}", - warn.call_args[0][0], - ) def test_portions_of_cells(self): cell_method_strings = [