From 20ced703ce27ac085812680ab8ee889cf7a2c7ce Mon Sep 17 00:00:00 2001 From: martonvago Date: Mon, 13 Oct 2025 13:57:41 +0100 Subject: [PATCH 1/5] feat: :sparkles: add RequiredRule --- _quarto.yml | 1 + src/check_datapackage/__init__.py | 3 +- src/check_datapackage/rule.py | 110 ++++++++++++++++++++++++------ tests/test_rule.py | 59 +++++++++++++++- 4 files changed, 149 insertions(+), 24 deletions(-) diff --git a/_quarto.yml b/_quarto.yml index 7ffc3f86..22322130 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -70,6 +70,7 @@ quartodoc: - name: Config - name: Exclude - name: Rule + - name: RequiredRule - name: Issue - name: example_package_descriptor - name: example_resource_descriptor diff --git a/src/check_datapackage/__init__.py b/src/check_datapackage/__init__.py index 2b8b95ef..73a7e306 100644 --- a/src/check_datapackage/__init__.py +++ b/src/check_datapackage/__init__.py @@ -6,13 +6,14 @@ from .exclude import Exclude from .issue import Issue from .read_json import read_json -from .rule import Rule +from .rule import RequiredRule, Rule __all__ = [ "Config", "Exclude", "Issue", "Rule", + "RequiredRule", "example_package_descriptor", "example_resource_descriptor", "check", diff --git a/src/check_datapackage/rule.py b/src/check_datapackage/rule.py index 76898bd9..c3a974e7 100644 --- a/src/check_datapackage/rule.py +++ b/src/check_datapackage/rule.py @@ -1,7 +1,9 @@ +import re from dataclasses import dataclass from typing import Any, Callable from check_datapackage.internals import ( + DescriptorField, _filter, _flat_map, _get_fields_at_jsonpath, @@ -10,7 +12,7 @@ from check_datapackage.issue import Issue -@dataclass +@dataclass(frozen=True) class Rule: """A custom check to be done on a Data Package descriptor. @@ -43,34 +45,84 @@ class Rule: check: Callable[[Any], bool] type: str = "custom" + def apply(self, descriptor: dict[str, Any]) -> list[Issue]: + """Checks the descriptor against this rule and creates issues on failure. -def apply_rules(rules: list[Rule], descriptor: dict[str, Any]) -> list[Issue]: - """Checks the descriptor for all rules and creates issues for fields that fail. + Args: + descriptor: The descriptor to check. - Args: - rules: The rules to apply to the descriptor. - descriptor: The descriptor to check. + Returns: + A list of `Issue`s. + """ + matching_fields = _get_fields_at_jsonpath(self.jsonpath, descriptor) + return _get_issues(self, matching_fields) - Returns: - A list of `Issue`s. + +class RequiredRule(Rule): + """A rule that checks that a field is present (i.e. not None). + + Attributes: + jsonpath (str): The location of the field or fields, expressed in [JSON + path](https://jg-rp.github.io/python-jsonpath/syntax/) notation, to which + the rule applies (e.g., `$.resources[*].name`). + message (str): The message that is shown when the rule is violated. + + Examples: + ```{python} + import check_datapackage as cdp + required_title_rule = cdp.RequiredRule( + jsonpath="$.title", + message="A title is required.", + ) + ``` """ - return _flat_map( - rules, - lambda rule: _apply_rule(rule, descriptor), - ) + _field_name: str -def _apply_rule(rule: Rule, descriptor: dict[str, Any]) -> list[Issue]: - """Checks the descriptor against the rule and creates issues for fields that fail. + def __init__(self, jsonpath: str, message: str): + """Initializes the `RequiredRule`.""" + field_name_match = re.search(r"\.(\w+)$", jsonpath) + if not field_name_match: + raise ValueError( + "A `RequiredRule` must point to an object field that is not an array" + " item, e.g., `$.title` or `$.resources[*].name`." + ) - Args: - rule: The rule to apply to the descriptor. - descriptor: The descriptor to check. + self._field_name = field_name_match.group(1) + super().__init__( + jsonpath=jsonpath, + message=message, + check=lambda value: value is not None, + type="required", + ) - Returns: - A list of `Issue`s. - """ - matching_fields = _get_fields_at_jsonpath(rule.jsonpath, descriptor) + def apply(self, descriptor: dict[str, Any]) -> list[Issue]: + """Checks the descriptor against this rule and creates issues on failure. + + Args: + descriptor: The descriptor to check. + + Returns: + A list of `Issue`s. + """ + matching_fields = _get_fields_at_jsonpath(self.jsonpath, descriptor) + matching_paths = _map(matching_fields, lambda field: field.jsonpath) + parent_path = self.jsonpath.rstrip(f".{self._field_name}") + matching_parents = _get_fields_at_jsonpath(parent_path, descriptor) + parent_paths = _map( + matching_parents, lambda parent: f"{parent.jsonpath}.{self._field_name}" + ) + missing_paths = _filter(parent_paths, lambda path: path not in matching_paths) + missing_fields = _map( + missing_paths, + lambda path: DescriptorField(jsonpath=path, value=None), + ) + + return _get_issues(self, matching_fields + missing_fields) + + +def _get_issues(rule: Rule, matching_fields: list[DescriptorField]) -> list[Issue]: + """Checks matching fields against the rule and creates issues on failure.""" failed_fields = _filter(matching_fields, lambda field: not rule.check(field.value)) return _map( failed_fields, @@ -78,3 +130,19 @@ def _apply_rule(rule: Rule, descriptor: dict[str, Any]) -> list[Issue]: jsonpath=field.jsonpath, type=rule.type, message=rule.message ), ) + + +def apply_rules(rules: list[Rule], descriptor: dict[str, Any]) -> list[Issue]: + """Checks the descriptor for all rules and creates issues for fields that fail. + + Args: + rules: The rules to apply to the descriptor. + descriptor: The descriptor to check. + + Returns: + A list of `Issue`s. + """ + return _flat_map( + rules, + lambda rule: rule.apply(descriptor), + ) diff --git a/tests/test_rule.py b/tests/test_rule.py index f8330fd0..af80b7cd 100644 --- a/tests/test_rule.py +++ b/tests/test_rule.py @@ -1,3 +1,5 @@ +from pytest import raises + from check_datapackage.check import check from check_datapackage.config import Config from check_datapackage.examples import ( @@ -5,7 +7,7 @@ example_resource_descriptor, ) from check_datapackage.issue import Issue -from check_datapackage.rule import Rule +from check_datapackage.rule import RequiredRule, Rule lowercase_rule = Rule( jsonpath="$.name", @@ -57,8 +59,20 @@ def test_multiple_rules(): descriptor = example_package_descriptor() descriptor["name"] = "ALLCAPS" descriptor["resources"][0]["name"] = "not starting with woolly" + del descriptor["version"] + + version_rule = RequiredRule( + jsonpath="$.version", + message="Version is required.", + ) - config = Config(rules=[lowercase_rule, resource_name_rule]) + config = Config( + rules=[ + lowercase_rule, + resource_name_rule, + version_rule, + ] + ) issues = check(descriptor, config=config) assert issues == [ @@ -72,6 +86,11 @@ def test_multiple_rules(): type=resource_name_rule.type, message=resource_name_rule.message, ), + Issue( + jsonpath=version_rule.jsonpath, + type="required", + message=version_rule.message, + ), ] @@ -97,3 +116,39 @@ def test_no_matching_jsonpath(): issues = check(descriptor, config=config) assert issues == [] + + +def test_required_rule_indirect_jsonpath(): + descriptor = example_package_descriptor() + descriptor["contributors"] = [ + {"path": "a/path"}, + {"path": "a/path"}, + {"path": "a/path", "name": "a name"}, + ] + rule = RequiredRule( + jsonpath="$.contributors[*].name", + message="Contributor name is required.", + ) + config = Config(rules=[rule]) + issues = check(descriptor, config=config) + + assert issues == [ + Issue( + jsonpath="$.contributors[0].name", + type=rule.type, + message=rule.message, + ), + Issue( + jsonpath="$.contributors[1].name", + type=rule.type, + message=rule.message, + ), + ] + + +def test_required_rule_cannot_apply_to_array_item(): + with raises(ValueError): + RequiredRule( + jsonpath="$.resources[*]", + message="This should fail.", + ) From 520f04a8eeb5445b627670246f57d3a62171013e Mon Sep 17 00:00:00 2001 From: martonvago Date: Tue, 14 Oct 2025 10:15:47 +0100 Subject: [PATCH 2/5] refactor: :recycle: simplify apply logic --- src/check_datapackage/internals.py | 6 ++++ src/check_datapackage/rule.py | 47 +++++++++++++++--------------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/check_datapackage/internals.py b/src/check_datapackage/internals.py index 1bfba88c..cbd7acdf 100644 --- a/src/check_datapackage/internals.py +++ b/src/check_datapackage/internals.py @@ -68,6 +68,12 @@ def _get_fields_at_jsonpath( return _map(matches, _create_descriptor_field) +def _get_direct_jsonpaths(jsonpath: str, descriptor: dict[str, Any]) -> list[str]: + """Returns all direct JSON paths that match a direct or indirect JSON path.""" + fields = _get_fields_at_jsonpath(jsonpath, descriptor) + return _map(fields, lambda field: field.jsonpath) + + def _create_descriptor_field(match: JSONPathMatch) -> DescriptorField: return DescriptorField( jsonpath=match.path.replace("['", ".").replace("']", ""), diff --git a/src/check_datapackage/rule.py b/src/check_datapackage/rule.py index c3a974e7..0e749f64 100644 --- a/src/check_datapackage/rule.py +++ b/src/check_datapackage/rule.py @@ -6,6 +6,7 @@ DescriptorField, _filter, _flat_map, + _get_direct_jsonpaths, _get_fields_at_jsonpath, _map, ) @@ -55,7 +56,15 @@ def apply(self, descriptor: dict[str, Any]) -> list[Issue]: A list of `Issue`s. """ matching_fields = _get_fields_at_jsonpath(self.jsonpath, descriptor) - return _get_issues(self, matching_fields) + failed_fields = _filter( + matching_fields, lambda field: not self.check(field.value) + ) + return _map( + failed_fields, + lambda field: Issue( + jsonpath=field.jsonpath, type=self.type, message=self.message + ), + ) class RequiredRule(Rule): @@ -81,7 +90,7 @@ class RequiredRule(Rule): def __init__(self, jsonpath: str, message: str): """Initializes the `RequiredRule`.""" - field_name_match = re.search(r"\.(\w+)$", jsonpath) + field_name_match = re.search(r"(\.\w+)$", jsonpath) if not field_name_match: raise ValueError( "A `RequiredRule` must point to an object field that is not an array" @@ -105,32 +114,22 @@ def apply(self, descriptor: dict[str, Any]) -> list[Issue]: Returns: A list of `Issue`s. """ - matching_fields = _get_fields_at_jsonpath(self.jsonpath, descriptor) - matching_paths = _map(matching_fields, lambda field: field.jsonpath) - parent_path = self.jsonpath.rstrip(f".{self._field_name}") - matching_parents = _get_fields_at_jsonpath(parent_path, descriptor) - parent_paths = _map( - matching_parents, lambda parent: f"{parent.jsonpath}.{self._field_name}" + matching_paths = _get_direct_jsonpaths(self.jsonpath, descriptor) + indirect_parent_path = self.jsonpath.rstrip(self._field_name) + direct_parent_paths = _get_direct_jsonpaths(indirect_parent_path, descriptor) + missing_paths = _filter( + direct_parent_paths, + lambda path: f"{path}{self._field_name}" not in matching_paths, ) - missing_paths = _filter(parent_paths, lambda path: path not in matching_paths) - missing_fields = _map( + return _map( missing_paths, - lambda path: DescriptorField(jsonpath=path, value=None), + lambda path: Issue( + jsonpath=path + self._field_name, + type=self.type, + message=self.message, + ), ) - return _get_issues(self, matching_fields + missing_fields) - - -def _get_issues(rule: Rule, matching_fields: list[DescriptorField]) -> list[Issue]: - """Checks matching fields against the rule and creates issues on failure.""" - failed_fields = _filter(matching_fields, lambda field: not rule.check(field.value)) - return _map( - failed_fields, - lambda field: Issue( - jsonpath=field.jsonpath, type=rule.type, message=rule.message - ), - ) - def apply_rules(rules: list[Rule], descriptor: dict[str, Any]) -> list[Issue]: """Checks the descriptor for all rules and creates issues for fields that fail. From 3b8a7c3be8ee3461f54c63db08398e57a4c57276 Mon Sep 17 00:00:00 2001 From: martonvago Date: Tue, 14 Oct 2025 10:26:34 +0100 Subject: [PATCH 3/5] fix: :bug: remove unused import --- src/check_datapackage/rule.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/check_datapackage/rule.py b/src/check_datapackage/rule.py index 0e749f64..af93e6e6 100644 --- a/src/check_datapackage/rule.py +++ b/src/check_datapackage/rule.py @@ -3,7 +3,6 @@ from typing import Any, Callable from check_datapackage.internals import ( - DescriptorField, _filter, _flat_map, _get_direct_jsonpaths, From e0f3ac9b656f980d3cc2c8a4f351387a1284edfc Mon Sep 17 00:00:00 2001 From: martonvago Date: Tue, 14 Oct 2025 12:18:23 +0100 Subject: [PATCH 4/5] fix: :bug: disallow all ambiguous JSON paths --- src/check_datapackage/rule.py | 11 +++++++---- tests/test_rule.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/check_datapackage/rule.py b/src/check_datapackage/rule.py index af93e6e6..ae558263 100644 --- a/src/check_datapackage/rule.py +++ b/src/check_datapackage/rule.py @@ -89,11 +89,14 @@ class RequiredRule(Rule): def __init__(self, jsonpath: str, message: str): """Initializes the `RequiredRule`.""" - field_name_match = re.search(r"(\.\w+)$", jsonpath) + field_name_match = re.search(r"(? list[Issue]: A list of `Issue`s. """ matching_paths = _get_direct_jsonpaths(self.jsonpath, descriptor) - indirect_parent_path = self.jsonpath.rstrip(self._field_name) + indirect_parent_path = self.jsonpath.removesuffix(self._field_name) direct_parent_paths = _get_direct_jsonpaths(indirect_parent_path, descriptor) missing_paths = _filter( direct_parent_paths, diff --git a/tests/test_rule.py b/tests/test_rule.py index af80b7cd..e53eb41d 100644 --- a/tests/test_rule.py +++ b/tests/test_rule.py @@ -1,4 +1,4 @@ -from pytest import raises +from pytest import mark, raises from check_datapackage.check import check from check_datapackage.config import Config @@ -118,7 +118,20 @@ def test_no_matching_jsonpath(): assert issues == [] -def test_required_rule_indirect_jsonpath(): +def test_required_rule_wildcard(): + descriptor = example_package_descriptor() + rule = RequiredRule( + jsonpath="$.*.id", + message="All fields must have an id.", + ) + config = Config(rules=[rule]) + + issues = check(descriptor, config=config) + + assert len(issues) == 8 + + +def test_required_rule_array_wildcard(): descriptor = example_package_descriptor() descriptor["contributors"] = [ {"path": "a/path"}, @@ -146,9 +159,21 @@ def test_required_rule_indirect_jsonpath(): ] -def test_required_rule_cannot_apply_to_array_item(): +@mark.parametrize( + "jsonpath", + [ + "$", + "..*", + "created", + "$..path", + "..resources", + "$.resources[0].*", + "$.resources[*]", + ], +) +def test_required_rule_cannot_apply_to_ambiguous_path(jsonpath): with raises(ValueError): RequiredRule( - jsonpath="$.resources[*]", + jsonpath=jsonpath, message="This should fail.", ) From e9a48efceba0b0944787ba570fb0133deb527f7a Mon Sep 17 00:00:00 2001 From: martonvago Date: Mon, 20 Oct 2025 15:37:21 +0100 Subject: [PATCH 5/5] refactor: :recycle: use `check_missing` to express required --- _quarto.yml | 1 - src/check_datapackage/__init__.py | 3 +- src/check_datapackage/custom_check.py | 165 ++++++++++++-------------- tests/test_custom_check.py | 43 ++++++- 4 files changed, 115 insertions(+), 97 deletions(-) diff --git a/_quarto.yml b/_quarto.yml index 849450c1..8946a702 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -69,7 +69,6 @@ quartodoc: - name: read_json - name: Config - name: CustomCheck - - name: RequiredCheck - name: Exclude - name: Issue - name: example_package_properties diff --git a/src/check_datapackage/__init__.py b/src/check_datapackage/__init__.py index 6cd0eb60..cb5ca414 100644 --- a/src/check_datapackage/__init__.py +++ b/src/check_datapackage/__init__.py @@ -2,7 +2,7 @@ from .check import check from .config import Config -from .custom_check import CustomCheck, RequiredCheck +from .custom_check import CustomCheck from .examples import example_package_properties, example_resource_properties from .exclude import Exclude from .issue import Issue @@ -13,7 +13,6 @@ "Exclude", "Issue", "CustomCheck", - "RequiredCheck", "example_package_properties", "example_resource_properties", "check", diff --git a/src/check_datapackage/custom_check.py b/src/check_datapackage/custom_check.py index 3e8f6f64..9fca49d7 100644 --- a/src/check_datapackage/custom_check.py +++ b/src/check_datapackage/custom_check.py @@ -1,8 +1,9 @@ import re -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Callable from check_datapackage.internals import ( + DescriptorField, _filter, _flat_map, _get_direct_jsonpaths, @@ -27,6 +28,9 @@ class CustomCheck: type (str): An identifier for the custom check. It will be shown in error messages and can be used to exclude the check. Each custom check should have a unique `type`. + check_missing (bool): Whether fields that would match the JSON path but are + missing from the object should be passed to `check` as `None`. + Defaults to False. Examples: ```{python} @@ -45,93 +49,23 @@ class CustomCheck: message: str check: Callable[[Any], bool] type: str = "custom" - - def apply(self, descriptor: dict[str, Any]) -> list[Issue]: - """Checks the descriptor against this check and creates issues on failure. - - Args: - descriptor: The descriptor to check. - - Returns: - A list of `Issue`s. - """ - matching_fields = _get_fields_at_jsonpath(self.jsonpath, descriptor) - failed_fields = _filter( - matching_fields, lambda field: not self.check(field.value) - ) - return _map( - failed_fields, - lambda field: Issue( - jsonpath=field.jsonpath, type=self.type, message=self.message - ), - ) - - -class RequiredCheck(CustomCheck): - """A custom check that checks that a field is present (i.e. not None). - - Attributes: - jsonpath (str): The location of the field or fields, expressed in [JSON - path](https://jg-rp.github.io/python-jsonpath/syntax/) notation, to which - the check applies (e.g., `$.resources[*].name`). - message (str): The message that is shown when the check fails. - - Examples: - ```{python} - import check_datapackage as cdp - required_title_check = cdp.RequiredCheck( - jsonpath="$.title", - message="A title is required.", - ) - ``` - """ - - _field_name: str - - def __init__(self, jsonpath: str, message: str): - """Initializes the `RequiredCheck`.""" - field_name_match = re.search(r"(? list[Issue]: - """Checks the descriptor against this check and creates issues on failure. - - Args: - descriptor: The descriptor to check. - - Returns: - A list of `Issue`s. - """ - matching_paths = _get_direct_jsonpaths(self.jsonpath, descriptor) - indirect_parent_path = self.jsonpath.removesuffix(self._field_name) - direct_parent_paths = _get_direct_jsonpaths(indirect_parent_path, descriptor) - missing_paths = _filter( - direct_parent_paths, - lambda path: f"{path}{self._field_name}" not in matching_paths, - ) - return _map( - missing_paths, - lambda path: Issue( - jsonpath=path + self._field_name, - type=self.type, - message=self.message, - ), - ) + check_missing: bool = False + _field_name: str = field(init=False, repr=False) + + def __post_init__(self) -> None: + """Checks that `CustomCheck`s with `check_missing` have sensible `jsonpath`s.""" + if self.check_missing: + field_name_match = re.search(r"(? list[Issue]: + """Applies the custom check to the descriptor. + + If any fields fail the custom check, this function creates a list of issues + for those fields. + + Args: + custom_check: The custom check to apply to the descriptor. + descriptor: The descriptor to check. + + Returns: + A list of `Issue`s. + """ + matching_fields = _get_fields_at_jsonpath(custom_check.jsonpath, descriptor) + if custom_check.check_missing: + matching_fields += _get_missing_fields( + custom_check, descriptor, matching_fields + ) + + failed_fields = _filter( + matching_fields, lambda field: not custom_check.check(field.value) + ) + return _map( + failed_fields, + lambda field: Issue( + jsonpath=field.jsonpath, + type=custom_check.type, + message=custom_check.message, + ), + ) + + +def _get_missing_fields( + check: CustomCheck, + descriptor: dict[str, Any], + matching_fields: list[DescriptorField], +) -> list[DescriptorField]: + """Returns the missing fields that the check would apply to if they were present.""" + parent_jsonpath = check.jsonpath.removesuffix(check._field_name) + potentially_matching_paths = _map( + _get_direct_jsonpaths(parent_jsonpath, descriptor), + lambda path: f"{path}{check._field_name}", + ) + actually_matching_paths = _map(matching_fields, lambda field: field.jsonpath) + missing_paths = _filter( + potentially_matching_paths, + lambda path: path not in actually_matching_paths, ) + return _map(missing_paths, lambda path: DescriptorField(jsonpath=path, value=None)) diff --git a/tests/test_custom_check.py b/tests/test_custom_check.py index 839f570c..655cd405 100644 --- a/tests/test_custom_check.py +++ b/tests/test_custom_check.py @@ -1,8 +1,10 @@ +from typing import Any + from pytest import mark, raises from check_datapackage.check import check from check_datapackage.config import Config -from check_datapackage.custom_check import CustomCheck, RequiredCheck +from check_datapackage.custom_check import CustomCheck from check_datapackage.examples import ( example_package_properties, example_resource_properties, @@ -23,6 +25,10 @@ ) +def must_not_be_null(value: Any) -> bool: + return value is not None + + def test_direct_jsonpath(): properties = example_package_properties() properties["name"] = "ALLCAPS" @@ -61,9 +67,12 @@ def test_multiple_custom_checks(): descriptor["resources"][0]["name"] = "not starting with woolly" del descriptor["version"] - version_check = RequiredCheck( + version_check = CustomCheck( jsonpath="$.version", message="Version is required.", + type="required", + check=must_not_be_null, + check_missing=True, ) config = Config( @@ -118,11 +127,29 @@ def test_no_matching_jsonpath(): assert issues == [] +def test_no_matching_jsonpath_with_check_missing(): + properties = example_package_properties() + custom_check = CustomCheck( + jsonpath="$.missing", + message="This check always fails.", + check=lambda value: False, + type="always-fail", + check_missing=True, + ) + config = Config(custom_checks=[custom_check]) + issues = check(properties, config=config) + + assert len(issues) == 1 + + def test_required_check_wildcard(): descriptor = example_package_properties() - id_check = RequiredCheck( + id_check = CustomCheck( jsonpath="$.*.id", message="All fields must have an id.", + type="required", + check=must_not_be_null, + check_missing=True, ) config = Config(custom_checks=[id_check]) @@ -138,9 +165,12 @@ def test_required_check_array_wildcard(): {"path": "a/path"}, {"path": "a/path", "name": "a name"}, ] - name_check = RequiredCheck( + name_check = CustomCheck( jsonpath="$.contributors[*].name", message="Contributor name is required.", + type="required", + check=must_not_be_null, + check_missing=True, ) config = Config(custom_checks=[name_check]) issues = check(descriptor, config=config) @@ -173,7 +203,10 @@ def test_required_check_array_wildcard(): ) def test_required_check_cannot_apply_to_ambiguous_path(jsonpath): with raises(ValueError): - RequiredCheck( + CustomCheck( jsonpath=jsonpath, message="This should fail.", + type="required", + check=must_not_be_null, + check_missing=True, )