From 35c8a53216ad873a676b2365f6b98cb1c2f708b5 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 5 Mar 2024 13:30:12 +1000
Subject: [PATCH 01/25] chore: update provenance_payload in __main__.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py
index ad70156f3..3323f418e 100644
--- a/src/macaron/__main__.py
+++ b/src/macaron/__main__.py
@@ -142,7 +142,7 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None
         run_config,
         analyzer_single_args.sbom_path,
         analyzer_single_args.skip_deps,
-        prov_payload=prov_payload,
+        provenance_payload=prov_payload,
     )
     sys.exit(status_code)
 

From 321c21767d2efb5762b8f7b67038a3e8a0eebdff Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 5 Mar 2024 14:14:50 +1000
Subject: [PATCH 02/25] chore: ensure SLSA v0.1 list index is within the bounds
 of the associated list.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       | 276 ++++++++++++++++++
 1 file changed, 276 insertions(+)
 create mode 100644 src/macaron/repo_finder/provenance_extractor.py

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
new file mode 100644
index 000000000..36ed813ed
--- /dev/null
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -0,0 +1,276 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module contains methods for extracting repository and commit metadata from provenance files."""
+import logging
+from typing import overload
+
+from macaron.slsa_analyzer.provenance import intoto
+from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
+from macaron.util import JsonType
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str, str]:
+    """Extract the repository and commit metadata from the passed provenance payload.
+
+    Parameters
+    ----------
+    payload: InTotoPayload
+        The payload to extract from.
+
+    Returns
+    -------
+    tuple[str, str]
+        The repository URL and commit hash if found, a pair of empty strings otherwise.
+    """
+    predicate_type = payload.statement.get("predicateType")
+    if isinstance(payload, InTotoV1Payload):
+        if isinstance(payload, InTotoV1Payload):
+            if predicate_type == "https://slsa.dev/provenance/v1":
+                return _extract_from_slsa_v1(payload)
+    elif isinstance(payload, InTotoV01Payload):
+        if predicate_type == "https://slsa.dev/provenance/v0.2":
+            return _extract_from_slsa_v02(payload)
+        if predicate_type == "https://slsa.dev/provenance/v0.1":
+            return _extract_from_slsa_v01(payload)
+        if predicate_type == "https://witness.testifysec.com/attestation-collection/v0.1":
+            return _extract_from_witness_provenance(payload)
+
+    logger.debug(
+        "Extraction from provenance not supported for versions: predicate_type %s, in-toto %s.",
+        predicate_type,
+        payload.__class__,
+    )
+    return "", ""
+
+
+def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
+    """Extract the repository and commit metadata from the slsa v01 provenance payload."""
+    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
+    if not predicate:
+        return "", ""
+
+    # The repository URL and commit are stored inside an entry in the list of predicate -> materials.
+    # In predicate -> recipe -> definedInMaterial we find the list index that points to the correct entry.
+    list_index = _json_extract(predicate, ["recipe", "definedInMaterial"], int)
+    if not list_index:
+        return "", ""
+
+    material_list = _json_extract(predicate, ["materials"], list)
+    if not material_list:
+        return "", ""
+
+    if list_index >= len(material_list):
+        return "", ""
+    material = material_list[list_index]
+    if not material or not isinstance(material, dict):
+        return "", ""
+
+    uri = material.get("uri")
+    if not uri:
+        logger.debug("Could not extract repository URL.")
+    repo = _clean_spdx(uri)
+
+    digest_set = material.get("digest")
+    if not digest_set or not isinstance(digest_set, dict):
+        return "", ""
+    commit = _extract_commit_from_digest(digest_set)
+    if not commit:
+        logger.debug("Could not extract commit.")
+        return "", ""
+
+    return repo, commit
+
+
+def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
+    """Extract the repository and commit metadata from the slsa v02 provenance payload."""
+    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
+    if not predicate:
+        return "", ""
+
+    # The repository URL and commit are stored within the predicate -> invocation -> configSource object.
+    # See https://slsa.dev/spec/v0.2/provenance
+    uri = _json_extract(predicate, ["invocation", "configSource", "uri"], str)
+    if not uri:
+        logger.debug("Could not extract repo URL.")
+        return "", ""
+    repo = _clean_spdx(uri)
+
+    digest_set = _json_extract(predicate, ["invocation", "configSource", "digest"], dict)
+    if not digest_set:
+        return "", ""
+    commit = _extract_commit_from_digest(digest_set)
+    if not commit:
+        logger.debug("Could not extract commit.")
+        return "", ""
+
+    return repo, commit
+
+
+def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
+    """Extract the repository and commit metadata from the slsa v1 provenance payload."""
+    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
+    if not predicate:
+        return "", ""
+
+    build_def = _json_extract(predicate, ["buildDefinition"], dict)
+    if not build_def:
+        return "", ""
+    build_type = _json_extract(build_def, ["buildType"], str)
+    if not build_type:
+        return "", ""
+
+    # Extract the repository URL.
+    repo = None
+    if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
+        repo = _json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
+        if not repo:
+            repo = _json_extract(build_def, ["externalParameters", "configSource", "repository"], str)
+    if build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
+        repo = _json_extract(build_def, ["externalParameters", "workflow", "repository"], str)
+
+    if not repo:
+        logger.debug("Failed to extract repository URL from provenance.")
+        return "", ""
+
+    # Extract the commit hash.
+    commit = None
+    deps = _json_extract(build_def, ["resolvedDependencies"], list)
+    if not deps:
+        return "", ""
+    for dep in deps:
+        if not isinstance(dep, dict):
+            continue
+        uri = dep["uri"]
+        url = _clean_spdx(uri)
+        if url != repo:
+            continue
+        if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
+            commit_dict = _json_extract(dep, ["digest"], dict)
+            if not commit_dict:
+                continue
+            commit = _extract_commit_from_digest(commit_dict)
+        if build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
+            commit = _json_extract(dep, ["digest", "gitCommit"], str)
+
+    if not commit:
+        logger.debug("Failed to extract commit hash from provenance.")
+        return "", ""
+
+    return repo, commit
+
+
+def _extract_commit_from_digest(digest: dict[str, JsonType]) -> str | None:
+    """Extract the commit from the passed DigestSet.
+
+    The DigestSet is an in-toto object that maps algorithm types to commit hashes (digests).
+    """
+    # TODO decide on a preference for which algorithm to accept.
+    if len(digest.keys()) > 1:
+        logger.debug("DigestSet contains multiple algorithms: %s", digest.keys())
+
+    for key in digest:
+        if key in intoto.v1.VALID_ALGORITHMS:
+            value = digest.get(key)
+            if isinstance(value, str):
+                return value
+    return None
+
+
+def _clean_spdx(uri: str) -> str:
+    """Clean the passed SPDX URI and return the normalised URL it represents.
+
+    A SPDX URI has the form: git+https://example.com@refs/heads/main
+    """
+    url, _, _ = uri.lstrip("git+").rpartition("@")
+    return url
+
+
+def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, str]:
+    """Extract the repository and commit metadata from the witness provenance file found at the passed path.
+
+    To successfully return the commit and repository URL, the payload must respectively contain a Git attestation, and
+    either a GitHub or GitLab attestation.
+
+    Parameters
+    ----------
+    payload: InTotoPayload
+        The payload to extract from.
+
+    Returns
+    -------
+    tuple[str, str]
+        The repository URL and commit hash if found, a pair of empty strings otherwise.
+    """
+    predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
+    if not predicate:
+        return "", ""
+    attestations = _json_extract(predicate, ["attestations"], list)
+    if not attestations:
+        return "", ""
+    commit: str | None = None
+    repo: str | None = None
+    for entry in attestations:
+        if not isinstance(entry, dict):
+            continue
+        entry_type = entry.get("type")
+        if not entry_type:
+            continue
+        if entry_type.startswith("https://witness.dev/attestations/git/"):
+            commit = _json_extract(entry, ["attestation", "commithash"], str)
+        elif entry_type.startswith("https://witness.dev/attestations/gitlab/") or entry_type.startswith(
+            "https://witness.dev/attestations/github/"
+        ):
+            repo = _json_extract(entry, ["attestation", "projecturl"], str)
+
+    if not commit or not repo:
+        logger.debug("Could not extract repo and commit from provenance.")
+        return "", ""
+
+    return repo, commit
+
+
+@overload
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[int]) -> int | None:
+    ...
+
+
+@overload
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[list]) -> list | None:
+    ...
+
+
+@overload
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[dict]) -> dict | None:
+    ...
+
+
+@overload
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[str]) -> str | None:
+    ...
+
+
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[JsonType]) -> JsonType:
+    """Return the value found by following the list of depth-sequential keys inside the passed dictionary.
+
+    The value's type is validated against the passed type.
+    """
+    target = entry
+    for index, key in enumerate(keys):
+        if key not in target:
+            logger.debug("Key not found in JSON: %s", key)
+            return None
+        next_target = target[key]
+        if index == len(keys) - 1:
+            if isinstance(next_target, type_):
+                return next_target
+        else:
+            if not isinstance(next_target, dict):
+                logger.debug("Expected dict found: %s", next_target.__class__)
+                break
+            target = next_target
+
+    logger.debug("Failed to find %s in JSON dictionary", " > ".join(keys))
+    return None

From 6c95043b7f992b81f1fe33387309b026c92c6e33 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 5 Mar 2024 16:37:13 +1000
Subject: [PATCH 03/25] chore: keep code related to with statement in the
 statement block; add debug output for provenance extractor success

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       |  28 ++-
 src/macaron/repo_finder/provenance_finder.py  | 232 ++++++++++++++++++
 2 files changed, 249 insertions(+), 11 deletions(-)
 create mode 100644 src/macaron/repo_finder/provenance_finder.py

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 36ed813ed..7fca2c82e 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -25,25 +25,31 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
     tuple[str, str]
         The repository URL and commit hash if found, a pair of empty strings otherwise.
     """
+    repo = ""
+    commit = ""
     predicate_type = payload.statement.get("predicateType")
     if isinstance(payload, InTotoV1Payload):
         if isinstance(payload, InTotoV1Payload):
             if predicate_type == "https://slsa.dev/provenance/v1":
-                return _extract_from_slsa_v1(payload)
+                repo, commit = _extract_from_slsa_v1(payload)
     elif isinstance(payload, InTotoV01Payload):
         if predicate_type == "https://slsa.dev/provenance/v0.2":
-            return _extract_from_slsa_v02(payload)
+            repo, commit = _extract_from_slsa_v02(payload)
         if predicate_type == "https://slsa.dev/provenance/v0.1":
-            return _extract_from_slsa_v01(payload)
+            repo, commit = _extract_from_slsa_v01(payload)
         if predicate_type == "https://witness.testifysec.com/attestation-collection/v0.1":
-            return _extract_from_witness_provenance(payload)
-
-    logger.debug(
-        "Extraction from provenance not supported for versions: predicate_type %s, in-toto %s.",
-        predicate_type,
-        payload.__class__,
-    )
-    return "", ""
+            repo, commit = _extract_from_witness_provenance(payload)
+
+    if not repo or not commit:
+        logger.debug(
+            "Extraction from provenance not supported for versions: predicate_type %s, in-toto %s.",
+            predicate_type,
+            payload.__class__,
+        )
+        return "", ""
+
+    logger.debug("Extracted repo and commit from provenance: %s, %s", repo, commit)
+    return repo, commit
 
 
 def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
diff --git a/src/macaron/repo_finder/provenance_finder.py b/src/macaron/repo_finder/provenance_finder.py
new file mode 100644
index 000000000..42463f13d
--- /dev/null
+++ b/src/macaron/repo_finder/provenance_finder.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module contains methods for finding provenance files."""
+import logging
+import os
+import tempfile
+
+from packageurl import PackageURL
+
+from macaron.config.defaults import defaults
+from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type
+from macaron.slsa_analyzer.checks.provenance_available_check import ProvenanceAvailableException
+from macaron.slsa_analyzer.package_registry import JFrogMavenRegistry, NPMRegistry
+from macaron.slsa_analyzer.package_registry.npm_registry import NPMAttestationAsset
+from macaron.slsa_analyzer.provenance.intoto import InTotoPayload
+from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError
+from macaron.slsa_analyzer.provenance.loader import load_provenance_payload
+from macaron.slsa_analyzer.provenance.witness import is_witness_provenance_payload, load_witness_verifier_config
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class ProvenanceFinder:
+    """This class is used to find and retrieve provenance files from supported registries."""
+
+    def __init__(self) -> None:
+        self.last_provenance_payload: InTotoPayload | None = None
+
+    def find_provenance(self, purl: PackageURL) -> InTotoPayload | None:
+        """Find the provenance files of the passed PURL.
+
+        Parameters
+        ----------
+        purl: PackageURL
+            The PURL to find provenance for.
+
+        Returns
+        -------
+        InTotoPayload | None
+            The provenance payload if found, or None.
+        """
+        if determine_abstract_purl_type(purl) == AbstractPurlType.REPOSITORY:
+            # Do not perform this function for repository type targets.
+            self.last_provenance_payload = None
+
+        if purl.type == "npm":
+            self.last_provenance_payload = ProvenanceFinder.find_npm_provenance(purl)
+        elif purl.type in ["gradle", "maven"]:
+            self.last_provenance_payload = ProvenanceFinder.find_gav_provenance(purl)
+        else:
+            logger.debug("Provenance finding not supported for PURL type: %s", purl.type)
+            self.last_provenance_payload = None
+
+        return self.last_provenance_payload
+
+    @staticmethod
+    def find_npm_provenance(purl: PackageURL) -> InTotoPayload | None:
+        """Find and download the NPM based provenance for the passed PURL.
+
+        Parameters
+        ----------
+        purl: PackageURL
+            The PURL of the analysis target.
+
+        Returns
+        -------
+        InTotoPayload | None
+            The provenance payload if found, or None.
+        """
+        # Retrieve NPM registry configuration values.
+        npm_section = "package_registry.npm"
+        if not defaults.has_section(npm_section):
+            logger.debug("No NPM section found in config.")
+            return None
+        if not defaults.get(npm_section, "enabled"):
+            logger.debug("NPM section disabled in config.")
+            return None
+
+        hostname = defaults.get(npm_section, "hostname")
+        attestation_endpoint = defaults.get(npm_section, "attestation_endpoint")
+        try:
+            request_timeout = int(defaults.get(npm_section, "request_timeout"))
+        except ValueError as error:
+            logger.debug("Invalid value for NPM package registry timeout: %s", error)
+            return None
+        # Create registry from configuration values.
+        npm_registry = NPMRegistry(hostname, attestation_endpoint, request_timeout)
+
+        namespace = purl.namespace or ""
+        artifact_id = purl.name
+        version = purl.version
+
+        if not purl.version:
+            version = npm_registry.get_latest_version(namespace, artifact_id)
+
+        if not version:
+            logger.debug("Missing version for NPM package.")
+            return None
+
+        # The size of the asset (in bytes) is added to match the AssetLocator
+        # protocol and is not used because npm API registry does not provide it, so it is set to zero.
+        npm_provenance_asset = NPMAttestationAsset(
+            namespace=namespace,
+            artifact_id=artifact_id,
+            version=version,
+            npm_registry=npm_registry,
+            size_in_bytes=0,
+        )
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                download_path = os.path.join(temp_dir, f"{artifact_id}.intoto.jsonl")
+                if not npm_provenance_asset.download(download_path):
+                    logger.debug("Unable to find an npm provenance for %s@%s", artifact_id, version)
+                    return None
+
+                try:
+                    # Load the provenance file.
+                    provenance_payload = load_provenance_payload(download_path)
+                except LoadIntotoAttestationError as loadintotoerror:
+                    logger.error("Error while loading provenance %s", loadintotoerror)
+                    return None
+
+                return provenance_payload
+        except OSError as error:
+            logger.error("Error while storing provenance in the temporary directory: %s", error)
+            return None
+
+    @staticmethod
+    def find_gav_provenance(purl: PackageURL) -> InTotoPayload | None:
+        """Find and download the GAV based provenance for the passed PURL.
+
+        Parameters
+        ----------
+        purl: PackageURL
+            The PURL of the analysis target.
+
+        Returns
+        -------
+        InTotoPayload | None
+            The provenance payload if found, or None.
+
+        """
+        jfrog_section = "package_registry.jfrog.maven"
+        if not defaults.has_section(jfrog_section):
+            logger.debug("No JFrog section found in config.")
+            return None
+
+        try:
+            request_timeout = defaults.getint(jfrog_section, "request_timeout")
+            download_timeout = defaults.getint(jfrog_section, "download_timeout")
+        except ValueError as error:
+            logger.debug("Failed to parse default value as int: %s", error)
+            return None
+
+        jfrog_registry = JFrogMavenRegistry(
+            defaults.get(jfrog_section, "hostname"),
+            defaults.get(jfrog_section, "repo"),
+            request_timeout,
+            download_timeout,
+        )
+
+        provenance_extensions = defaults.get_list(
+            "slsa.verifier",
+            "provenance_extensions",
+            fallback=["intoto.jsonl"],
+        )
+
+        provenance_assets = jfrog_registry.fetch_assets(
+            group_id=purl.namespace if purl.namespace else "",
+            artifact_id=purl.name,
+            version=purl.version if purl.version else "",
+            extensions=set(provenance_extensions),
+        )
+
+        if not provenance_assets:
+            return None
+
+        max_valid_provenance_size = defaults.getint(
+            "slsa.verifier",
+            "max_download_size",
+            fallback=1000000,
+        )
+
+        for provenance_asset in provenance_assets:
+            if provenance_asset.size_in_bytes > max_valid_provenance_size:
+                msg = (
+                    f"The provenance asset {provenance_asset.name} unexpectedly exceeds the "
+                    f"max valid file size of {max_valid_provenance_size} (bytes). "
+                    "The check will not proceed due to potential security risks."
+                )
+                logger.error(msg)
+                raise ProvenanceAvailableException(msg)
+
+        provenance_filepaths = []
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                for provenance_asset in provenance_assets:
+                    provenance_filepath = os.path.join(temp_dir, provenance_asset.name)
+                    if not provenance_asset.download(provenance_filepath):
+                        logger.debug(
+                            "Could not download the provenance %s. Skip verifying...",
+                            provenance_asset.name,
+                        )
+                        continue
+                    provenance_filepaths.append(provenance_filepath)
+        except OSError as error:
+            logger.error("Error while storing provenance in the temporary directory: %s", error)
+
+        provenances = []
+        witness_verifier_config = load_witness_verifier_config()
+
+        for provenance_filepath in provenance_filepaths:
+            try:
+                provenance_payload = load_provenance_payload(provenance_filepath)
+            except LoadIntotoAttestationError as error:
+                logger.error("Error while loading provenance: %s", error)
+                continue
+
+            if not is_witness_provenance_payload(provenance_payload, witness_verifier_config.predicate_types):
+                continue
+
+            provenances.append(provenance_payload)
+
+        if not provenances:
+            logger.debug("No payloads found in provenance files.")
+            return None
+
+        # TODO decide what to do when multiple provenance payloads are present.
+        provenance = provenances[0]
+
+        return provenance

From fa1bbc7d1fd2ecc01ac0872bac00d62a9ba22329 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 7 Mar 2024 09:46:07 +1000
Subject: [PATCH 04/25] chore: replace overload with TypeVar

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       | 23 +++----------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 7fca2c82e..43db88e35 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -3,7 +3,7 @@
 
 """This module contains methods for extracting repository and commit metadata from provenance files."""
 import logging
-from typing import overload
+from typing import TypeVar
 
 from macaron.slsa_analyzer.provenance import intoto
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
@@ -238,27 +238,10 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
     return repo, commit
 
 
-@overload
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[int]) -> int | None:
-    ...
+T = TypeVar("T", bound=JsonType)
 
 
-@overload
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[list]) -> list | None:
-    ...
-
-
-@overload
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[dict]) -> dict | None:
-    ...
-
-
-@overload
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[str]) -> str | None:
-    ...
-
-
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[JsonType]) -> JsonType:
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -> T | None:
     """Return the value found by following the list of depth-sequential keys inside the passed dictionary.
 
     The value's type is validated against the passed type.

From e1ffe975be5987e012781380b38384b5c7de70cc Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Fri, 8 Mar 2024 13:26:57 +1000
Subject: [PATCH 05/25] chore: raise exceptions from provenance extractor and
 tidy related provenance code

- Remove the duplicate if statement.
- Replace x.__class__ with str(type(x)).
- Use the default JFrog registry.
- Only pass real values to the JFrog fetch function.
- Rename the digest function to digest_set.
- Copy the intoto algorithms to v01, and add them as an input to the
  _extract_commit_from_digest_set function.
- Make provenance_extractor raise exceptions instead of returning empty
  tuples, and refactor accordingly.
- Add the gitCommit digest set type to the v1 algorithms.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       | 171 ++++++++----------
 src/macaron/repo_finder/provenance_finder.py  |  79 ++++----
 src/macaron/repo_finder/repo_finder.py        |   2 +-
 src/macaron/slsa_analyzer/analyzer.py         |  70 +++++--
 .../provenance/intoto/v01/__init__.py         |  25 +++
 .../provenance/intoto/v1/__init__.py          |   6 +-
 6 files changed, 198 insertions(+), 155 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 43db88e35..59196ec44 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -5,6 +5,7 @@
 import logging
 from typing import TypeVar
 
+from macaron.errors import MacaronError
 from macaron.slsa_analyzer.provenance import intoto
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
 from macaron.util import JsonType
@@ -12,6 +13,10 @@
 logger: logging.Logger = logging.getLogger(__name__)
 
 
+class ProvenanceExtractionException(MacaronError):
+    """When there is an error while extracting from provenance."""
+
+
 def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the passed provenance payload.
 
@@ -24,14 +29,18 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
     -------
     tuple[str, str]
         The repository URL and commit hash if found, a pair of empty strings otherwise.
+
+    Raises
+    ------
+    ProvenanceExtractionException
+        If the extraction process fails for any reason.
     """
     repo = ""
     commit = ""
     predicate_type = payload.statement.get("predicateType")
     if isinstance(payload, InTotoV1Payload):
-        if isinstance(payload, InTotoV1Payload):
-            if predicate_type == "https://slsa.dev/provenance/v1":
-                repo, commit = _extract_from_slsa_v1(payload)
+        if predicate_type == "https://slsa.dev/provenance/v1":
+            repo, commit = _extract_from_slsa_v1(payload)
     elif isinstance(payload, InTotoV01Payload):
         if predicate_type == "https://slsa.dev/provenance/v0.2":
             repo, commit = _extract_from_slsa_v02(payload)
@@ -41,12 +50,12 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
             repo, commit = _extract_from_witness_provenance(payload)
 
     if not repo or not commit:
-        logger.debug(
-            "Extraction from provenance not supported for versions: predicate_type %s, in-toto %s.",
-            predicate_type,
-            payload.__class__,
+        msg = (
+            f"Extraction from provenance not supported for versions: "
+            f"predicate_type {predicate_type}, in-toto {str(type(payload))}."
         )
-        return "", ""
+        logger.error(msg)
+        raise ProvenanceExtractionException(msg)
 
     logger.debug("Extracted repo and commit from provenance: %s, %s", repo, commit)
     return repo, commit
@@ -56,36 +65,23 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the slsa v01 provenance payload."""
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        return "", ""
+        raise ProvenanceExtractionException("No predicate in payload statement.")
 
     # The repository URL and commit are stored inside an entry in the list of predicate -> materials.
     # In predicate -> recipe -> definedInMaterial we find the list index that points to the correct entry.
     list_index = _json_extract(predicate, ["recipe", "definedInMaterial"], int)
-    if not list_index:
-        return "", ""
-
     material_list = _json_extract(predicate, ["materials"], list)
-    if not material_list:
-        return "", ""
-
     if list_index >= len(material_list):
-        return "", ""
+        raise ProvenanceExtractionException("Material list index outside of material list bounds.")
     material = material_list[list_index]
     if not material or not isinstance(material, dict):
-        return "", ""
+        raise ProvenanceExtractionException("Indexed material list entry is invalid.")
 
-    uri = material.get("uri")
-    if not uri:
-        logger.debug("Could not extract repository URL.")
+    uri = _json_extract(material, ["uri"], str)
     repo = _clean_spdx(uri)
 
-    digest_set = material.get("digest")
-    if not digest_set or not isinstance(digest_set, dict):
-        return "", ""
-    commit = _extract_commit_from_digest(digest_set)
-    if not commit:
-        logger.debug("Could not extract commit.")
-        return "", ""
+    digest_set = _json_extract(material, ["digest"], dict)
+    commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
 
     return repo, commit
 
@@ -94,23 +90,15 @@ def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the slsa v02 provenance payload."""
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        return "", ""
+        raise ProvenanceExtractionException("No predicate in payload statement.")
 
     # The repository URL and commit are stored within the predicate -> invocation -> configSource object.
     # See https://slsa.dev/spec/v0.2/provenance
     uri = _json_extract(predicate, ["invocation", "configSource", "uri"], str)
-    if not uri:
-        logger.debug("Could not extract repo URL.")
-        return "", ""
     repo = _clean_spdx(uri)
 
     digest_set = _json_extract(predicate, ["invocation", "configSource", "digest"], dict)
-    if not digest_set:
-        return "", ""
-    commit = _extract_commit_from_digest(digest_set)
-    if not commit:
-        logger.debug("Could not extract commit.")
-        return "", ""
+    commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
 
     return repo, commit
 
@@ -119,81 +107,43 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the slsa v1 provenance payload."""
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        return "", ""
+        raise ProvenanceExtractionException("No predicate in payload statement.")
 
     build_def = _json_extract(predicate, ["buildDefinition"], dict)
-    if not build_def:
-        return "", ""
     build_type = _json_extract(build_def, ["buildType"], str)
-    if not build_type:
-        return "", ""
 
     # Extract the repository URL.
-    repo = None
+    repo = ""
     if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
-        repo = _json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
-        if not repo:
+        try:
+            repo = _json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
+        except ProvenanceExtractionException:
             repo = _json_extract(build_def, ["externalParameters", "configSource", "repository"], str)
     if build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
         repo = _json_extract(build_def, ["externalParameters", "workflow", "repository"], str)
 
     if not repo:
-        logger.debug("Failed to extract repository URL from provenance.")
-        return "", ""
+        raise ProvenanceExtractionException("Failed to extract repository URL from provenance.")
 
     # Extract the commit hash.
-    commit = None
+    commit = ""
     deps = _json_extract(build_def, ["resolvedDependencies"], list)
-    if not deps:
-        return "", ""
     for dep in deps:
         if not isinstance(dep, dict):
             continue
-        uri = dep["uri"]
+        uri = _json_extract(dep, ["uri"], str)
         url = _clean_spdx(uri)
         if url != repo:
             continue
-        if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
-            commit_dict = _json_extract(dep, ["digest"], dict)
-            if not commit_dict:
-                continue
-            commit = _extract_commit_from_digest(commit_dict)
-        if build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
-            commit = _json_extract(dep, ["digest", "gitCommit"], str)
+        digest_set = _json_extract(dep, ["digest"], dict)
+        commit = _extract_commit_from_digest_set(digest_set, intoto.v1.VALID_ALGORITHMS)
 
     if not commit:
-        logger.debug("Failed to extract commit hash from provenance.")
-        return "", ""
+        raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
 
     return repo, commit
 
 
-def _extract_commit_from_digest(digest: dict[str, JsonType]) -> str | None:
-    """Extract the commit from the passed DigestSet.
-
-    The DigestSet is an in-toto object that maps algorithm types to commit hashes (digests).
-    """
-    # TODO decide on a preference for which algorithm to accept.
-    if len(digest.keys()) > 1:
-        logger.debug("DigestSet contains multiple algorithms: %s", digest.keys())
-
-    for key in digest:
-        if key in intoto.v1.VALID_ALGORITHMS:
-            value = digest.get(key)
-            if isinstance(value, str):
-                return value
-    return None
-
-
-def _clean_spdx(uri: str) -> str:
-    """Clean the passed SPDX URI and return the normalised URL it represents.
-
-    A SPDX URI has the form: git+https://example.com@refs/heads/main
-    """
-    url, _, _ = uri.lstrip("git+").rpartition("@")
-    return url
-
-
 def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the witness provenance file found at the passed path.
 
@@ -212,12 +162,11 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
     """
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        return "", ""
+        raise ProvenanceExtractionException("No predicate in payload statement.")
+
     attestations = _json_extract(predicate, ["attestations"], list)
-    if not attestations:
-        return "", ""
-    commit: str | None = None
-    repo: str | None = None
+    commit = ""
+    repo = ""
     for entry in attestations:
         if not isinstance(entry, dict):
             continue
@@ -232,16 +181,41 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
             repo = _json_extract(entry, ["attestation", "projecturl"], str)
 
     if not commit or not repo:
-        logger.debug("Could not extract repo and commit from provenance.")
-        return "", ""
+        raise ProvenanceExtractionException("Could not extract repo and commit from provenance.")
 
     return repo, commit
 
 
+def _extract_commit_from_digest_set(digest_set: dict[str, JsonType], valid_algorithms: list[str]) -> str:
+    """Return the first string digest in the DigestSet that is keyed by an accepted algorithm.
+
+    The DigestSet is an in-toto object that maps algorithm types to commit hashes (digests).
+    Raises ProvenanceExtractionException if no key in valid_algorithms maps to a string value.
+    """
+    # TODO decide on a preference for which algorithm to accept.
+    if len(digest_set) > 1:
+        logger.debug("DigestSet contains multiple algorithms: %s", digest_set.keys())
+
+    for algorithm, digest in digest_set.items():
+        if algorithm in valid_algorithms and isinstance(digest, str):
+            return digest
+
+    raise ProvenanceExtractionException("No valid digest in digest set.")
+
+
+def _clean_spdx(uri: str) -> str:
+    """Strip the literal "git+" prefix and the trailing "@<ref>" from an SPDX URI and return the URL.
+
+    A SPDX URI has the form: git+https://example.com@refs/heads/main (a URI without "@" yields "").
+    """
+    # removeprefix drops only an exact "git+"; lstrip("git+") would eat any leading g, i, t or + characters.
+    return uri.removeprefix("git+").rpartition("@")[0]
+
+
 T = TypeVar("T", bound=JsonType)
 
 
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -> T | None:
+def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -> T:
     """Return the value found by following the list of depth-sequential keys inside the passed dictionary.
 
     The value's type is validated against the passed type.
@@ -249,17 +223,14 @@ def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -
     target = entry
     for index, key in enumerate(keys):
         if key not in target:
-            logger.debug("Key not found in JSON: %s", key)
-            return None
+            raise ProvenanceExtractionException(f"JSON key not found: {key}")
         next_target = target[key]
         if index == len(keys) - 1:
             if isinstance(next_target, type_):
                 return next_target
         else:
             if not isinstance(next_target, dict):
-                logger.debug("Expected dict found: %s", next_target.__class__)
-                break
+                raise ProvenanceExtractionException(f"Extract value from non-dict type: {str(type(next_target))}")
             target = next_target
 
-    logger.debug("Failed to find %s in JSON dictionary", " > ".join(keys))
-    return None
+    raise ProvenanceExtractionException(f"Failed to find '{' > '.join(keys)}' as type '{type_}' in JSON dictionary.")
diff --git a/src/macaron/repo_finder/provenance_finder.py b/src/macaron/repo_finder/provenance_finder.py
index 42463f13d..957092593 100644
--- a/src/macaron/repo_finder/provenance_finder.py
+++ b/src/macaron/repo_finder/provenance_finder.py
@@ -11,7 +11,7 @@
 from macaron.config.defaults import defaults
 from macaron.repo_finder.commit_finder import AbstractPurlType, determine_abstract_purl_type
 from macaron.slsa_analyzer.checks.provenance_available_check import ProvenanceAvailableException
-from macaron.slsa_analyzer.package_registry import JFrogMavenRegistry, NPMRegistry
+from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, JFrogMavenRegistry, NPMRegistry
 from macaron.slsa_analyzer.package_registry.npm_registry import NPMAttestationAsset
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload
 from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError
@@ -26,6 +26,15 @@ class ProvenanceFinder:
 
     def __init__(self) -> None:
         self.last_provenance_payload: InTotoPayload | None = None
+        # Remember the configured npm and JFrog registries, if any; the last instance of each type wins.
+        self.npm_registry: NPMRegistry | None = None
+        self.jfrog_registry: JFrogMavenRegistry | None = None
+        for candidate in PACKAGE_REGISTRIES or []:
+            if isinstance(candidate, NPMRegistry):
+                self.npm_registry = candidate
+                continue
+            if isinstance(candidate, JFrogMavenRegistry):
+                self.jfrog_registry = candidate
 
     def find_provenance(self, purl: PackageURL) -> InTotoPayload | None:
         """Find the provenance files of the passed PURL.
@@ -42,12 +51,20 @@ def find_provenance(self, purl: PackageURL) -> InTotoPayload | None:
         """
         if determine_abstract_purl_type(purl) == AbstractPurlType.REPOSITORY:
             # Do not perform this function for repository type targets.
-            self.last_provenance_payload = None
+            return None
+
+        self.last_provenance_payload = None
 
         if purl.type == "npm":
-            self.last_provenance_payload = ProvenanceFinder.find_npm_provenance(purl)
+            if self.npm_registry:
+                self.last_provenance_payload = ProvenanceFinder.find_npm_provenance(purl, self.npm_registry)
+            else:
+                logger.debug("Missing npm registry to find provenance in.")
         elif purl.type in ["gradle", "maven"]:
-            self.last_provenance_payload = ProvenanceFinder.find_gav_provenance(purl)
+            if self.jfrog_registry:
+                self.last_provenance_payload = ProvenanceFinder.find_gav_provenance(purl, self.jfrog_registry)
+            else:
+                logger.debug("Missing JFrog registry to find provenance in.")
         else:
             logger.debug("Provenance finding not supported for PURL type: %s", purl.type)
             self.last_provenance_payload = None
@@ -55,37 +72,24 @@ def find_provenance(self, purl: PackageURL) -> InTotoPayload | None:
         return self.last_provenance_payload
 
     @staticmethod
-    def find_npm_provenance(purl: PackageURL) -> InTotoPayload | None:
+    def find_npm_provenance(purl: PackageURL, npm_registry: NPMRegistry) -> InTotoPayload | None:
         """Find and download the NPM based provenance for the passed PURL.
 
         Parameters
         ----------
         purl: PackageURL
             The PURL of the analysis target.
+        npm_registry: NPMRegistry
+            The npm registry to find provenance in.
 
         Returns
         -------
         InTotoPayload | None
             The provenance payload if found, or None.
         """
-        # Retrieve NPM registry configuration values.
-        npm_section = "package_registry.npm"
-        if not defaults.has_section(npm_section):
-            logger.debug("No NPM section found in config.")
-            return None
-        if not defaults.get(npm_section, "enabled"):
-            logger.debug("NPM section disabled in config.")
-            return None
-
-        hostname = defaults.get(npm_section, "hostname")
-        attestation_endpoint = defaults.get(npm_section, "attestation_endpoint")
-        try:
-            request_timeout = int(defaults.get(npm_section, "request_timeout"))
-        except ValueError as error:
-            logger.debug("Invalid value for NPM package registry timeout: %s", error)
+        if not npm_registry.enabled:
+            logger.debug("The npm registry is not enabled.")
             return None
-        # Create registry from configuration values.
-        npm_registry = NPMRegistry(hostname, attestation_endpoint, request_timeout)
 
         namespace = purl.namespace or ""
         artifact_id = purl.name
@@ -127,39 +131,34 @@ def find_npm_provenance(purl: PackageURL) -> InTotoPayload | None:
             return None
 
     @staticmethod
-    def find_gav_provenance(purl: PackageURL) -> InTotoPayload | None:
+    def find_gav_provenance(purl: PackageURL, jfrog_registry: JFrogMavenRegistry) -> InTotoPayload | None:
         """Find and download the GAV based provenance for the passed PURL.
 
         Parameters
         ----------
         purl: PackageURL
             The PURL of the analysis target.
+        jfrog_registry: JFrogMavenRegistry
+            The JFrog registry to find provenance in.
 
         Returns
         -------
         InTotoPayload | None
             The provenance payload if found, or None.
 
+        Raises
+        ------
+        ProvenanceAvailableException
+            If the discovered provenance file size exceeds the configured limit.
         """
-        jfrog_section = "package_registry.jfrog.maven"
-        if not defaults.has_section(jfrog_section):
-            logger.debug("No JFrog section found in config.")
+        if not jfrog_registry.enabled:
+            logger.debug("JFrog registry not enabled.")
             return None
 
-        try:
-            request_timeout = defaults.getint(jfrog_section, "request_timeout")
-            download_timeout = defaults.getint(jfrog_section, "download_timeout")
-        except ValueError as error:
-            logger.debug("Failed to parse default value as int: %s", error)
+        if not purl.namespace or not purl.version:
+            logger.debug("Missing purl namespace or version for finding provenance in JFrog registry.")
             return None
 
-        jfrog_registry = JFrogMavenRegistry(
-            defaults.get(jfrog_section, "hostname"),
-            defaults.get(jfrog_section, "repo"),
-            request_timeout,
-            download_timeout,
-        )
-
         provenance_extensions = defaults.get_list(
             "slsa.verifier",
             "provenance_extensions",
@@ -167,9 +166,9 @@ def find_gav_provenance(purl: PackageURL) -> InTotoPayload | None:
         )
 
         provenance_assets = jfrog_registry.fetch_assets(
-            group_id=purl.namespace if purl.namespace else "",
+            group_id=purl.namespace,
             artifact_id=purl.name,
-            version=purl.version if purl.version else "",
+            version=purl.version,
             extensions=set(provenance_extensions),
         )
 
diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py
index 999ce0f87..d365f34d8 100644
--- a/src/macaron/repo_finder/repo_finder.py
+++ b/src/macaron/repo_finder/repo_finder.py
@@ -74,7 +74,7 @@ def find_repo(purl: PackageURL) -> str:
         return ""
 
     # Call Repo Finder and return first valid URL
-    logger.debug("Analyzing %s with Repo Finder: %s", purl.to_string(), repo_finder.__class__)
+    logger.debug("Analyzing %s with Repo Finder: %s", purl.to_string(), str(type(repo_finder)))
     return repo_finder.find_repo(purl)
 
 
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 7eab59b43..599991357 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -26,6 +26,11 @@
 from macaron.output_reporter.results import Record, Report, SCMStatus
 from macaron.repo_finder import repo_finder
 from macaron.repo_finder.commit_finder import find_commit
+from macaron.repo_finder.provenance_extractor import (
+    ProvenanceExtractionException,
+    extract_repo_and_commit_from_provenance,
+)
+from macaron.repo_finder.provenance_finder import ProvenanceFinder
 from macaron.slsa_analyzer import git_url
 from macaron.slsa_analyzer.analyze_context import AnalyzeContext
 from macaron.slsa_analyzer.asset import VirtualReleaseAsset
@@ -116,7 +121,7 @@ def run(
         user_config: dict,
         sbom_path: str = "",
         skip_deps: bool = False,
-        prov_payload: InTotoPayload | None = None,
+        provenance_payload: InTotoPayload | None = None,
     ) -> int:
         """Run the analysis and write results to the output path.
 
@@ -131,7 +136,7 @@ def run(
             The path to the SBOM.
         skip_deps : bool
             Flag to skip dependency resolution.
-        prov_payload : InToToPayload | None
+        provenance_payload : InTotoPayload | None
             The provenance intoto payload for the main software component.
 
         Returns
@@ -165,7 +170,7 @@ def run(
                 main_record = self.run_single(
                     main_config,
                     analysis,
-                    prov_payload=prov_payload,
+                    provenance_payload=provenance_payload,
                 )
 
                 if main_record.status != SCMStatus.AVAILABLE or not main_record.context:
@@ -267,7 +272,7 @@ def run_single(
         config: Configuration,
         analysis: Analysis,
         existing_records: dict[str, Record] | None = None,
-        prov_payload: InTotoPayload | None = None,
+        provenance_payload: InTotoPayload | None = None,
     ) -> Record:
         """Run the checks for a single repository target.
 
@@ -282,7 +287,7 @@ def run_single(
             The current analysis instance.
         existing_records : dict[str, Record] | None
             The mapping of existing records that the analysis has run successfully.
-        prov_payload : InToToPayload | None
+        provenance_payload : InTotoPayload | None
             The provenance intoto payload for the analyzed software component.
 
         Returns
@@ -292,8 +297,9 @@ def run_single(
         """
         repo_id = config.get_value("id")
         component = None
+        provenance_finder = ProvenanceFinder()
         try:
-            component = self.add_component(config, analysis, existing_records)
+            component = self.add_component(config, analysis, provenance_finder, existing_records, provenance_payload)
         except PURLNotFoundError as error:
             logger.error(error)
             return Record(
@@ -321,7 +327,10 @@ def run_single(
         analyze_ctx.dynamic_data["expectation"] = self.expectations.get_expectation_for_target(
             analyze_ctx.component.purl.split("@")[0]
         )
-        analyze_ctx.dynamic_data["provenance"] = prov_payload
+        if not provenance_payload:
+            # Retrieve the provenance file from the finder. May also be None.
+            provenance_payload = provenance_finder.last_provenance_payload
+        analyze_ctx.dynamic_data["provenance"] = provenance_payload
         analyze_ctx.check_results = self.perform_checks(analyze_ctx)
 
         return Record(
@@ -441,7 +450,12 @@ class AnalysisTarget(NamedTuple):
         digest: str
 
     def add_component(
-        self, config: Configuration, analysis: Analysis, existing_records: dict[str, Record] | None = None
+        self,
+        config: Configuration,
+        analysis: Analysis,
+        provenance_finder: ProvenanceFinder,
+        existing_records: dict[str, Record] | None = None,
+        provenance_payload: InTotoPayload | None = None,
     ) -> Component:
         """Add a software component if it does not exist in the DB already.
 
@@ -454,8 +468,12 @@ def add_component(
             The configuration for running Macaron.
         analysis: Analysis
             The current analysis instance.
+        provenance_finder: ProvenanceFinder
+            The provenance finder object to use when finding provenance.
         existing_records : dict[str, Record] | None
             The mapping of existing records that the analysis has run successfully.
+        provenance_payload : InTotoPayload | None
+            The provenance in-toto payload for the software component.
 
         Returns
         -------
@@ -472,7 +490,9 @@ def add_component(
         # Note: the component created in this function will be added to the database.
         available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname]
         try:
-            analysis_target = Analyzer.to_analysis_target(config, available_domains)
+            analysis_target = Analyzer.to_analysis_target(
+                config, available_domains, provenance_finder, provenance_payload
+            )
         except InvalidPURLError as error:
             raise PURLNotFoundError("Invalid input PURL.") from error
 
@@ -528,7 +548,12 @@ def add_component(
         return Component(purl=analysis_target.parsed_purl.to_string(), analysis=analysis, repository=repository)
 
     @staticmethod
-    def to_analysis_target(config: Configuration, available_domains: list[str]) -> AnalysisTarget:
+    def to_analysis_target(
+        config: Configuration,
+        available_domains: list[str],
+        provenance_finder: ProvenanceFinder | None = None,
+        provenance_payload: InTotoPayload | None = None,
+    ) -> AnalysisTarget:
         """Resolve the details of a software component from user input.
 
         Parameters
@@ -538,6 +563,10 @@ def to_analysis_target(config: Configuration, available_domains: list[str]) -> A
         available_domains : list[str]
             The list of supported git service host domain. This is used to convert repo-based PURL to a repository path
             of the corresponding software component.
+        provenance_finder: ProvenanceFinder | None
+            The provenance finder object to use when finding provenance, or None to skip provenance finding.
+        provenance_payload : InTotoPayload | None
+            The provenance in-toto payload for the software component.
 
         Returns
         -------
@@ -587,10 +616,29 @@ def to_analysis_target(config: Configuration, available_domains: list[str]) -> A
             case (_, ""):
                 # If a PURL but no repository path is provided, we try to extract the repository path from the PURL.
                 # Note that we can't always extract the repository path from any provided PURL.
-                repo = ""
                 converted_repo_path = None
+                repo: str = ""
+                digest: str = ""
                 # parsed_purl cannot be None here, but mypy cannot detect that without some extra help.
                 if parsed_purl is not None:
+                    # Try to find repository and commit via provenance.
+                    if not provenance_payload and provenance_finder:
+                        provenance_payload = provenance_finder.find_provenance(parsed_purl)
+                    if provenance_payload:
+                        try:
+                            repo, digest = extract_repo_and_commit_from_provenance(provenance_payload)
+                        except ProvenanceExtractionException as error:
+                            logger.debug("Failed to extract repo and commit from provenance: %s", error)
+
+                    if repo and digest:
+                        return Analyzer.AnalysisTarget(
+                            parsed_purl=parsed_purl,
+                            repo_path=repo,
+                            branch="",
+                            digest=digest,
+                        )
+
+                    # The commit was not found from provenance. Proceed with Repo and Commit Finder.
                     converted_repo_path = repo_finder.to_repo_path(parsed_purl, available_domains)
                     if converted_repo_path is None:
                         # Try to find repo from PURL
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
index 4e10f3ca8..1833e41be 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
@@ -10,6 +10,31 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
+# The full list of cryptographic algorithms supported in in-toto v0.1 provenance.
+# These are used as keys within the digest set of the resource descriptors within the subject.
+# For v0.1 see: https://github.com/in-toto/attestation/blob/main/spec/v0.1.0/field_types.md#DigestSet
+VALID_ALGORITHMS = [
+    "sha256",
+    "sha224",
+    "sha384",
+    "sha512",
+    "sha512_224",
+    "sha512_256",
+    "sha3_224",
+    "sha3_256",
+    "sha3_384",
+    "sha3_512",
+    "shake128",
+    "shake256",
+    "blake2b",
+    "blake2s",
+    "ripemd160",
+    "sm3",
+    "gost",
+    "sha1",
+    "md5",
+]
+
 
 class InTotoV01Statement(TypedDict):
     """An in-toto version 0.1 statement.
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
index 8133635b4..9f1b95eb7 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
@@ -11,9 +11,9 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The full list of cryptographic algorithms supported in SLSA v1 provenance. These are used as keys within the digest
-#  set of the resource descriptors within the subject.
-# See: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
+# The full list of cryptographic algorithms supported in in-toto v1 provenance.
+# These are used as keys within the digest set of the resource descriptors within the subject.
+# For v1 see: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
 VALID_ALGORITHMS = [
     "sha256",
     "sha224",

From ce120a76c586cd4754169a2907293b4f536e6c7e Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 12 Mar 2024 12:47:50 +1000
Subject: [PATCH 06/25] chore: use separate exception for json extract issues;
 remove redundant property from java repo finder; handle case where npm API
 returns no version; improve provenance extractor tests.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       |  99 ++--
 .../package_registry/npm_registry.py          |  43 ++
 .../repo_finder/test_provenance_extractor.py  | 455 ++++++++++++++++++
 3 files changed, 561 insertions(+), 36 deletions(-)
 create mode 100644 tests/repo_finder/test_provenance_extractor.py

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 59196ec44..81726aed4 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -38,23 +38,27 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
     repo = ""
     commit = ""
     predicate_type = payload.statement.get("predicateType")
-    if isinstance(payload, InTotoV1Payload):
-        if predicate_type == "https://slsa.dev/provenance/v1":
-            repo, commit = _extract_from_slsa_v1(payload)
-    elif isinstance(payload, InTotoV01Payload):
-        if predicate_type == "https://slsa.dev/provenance/v0.2":
-            repo, commit = _extract_from_slsa_v02(payload)
-        if predicate_type == "https://slsa.dev/provenance/v0.1":
-            repo, commit = _extract_from_slsa_v01(payload)
-        if predicate_type == "https://witness.testifysec.com/attestation-collection/v0.1":
-            repo, commit = _extract_from_witness_provenance(payload)
+    try:
+        if isinstance(payload, InTotoV1Payload):
+            if predicate_type == "https://slsa.dev/provenance/v1":
+                repo, commit = _extract_from_slsa_v1(payload)
+        elif isinstance(payload, InTotoV01Payload):
+            if predicate_type == "https://slsa.dev/provenance/v0.2":
+                repo, commit = _extract_from_slsa_v02(payload)
+            if predicate_type == "https://slsa.dev/provenance/v0.1":
+                repo, commit = _extract_from_slsa_v01(payload)
+            if predicate_type == "https://witness.testifysec.com/attestation-collection/v0.1":
+                repo, commit = _extract_from_witness_provenance(payload)
+    except JsonExtractionException as error:
+        logger.debug(error)
+        raise ProvenanceExtractionException("JSON exception while extracting from provenance.") from error
 
     if not repo or not commit:
         msg = (
             f"Extraction from provenance not supported for versions: "
             f"predicate_type {predicate_type}, in-toto {str(type(payload))}."
         )
-        logger.error(msg)
+        logger.debug(msg)
         raise ProvenanceExtractionException(msg)
 
     logger.debug("Extracted repo and commit from provenance: %s, %s", repo, commit)
@@ -69,18 +73,18 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
 
     # The repository URL and commit are stored inside an entry in the list of predicate -> materials.
     # In predicate -> recipe -> definedInMaterial we find the list index that points to the correct entry.
-    list_index = _json_extract(predicate, ["recipe", "definedInMaterial"], int)
-    material_list = _json_extract(predicate, ["materials"], list)
+    list_index = json_extract(predicate, ["recipe", "definedInMaterial"], int)
+    material_list = json_extract(predicate, ["materials"], list)
     if list_index >= len(material_list):
         raise ProvenanceExtractionException("Material list index outside of material list bounds.")
     material = material_list[list_index]
     if not material or not isinstance(material, dict):
         raise ProvenanceExtractionException("Indexed material list entry is invalid.")
 
-    uri = _json_extract(material, ["uri"], str)
+    uri = json_extract(material, ["uri"], str)
     repo = _clean_spdx(uri)
 
-    digest_set = _json_extract(material, ["digest"], dict)
+    digest_set = json_extract(material, ["digest"], dict)
     commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
 
     return repo, commit
@@ -94,10 +98,10 @@ def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
 
     # The repository URL and commit are stored within the predicate -> invocation -> configSource object.
     # See https://slsa.dev/spec/v0.2/provenance
-    uri = _json_extract(predicate, ["invocation", "configSource", "uri"], str)
+    uri = json_extract(predicate, ["invocation", "configSource", "uri"], str)
     repo = _clean_spdx(uri)
 
-    digest_set = _json_extract(predicate, ["invocation", "configSource", "digest"], dict)
+    digest_set = json_extract(predicate, ["invocation", "configSource", "digest"], dict)
     commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
 
     return repo, commit
@@ -109,33 +113,33 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
     if not predicate:
         raise ProvenanceExtractionException("No predicate in payload statement.")
 
-    build_def = _json_extract(predicate, ["buildDefinition"], dict)
-    build_type = _json_extract(build_def, ["buildType"], str)
+    build_def = json_extract(predicate, ["buildDefinition"], dict)
+    build_type = json_extract(build_def, ["buildType"], str)
 
     # Extract the repository URL.
     repo = ""
     if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
         try:
-            repo = _json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
-        except ProvenanceExtractionException:
-            repo = _json_extract(build_def, ["externalParameters", "configSource", "repository"], str)
+            repo = json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
+        except JsonExtractionException:
+            repo = json_extract(build_def, ["externalParameters", "configSource", "repository"], str)
     if build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
-        repo = _json_extract(build_def, ["externalParameters", "workflow", "repository"], str)
+        repo = json_extract(build_def, ["externalParameters", "workflow", "repository"], str)
 
     if not repo:
         raise ProvenanceExtractionException("Failed to extract repository URL from provenance.")
 
     # Extract the commit hash.
     commit = ""
-    deps = _json_extract(build_def, ["resolvedDependencies"], list)
+    deps = json_extract(build_def, ["resolvedDependencies"], list)
     for dep in deps:
         if not isinstance(dep, dict):
             continue
-        uri = _json_extract(dep, ["uri"], str)
+        uri = json_extract(dep, ["uri"], str)
         url = _clean_spdx(uri)
         if url != repo:
             continue
-        digest_set = _json_extract(dep, ["digest"], dict)
+        digest_set = json_extract(dep, ["digest"], dict)
         commit = _extract_commit_from_digest_set(digest_set, intoto.v1.VALID_ALGORITHMS)
 
     if not commit:
@@ -164,7 +168,7 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
     if not predicate:
         raise ProvenanceExtractionException("No predicate in payload statement.")
 
-    attestations = _json_extract(predicate, ["attestations"], list)
+    attestations = json_extract(predicate, ["attestations"], list)
     commit = ""
     repo = ""
     for entry in attestations:
@@ -174,11 +178,11 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
         if not entry_type:
             continue
         if entry_type.startswith("https://witness.dev/attestations/git/"):
-            commit = _json_extract(entry, ["attestation", "commithash"], str)
+            commit = json_extract(entry, ["attestation", "commithash"], str)
         elif entry_type.startswith("https://witness.dev/attestations/gitlab/") or entry_type.startswith(
             "https://witness.dev/attestations/github/"
         ):
-            repo = _json_extract(entry, ["attestation", "projecturl"], str)
+            repo = json_extract(entry, ["attestation", "projecturl"], str)
 
     if not commit or not repo:
         raise ProvenanceExtractionException("Could not extract repo and commit from provenance.")
@@ -212,25 +216,48 @@ def _clean_spdx(uri: str) -> str:
     return url
 
 
+class JsonExtractionException(Exception):
+    """Raised when a value cannot be found, or fails validation, while extracting from JSON."""
+
+
 T = TypeVar("T", bound=JsonType)
 
 
-def _json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -> T:
-    """Return the value found by following the list of depth-sequential keys inside the passed dictionary.
+def json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -> T:
+    """Return the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
+
+    The value must be of the passed type and non-empty; numeric values such as 0 are accepted.
+
+    Parameters
+    ----------
+    entry: dict[str, JsonType]
+        An entry point into the JSON structure.
+    keys: list[str]
+        The list of depth-sequential keys within the JSON.
+    type_: type[T]
+        The type to check the value against and return it as.
 
-    The value's type is validated against the passed type.
+    Returns
+    -------
+    T:
+        The found value as the type of the type parameter.
+
+    Raises
+    ------
+    JsonExtractionException
+        Raised if an error occurs while searching for or validating the value.
     """
     target = entry
     for index, key in enumerate(keys):
         if key not in target:
-            raise ProvenanceExtractionException(f"JSON key not found: {key}")
+            raise JsonExtractionException(f"JSON key not found: {key}")
         next_target = target[key]
         if index == len(keys) - 1:
-            if isinstance(next_target, type_):
+            if isinstance(next_target, type_) and (next_target or isinstance(next_target, (int, float))):
                 return next_target
         else:
             if not isinstance(next_target, dict):
-                raise ProvenanceExtractionException(f"Extract value from non-dict type: {str(type(next_target))}")
+                raise JsonExtractionException(f"Cannot extract value from non-dict type: {str(type(next_target))}")
             target = next_target
 
-    raise ProvenanceExtractionException(f"Failed to find '{' > '.join(keys)}' as type '{type_}' in JSON dictionary.")
+    raise JsonExtractionException(f"Failed to find '{' > '.join(keys)}' as type '{type_}' in JSON dictionary.")
diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
index 6ceb01967..1e38486ae 100644
--- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
@@ -185,6 +185,8 @@ def download_attestation_payload(self, url: str, download_path: str) -> bool:
                 logger.debug("dsseEnvelope attribute in the bundle is missing. Skipping...")
                 continue
 
+            logger.debug("Found attestation with valid predicateType: %s", att.get("predicateType"))
+
             try:
                 with open(download_path, "w", encoding="utf-8") as file:
                     json.dump(dsse_env, file)
@@ -199,6 +201,47 @@ def download_attestation_payload(self, url: str, download_path: str) -> bool:
 
         return False
 
+    def get_latest_version(self, namespace: str, name: str) -> str | None:
+        """Try to retrieve the latest version of a package from the registry.
+
+        Parameters
+        ----------
+        namespace: str
+            The optional namespace of the package.
+        name: str
+            The name of the package.
+
+        Returns
+        -------
+        str | None
+            The latest version of the package, or None if it cannot be found or the response is not valid JSON.
+        """
+        if not name:
+            return None
+
+        url = f"https://{self.hostname}"
+        if namespace:
+            url = f"{url}/{namespace}"
+        url = f"{url}/{name}/latest"
+
+        response = send_get_http_raw(url, timeout=self.request_timeout)
+
+        if not response or not response.text:
+            logger.debug("No valid response from NPM server for latest version.")
+            return None
+
+        try:
+            json_data = json.loads(response.text)
+        except ValueError:
+            json_data = None
+        version = json_data.get("version") if isinstance(json_data, dict) else None
+        if not version:
+            logger.debug("No version found in response from NPM server.")
+            return None
+
+        logger.debug("Found version for NPM artifact: %s", version)
+        return version if isinstance(version, str) else str(version)
+
 
 class NPMAttestationAsset(NamedTuple):
     """An attestation asset hosted on the npm registry.
diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
new file mode 100644
index 000000000..dc4045ce0
--- /dev/null
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -0,0 +1,455 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module tests the provenance extractor on valid example provenances."""
+import json
+
+import pytest
+
+from macaron.repo_finder.provenance_extractor import (
+    JsonExtractionException,
+    ProvenanceExtractionException,
+    extract_repo_and_commit_from_provenance,
+    json_extract,
+)
+from macaron.slsa_analyzer.provenance.intoto import validate_intoto_payload
+from macaron.util import JsonType
+
+
+@pytest.fixture(name="slsa_v1_gcb_1_provenance")
+def slsa_v1_gcb_1_provenance_() -> str:
+    """Return a valid SLSA v1 provenance using build type gcb and sourceToBuild."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v1",
+                    "subject": [],
+                    "predicateType": "https://slsa.dev/provenance/v1",
+                    "predicate": {
+                        "buildDefinition": {
+                            "buildType": "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1",
+                            "externalParameters": {
+                                "sourceToBuild": {
+                                    "repository": "https://github.com/oracle/macaron"
+                                }
+                            },
+                            "resolvedDependencies": [
+                                {
+                                    "uri": "git+https://github.com/oracle/macaron@refs/heads/staging",
+                                    "digest": { "sha1": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0" }
+                                }
+                            ]
+                        }
+                    }
+                }
+            """
+
+
+@pytest.fixture(name="slsa_v1_gcb_2_provenance")
+def slsa_v1_gcb_2_provenance_() -> str:
+    """Return a valid SLSA v1 provenance using build type gcb and configSource."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v1",
+                    "subject": [],
+                    "predicateType": "https://slsa.dev/provenance/v1",
+                    "predicate": {
+                        "buildDefinition": {
+                            "buildType": "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1",
+                            "externalParameters": {
+                                "configSource": {
+                                    "repository": "https://github.com/oracle/macaron"
+                                }
+                            },
+                            "resolvedDependencies": [
+                                {
+                                    "uri": "git+https://github.com/oracle/macaron@refs/heads/staging",
+                                    "digest": {
+                                        "sha1": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                    }
+                                }
+                            ]
+                        }
+                    }
+                }
+            """
+
+
+@pytest.fixture(name="slsa_v1_github_provenance")
+def slsa_v1_github_provenance_() -> str:
+    """Return a valid SLSA v1 provenance using build type GitHub."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v1",
+                    "subject": [],
+                    "predicateType": "https://slsa.dev/provenance/v1",
+                    "predicate": {
+                        "buildDefinition": {
+                            "buildType": "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1",
+                            "externalParameters": {
+                                "workflow": {
+                                    "repository": "https://github.com/oracle/macaron"
+                                }
+                            },
+                            "resolvedDependencies": [
+                                {
+                                    "uri": "git+https://github.com/oracle/macaron@refs/heads/staging",
+                                    "digest": {
+                                       "gitCommit": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                    }
+                                },
+                                {
+                                    "uri": "git+https://github.com/oracle-samples/macaron@refs/heads/main"
+                                }
+                            ]
+                        }
+                    }
+                }
+            """
+
+
+@pytest.fixture(name="slsa_v02_provenance")
+def slsa_v02_provenance_() -> str:
+    """Return a valid SLSA v02 provenance."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v0.1",
+                    "subject": [],
+                    "predicateType": "https://slsa.dev/provenance/v0.2",
+                    "predicate": {
+                        "invocation": {
+                            "configSource": {
+                                "uri": "git+https://github.com/oracle/macaron@refs/heads/staging",
+                                "digest": {
+                                    "sha1": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                }
+                            }
+                        }
+                    }
+                }
+            """
+
+
+@pytest.fixture(name="slsa_v01_provenance")
+def slsa_v01_provenance_() -> str:
+    """Return a valid SLSA v01 provenance."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v0.1",
+                    "subject": [],
+                    "predicateType": "https://slsa.dev/provenance/v0.1",
+                    "predicate": {
+                        "recipe": {
+                            "definedInMaterial": 1
+                        },
+                        "materials": [
+                            {
+                                "uri": "git+https://github.com/oracle-samples/macaron@refs/heads/main"
+                            },
+                            {
+                                "uri": "git+https://github.com/oracle/macaron@refs/heads/main",
+                                "digest": {
+                                    "sha256": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                }
+                            }
+                        ]
+                    }
+                }
+            """
+
+
+@pytest.fixture(name="target_repository")
+def target_repository_() -> str:
+    """Return the target repository URL."""
+    return "https://github.com/oracle/macaron"
+
+
+@pytest.fixture(name="target_commit")
+def target_commit_() -> str:
+    """Return the target commit hash."""
+    return "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+
+
+def test_slsa_v1_gcb_1(slsa_v1_gcb_1_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test SLSA v1 provenance with build type gcb and sourceToBuild."""
+    payload = json.loads(slsa_v1_gcb_1_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set repository to an empty string.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], "")
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove repository key.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], None)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Add repository back.
+    _json_modify(
+        payload,
+        ["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"],
+        target_repository,
+    )
+    # Re-test provenance validity.
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Remove commit.
+    _json_modify(payload, ["predicate", "buildDefinition", "resolvedDependencies"], None)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+def test_slsa_v1_gcb_2(slsa_v1_gcb_2_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test SLSA v1 provenance with build type gcb and configSource."""
+    payload = json.loads(slsa_v1_gcb_2_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set repository to an empty string.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], "")
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove repository key.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], None)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Re-add repository key with a bad value.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], "bad")
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+def test_slsa_v1_github(slsa_v1_github_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test SLSA v1 provenance with build type GitHub."""
+    payload = json.loads(slsa_v1_github_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set repository to an empty string.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], "")
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove repository key.
+    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], None)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+def test_slsa_v02(slsa_v02_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test SLSA v0.2 provenance."""
+    payload = json.loads(slsa_v02_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set repository to an empty string.
+    _json_modify(payload, ["predicate", "invocation", "configSource", "uri"], "")
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove repository key.
+    _json_modify(payload, ["predicate", "invocation", "configSource", "uri"], None)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Re-add repository and re-validate.
+    _json_modify(
+        payload, ["predicate", "invocation", "configSource", "uri"], f"git+{target_repository}@refs/heads/main"
+    )
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Remove commit.
+    _json_modify(payload, ["predicate", "invocation", "configSource", "digest", "sha1"], None)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+def test_slsa_v01(slsa_v01_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test SLSA v0.1 provenance."""
+    payload = json.loads(slsa_v01_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set repository to an empty string.
+    materials = json_extract(payload, ["predicate", "materials"], list)
+    material_index = json_extract(payload, ["predicate", "recipe", "definedInMaterial"], int)
+    _json_modify(materials[material_index], ["uri"], "")
+    _json_modify(payload, ["predicate", "materials"], materials)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove repository.
+    _json_modify(materials[material_index], ["uri"], None)
+    _json_modify(payload, ["predicate", "materials"], materials)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Restore repository and re-validate.
+    _json_modify(materials[material_index], ["uri"], f"git+{target_repository}@refs/heads/main")
+    _json_modify(payload, ["predicate", "materials"], materials)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set material index to an invalid value.
+    _json_modify(payload, ["predicate", "recipe", "definedInMaterial"], 10)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+@pytest.fixture(name="witness_gitlab_provenance")
+def witness_gitlab_provenance_() -> str:
+    """Return a Witness v0.1 provenance with a GitLab attestation."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v0.1",
+                    "subject": [],
+                    "predicateType": "https://witness.testifysec.com/attestation-collection/v0.1",
+                    "predicate": {
+                        "name": "test",
+                        "attestations": [
+                            {
+                                "type": "https://witness.dev/attestations/gitlab/v0.1",
+                                "attestation": {
+                                    "projecturl": "https://github.com/oracle/macaron"
+                                }
+                            },
+                            {
+                                "type": "https://witness.dev/attestations/git/v0.1",
+                                "attestation": {
+                                    "commithash": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                }
+                            }
+                        ]
+                    }
+                }
+            """
+
+
+@pytest.fixture(name="witness_github_provenance")
+def witness_github_provenance_() -> str:
+    """Return a Witness v0.1 provenance with a GitHub attestation."""
+    return """
+                {
+                    "_type": "https://in-toto.io/Statement/v0.1",
+                    "subject": [],
+                    "predicateType": "https://witness.testifysec.com/attestation-collection/v0.1",
+                    "predicate": {
+                        "name": "test",
+                        "attestations": [
+                            {
+                                "type": "https://witness.dev/attestations/github/v0.1",
+                                "attestation": {
+                                    "projecturl": "https://github.com/oracle/macaron"
+                                }
+                            },
+                            {
+                                "type": "https://witness.dev/attestations/git/v0.1",
+                                "attestation": {
+                                    "commithash": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                }
+                            }
+                        ]
+                    }
+                }
+            """
+
+
+def test_witness_gitlab(witness_gitlab_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test Witness v0.1 GitLab provenance, including empty or missing repository and commit values."""
+    payload = json.loads(witness_gitlab_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set repository to an empty string.
+    attestations = json_extract(payload, ["predicate", "attestations"], list)
+    _json_modify(attestations[0], ["attestation", "projecturl"], "")
+    _json_modify(payload, ["predicate", "attestations"], attestations)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove repository.
+    _json_modify(attestations[0], ["attestation", "projecturl"], None)
+    _json_modify(payload, ["predicate", "attestations"], attestations)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Restore repository and re-validate.
+    _json_modify(attestations[0], ["attestation", "projecturl"], target_repository)
+    _json_modify(payload, ["predicate", "attestations"], attestations)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+    # Set commit to an empty string.
+    _json_modify(attestations[1], ["attestation", "commithash"], "")
+    _json_modify(payload, ["predicate", "attestations"], attestations)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+    # Remove the Git attestation.
+    _json_modify(payload, ["predicate", "attestations"], attestations[:1])
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+def test_witness_github(witness_github_provenance: str, target_repository: str, target_commit: str) -> None:
+    """Test Witness v01 GitHub provenance."""
+    payload = json.loads(witness_github_provenance)
+    assert isinstance(payload, dict)
+    _perform_provenance_comparison(payload, target_repository, target_commit)
+
+
+@pytest.mark.parametrize(
+    ("type_", "predicate_type"),
+    [
+        ("https://in-toto.io/Statement/v0.1", "https://slsa.dev/provenance/v1"),
+        ("https://in-toto.io/Statement/v1", "https://slsa.dev/provenance/v0.2"),
+        ("https://in-toto.io/Statement/v1", "https://slsa.dev/provenance/v0.1"),
+        ("https://in-toto.io/Statement/v1", "https://witness.testifysec.com/attestation-collection/v0.1"),
+    ],
+)
+def test_invalid_type_payloads(type_: str, predicate_type: str) -> None:
+    """Test payloads with invalid type combinations."""
+    payload_text = '{ "_type": ' + f'"{type_}",' + ' "predicateType": ' + f'"{predicate_type}",'
+    payload_text = f"{payload_text}" + '"subject": [], "predicate": {} }'
+    payload = json.loads(payload_text)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(payload, "", "")
+
+
+def _perform_provenance_comparison(payload: JsonType, expected_repo: str, expected_commit: str) -> None:
+    """Validate the payload as in-toto provenance and assert the extracted repo and commit match the expected ones."""
+    assert isinstance(payload, dict)
+    provenance = validate_intoto_payload(payload)
+    repo, commit = extract_repo_and_commit_from_provenance(provenance)
+    assert expected_repo == repo
+    assert expected_commit == commit
+
+
+def _json_modify(entry: dict[str, JsonType], keys: list[str], new_value: JsonType) -> None:
+    """Modify the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
+
+    The found value will be overwritten by the new_value parameter.
+    If new_value is None, the value will be removed.
+    If the final key does not exist, it will be created as new_value.
+    """
+    target = entry
+    for index, key in enumerate(keys):
+        if key not in target:
+            if index == len(keys) - 1:
+                # Add key.
+                target[key] = new_value
+                return
+            raise JsonExtractionException(f"JSON key not found: {key}")
+        next_target = target[key]
+        if index == len(keys) - 1:
+            if new_value is None:
+                # Remove value.
+                del target[key]
+            else:
+                # Replace value
+                target[key] = new_value
+        else:
+            if not isinstance(next_target, dict):
+                raise JsonExtractionException(f"Cannot extract value from non-dict type: {str(type(next_target))}")
+            target = next_target

From 1d1a085c0f281502acee94445a4dc08f4bf26cf8 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Wed, 13 Mar 2024 12:49:51 +1000
Subject: [PATCH 07/25] chore: refactor stateful provenance finder.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/repo_finder/provenance_finder.py |  16 +--
 src/macaron/slsa_analyzer/analyzer.py        | 128 +++++++++++--------
 tests/repo_finder/test_repo_finder.py        |   3 +-
 tests/slsa_analyzer/test_analyzer.py         |  10 +-
 4 files changed, 91 insertions(+), 66 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_finder.py b/src/macaron/repo_finder/provenance_finder.py
index 957092593..fc4df1126 100644
--- a/src/macaron/repo_finder/provenance_finder.py
+++ b/src/macaron/repo_finder/provenance_finder.py
@@ -25,7 +25,6 @@ class ProvenanceFinder:
     """This class is used to find and retrieve provenance files from supported registries."""
 
     def __init__(self) -> None:
-        self.last_provenance_payload: InTotoPayload | None = None
         registries = PACKAGE_REGISTRIES
         self.npm_registry: NPMRegistry | None = None
         self.jfrog_registry: JFrogMavenRegistry | None = None
@@ -53,23 +52,18 @@ def find_provenance(self, purl: PackageURL) -> InTotoPayload | None:
             # Do not perform this function for repository type targets.
             return None
 
-        self.last_provenance_payload = None
-
         if purl.type == "npm":
             if self.npm_registry:
-                self.last_provenance_payload = ProvenanceFinder.find_npm_provenance(purl, self.npm_registry)
-            else:
-                logger.debug("Missing npm registry to find provenance in.")
+                return ProvenanceFinder.find_npm_provenance(purl, self.npm_registry)
+            logger.debug("Missing npm registry to find provenance in.")
         elif purl.type in ["gradle", "maven"]:
             if self.jfrog_registry:
-                self.last_provenance_payload = ProvenanceFinder.find_gav_provenance(purl, self.jfrog_registry)
-            else:
-                logger.debug("Missing JFrog registry to find provenance in.")
+                return ProvenanceFinder.find_gav_provenance(purl, self.jfrog_registry)
+            logger.debug("Missing JFrog registry to find provenance in.")
         else:
             logger.debug("Provenance finding not supported for PURL type: %s", purl.type)
-            self.last_provenance_payload = None
 
-        return self.last_provenance_payload
+        return None
 
     @staticmethod
     def find_npm_provenance(purl: PackageURL, npm_registry: NPMRegistry) -> InTotoPayload | None:
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 599991357..04b2c48a5 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -295,11 +295,44 @@ def run_single(
         Record
             The record of the analysis for this repository.
         """
+        # Parse the PURL.
         repo_id = config.get_value("id")
+        try:
+            parsed_purl = Analyzer.parse_purl(config)
+        except InvalidPURLError as error:
+            logger.error(error)
+            return Record(
+                record_id=repo_id,
+                description=str(error),
+                pre_config=config,
+                status=SCMStatus.ANALYSIS_FAILED,
+            )
+
+        if not provenance_payload and parsed_purl and not config.get_value("path"):
+            # Try to find the provenance file for the parsed PURL.
+            provenance_payload = ProvenanceFinder().find_provenance(parsed_purl)
+
+        # Create the analysis target.
+        msg = ""
+        available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname]
+        try:
+            analysis_target = Analyzer.to_analysis_target(config, available_domains, parsed_purl, provenance_payload)
+        except InvalidPURLError as error:
+            logger.debug("Invalid input PURL: %s", error)
+            msg = "Invalid input PURL."
+            analysis_target = None
+
+        if not analysis_target or (not analysis_target.parsed_purl and not analysis_target.repo_path):
+            return Record(
+                record_id=repo_id,
+                description=msg or "Cannot determine the analysis as PURL and/or repository path is not provided.",
+                pre_config=config,
+                status=SCMStatus.ANALYSIS_FAILED,
+            )
+
         component = None
-        provenance_finder = ProvenanceFinder()
         try:
-            component = self.add_component(config, analysis, provenance_finder, existing_records, provenance_payload)
+            component = self.add_component(analysis, analysis_target, existing_records)
         except PURLNotFoundError as error:
             logger.error(error)
             return Record(
@@ -327,9 +360,6 @@ def run_single(
         analyze_ctx.dynamic_data["expectation"] = self.expectations.get_expectation_for_target(
             analyze_ctx.component.purl.split("@")[0]
         )
-        if not provenance_payload:
-            # Retrieve the provenance file from the finder. May also be None.
-            provenance_payload = provenance_finder.last_provenance_payload
         analyze_ctx.dynamic_data["provenance"] = provenance_payload
         analyze_ctx.check_results = self.perform_checks(analyze_ctx)
 
@@ -451,11 +481,9 @@ class AnalysisTarget(NamedTuple):
 
     def add_component(
         self,
-        config: Configuration,
         analysis: Analysis,
-        provenance_finder: ProvenanceFinder,
+        analysis_target: AnalysisTarget,
         existing_records: dict[str, Record] | None = None,
-        provenance_payload: InTotoPayload | None = None,
     ) -> Component:
         """Add a software component if it does not exist in the DB already.
 
@@ -464,16 +492,12 @@ def add_component(
 
         Parameters
         ----------
-        config: Configuration
-            The configuration for running Macaron.
         analysis: Analysis
             The current analysis instance.
-        provenance_finder: ProvenanceFinder
-            The provenance finder object to use when finding provenance.
+        analysis_target: AnalysisTarget
+            The target of this analysis.
         existing_records : dict[str, Record] | None
             The mapping of existing records that the analysis has run successfully.
-        provenance_payload : InToToPayload | None
-            The provenance in-toto payload for the software component.
 
         Returns
         -------
@@ -488,17 +512,6 @@ def add_component(
             The component is already analyzed in the same session.
         """
         # Note: the component created in this function will be added to the database.
-        available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname]
-        try:
-            analysis_target = Analyzer.to_analysis_target(
-                config, available_domains, provenance_finder, provenance_payload
-            )
-        except InvalidPURLError as error:
-            raise PURLNotFoundError("Invalid input PURL.") from error
-
-        if not analysis_target.parsed_purl and not analysis_target.repo_path:
-            raise PURLNotFoundError("Cannot determine the analysis as PURL and/or repository path is not provided.")
-
         repository = None
         if analysis_target.repo_path:
             git_obj = self._prepare_repo(
@@ -547,11 +560,47 @@ def add_component(
         # available or not.
         return Component(purl=analysis_target.parsed_purl.to_string(), analysis=analysis, repository=repository)
 
+    @staticmethod
+    def parse_purl(config: Configuration) -> PackageURL | None:
+        """Parse the PURL provided in the input.
+
+        Parameters
+        ----------
+        config : Configuration
+            The target configuration that stores the user input values for the software component.
+
+        Returns
+        -------
+        PackageURL | None
+            The parsed PURL, or None if one was not provided as input.
+
+        Raises
+        ------
+        InvalidPURLError
+            If the PURL provided from the user is invalid.
+        """
+        # Due to the current design of Configuration class, repo_path, branch and digest are initialized
+        # as empty strings, and we assumed that they are always set with input values as non-empty strings.
+        # Therefore, their true types are ``str``, and an empty string indicates that the input value is not provided.
+        # The purl might be a PackageURL type, a string, or None, which should be reduced down to an optional
+        # PackageURL type.
+        if config.get_value("purl") is None or config.get_value("purl") == "":
+            return None
+        purl = config.get_value("purl")
+        if isinstance(purl, PackageURL):
+            return purl
+        try:
+            # Note that PackageURL.from_string sanitizes the unsafe characters in the purl string,
+            # which is user-controllable, by calling urllib's `urlsplit` function.
+            return PackageURL.from_string(purl)
+        except ValueError as error:
+            raise InvalidPURLError(f"Invalid input PURL: {purl}") from error
+
     @staticmethod
     def to_analysis_target(
         config: Configuration,
         available_domains: list[str],
-        provenance_finder: ProvenanceFinder | None = None,
+        parsed_purl: PackageURL | None,
         provenance_payload: InTotoPayload | None = None,
     ) -> AnalysisTarget:
         """Resolve the details of a software component from user input.
@@ -563,8 +612,8 @@ def to_analysis_target(
         available_domains : list[str]
             The list of supported git service host domain. This is used to convert repo-based PURL to a repository path
             of the corresponding software component.
-        provenance_finder: ProvenanceFinder
-            The provenance finder object to use when finding provenance.
+        parsed_purl: PackageURL | None
+            The PURL to use for the analysis target, or None if one has not been provided.
         provenance_payload : InToToPayload | None
             The provenance in-toto payload for the software component.
 
@@ -578,24 +627,6 @@ def to_analysis_target(
         InvalidPURLError
             If the PURL provided from the user is invalid.
         """
-        # Due to the current design of Configuration class, repo_path, branch and digest are initialized
-        # as empty strings, and we assumed that they are always set with input values as non-empty strings.
-        # Therefore, their true types are ``str``, and an empty string indicates that the input value is not provided.
-        # The purl might be a PackageURL type, a string, or None, which should be reduced down to an optional
-        # PackageURL type.
-        parsed_purl: PackageURL | None
-        if config.get_value("purl") is None or config.get_value("purl") == "":
-            parsed_purl = None
-        elif isinstance(config.get_value("purl"), PackageURL):
-            parsed_purl = config.get_value("purl")
-        else:
-            try:
-                # Note that PackageURL.from_string sanitizes the unsafe characters in the purl string,
-                # which is user-controllable, by calling urllib's `urlsplit` function.
-                parsed_purl = PackageURL.from_string(config.get_value("purl"))
-            except ValueError as error:
-                raise InvalidPURLError(f"Invalid input PURL: {config.get_value('purl')}") from error
-
         repo_path_input: str = config.get_value("path")
         input_branch: str = config.get_value("branch")
         input_digest: str = config.get_value("digest")
@@ -621,10 +652,8 @@ def to_analysis_target(
                 digest: str = ""
                 # parsed_purl cannot be None here, but mypy cannot detect that without some extra help.
                 if parsed_purl is not None:
-                    # Try to find repository and commit via provenance.
-                    if not provenance_payload and provenance_finder:
-                        provenance_payload = provenance_finder.find_provenance(parsed_purl)
                     if provenance_payload:
+                        # Try to find repository and commit via provenance.
                         try:
                             repo, digest = extract_repo_and_commit_from_provenance(provenance_payload)
                         except ProvenanceExtractionException as error:
@@ -721,7 +750,6 @@ def _prepare_repo(
             The pydriller.Git object of the repository or None if error.
         """
         # TODO: separate the logic for handling remote and local repos instead of putting them into this method.
-
         logger.info(
             "Preparing the repository for the analysis (path=%s, branch=%s, digest=%s)",
             repo_path,
diff --git a/tests/repo_finder/test_repo_finder.py b/tests/repo_finder/test_repo_finder.py
index 6b724d2e2..03b86c4d5 100644
--- a/tests/repo_finder/test_repo_finder.py
+++ b/tests/repo_finder/test_repo_finder.py
@@ -72,7 +72,8 @@ def test_resolve_analysis_target(
     config: Configuration, available_domains: list[str], expect: Analyzer.AnalysisTarget
 ) -> None:
     """Test the resolve analysis target method with valid inputs."""
-    assert Analyzer.to_analysis_target(config, available_domains) == expect
+    parsed_purl = Analyzer.parse_purl(config)
+    assert Analyzer.to_analysis_target(config, available_domains, parsed_purl) == expect
 
 
 @pytest.mark.parametrize(
diff --git a/tests/slsa_analyzer/test_analyzer.py b/tests/slsa_analyzer/test_analyzer.py
index d82d6676d..3d305590e 100644
--- a/tests/slsa_analyzer/test_analyzer.py
+++ b/tests/slsa_analyzer/test_analyzer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module tests the slsa_analyzer.Gh module."""
@@ -103,7 +103,8 @@ def test_resolve_analysis_target(
     config: Configuration, available_domains: list[str], expect: Analyzer.AnalysisTarget
 ) -> None:
     """Test the resolve analysis target method with valid inputs."""
-    assert Analyzer.to_analysis_target(config, available_domains) == expect
+    parsed_purl = Analyzer.parse_purl(config)
+    assert Analyzer.to_analysis_target(config, available_domains, parsed_purl) == expect
 
 
 @given(
@@ -136,7 +137,8 @@ def test_invalid_analysis_target(
         }
     )
     try:
-        Analyzer.to_analysis_target(config, available_domains)
+        purl = Analyzer.parse_purl(config)
+        Analyzer.to_analysis_target(config, available_domains, purl)
     except InvalidPURLError:
         pass
 
@@ -151,4 +153,4 @@ def test_invalid_analysis_target(
 def test_resolve_analysis_target_invalid_purl(config: Configuration) -> None:
     """Test the resolve analysis target method with invalid inputs."""
     with pytest.raises(InvalidPURLError):
-        Analyzer.to_analysis_target(config, [])
+        Analyzer.parse_purl(config)

From bded451d0eaf7ae8ea95e98d78bc29843f27a32b Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Wed, 13 Mar 2024 13:00:09 +1000
Subject: [PATCH 08/25] chore: use GitLab URL in GitLab provenance test.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 tests/repo_finder/test_provenance_extractor.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index dc4045ce0..44c6d2e84 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -312,13 +312,13 @@ def witness_gitlab_provenance_() -> str:
                             {
                                 "type": "https://witness.dev/attestations/gitlab/v0.1",
                                 "attestation": {
-                                    "projecturl": "https://github.com/oracle/macaron"
+                                    "projecturl": "https://gitlab.com/tinyMediaManager/tinyMediaManager"
                                 }
                             },
                             {
                                 "type": "https://witness.dev/attestations/git/v0.1",
                                 "attestation": {
-                                    "commithash": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                    "commithash": "cf6080a92d1c748ba5f05ea16529e05e5c641a49"
                                 }
                             }
                         ]
@@ -356,8 +356,10 @@ def witness_github_provenance_() -> str:
             """
 
 
-def test_witness_gitlab(witness_gitlab_provenance: str, target_repository: str, target_commit: str) -> None:
+def test_witness_gitlab(witness_gitlab_provenance: str) -> None:
     """Test Witness v01 GitLab provenance."""
+    target_repository = "https://gitlab.com/tinyMediaManager/tinyMediaManager"
+    target_commit = "cf6080a92d1c748ba5f05ea16529e05e5c641a49"
     payload = json.loads(witness_gitlab_provenance)
     assert isinstance(payload, dict)
     _perform_provenance_comparison(payload, target_repository, target_commit)

From 50c06077008b3532f6d575fff7bc120ef9a7b56c Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Wed, 13 Mar 2024 19:52:09 +1000
Subject: [PATCH 09/25] chore: further refactor analysis target callsite and
 functionality; refactor provenance extractor tests; assume one provenance per
 GAV in provenance finder; make npm registry namespace consistent.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       |   2 +-
 src/macaron/repo_finder/provenance_finder.py  |   4 +-
 src/macaron/slsa_analyzer/analyzer.py         |  36 +-
 .../package_registry/npm_registry.py          |   9 +-
 .../repo_finder/test_provenance_extractor.py  | 393 +++++++++---------
 tests/slsa_analyzer/test_analyzer.py          |   8 +-
 6 files changed, 241 insertions(+), 211 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 81726aed4..409f45538 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -216,7 +216,7 @@ def _clean_spdx(uri: str) -> str:
     return url
 
 
-class JsonExtractionException(BaseException):
+class JsonExtractionException(MacaronError):
     """When there is an error while extracting from JSON."""
 
 
diff --git a/src/macaron/repo_finder/provenance_finder.py b/src/macaron/repo_finder/provenance_finder.py
index fc4df1126..06018a13a 100644
--- a/src/macaron/repo_finder/provenance_finder.py
+++ b/src/macaron/repo_finder/provenance_finder.py
@@ -85,7 +85,7 @@ def find_npm_provenance(purl: PackageURL, npm_registry: NPMRegistry) -> InTotoPa
             logger.debug("The npm registry is not enabled.")
             return None
 
-        namespace = purl.namespace or ""
+        namespace = purl.namespace
         artifact_id = purl.name
         version = purl.version
 
@@ -219,7 +219,7 @@ def find_gav_provenance(purl: PackageURL, jfrog_registry: JFrogMavenRegistry) ->
             logger.debug("No payloads found in provenance files.")
             return None
 
-        # TODO decide what to do when multiple provenance payloads are present.
+        # We assume that there is only one provenance per GAV.
         provenance = provenances[0]
 
         return provenance
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 04b2c48a5..00f0aca91 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -21,7 +21,14 @@
 from macaron.database.database_manager import DatabaseManager, get_db_manager, get_db_session
 from macaron.database.table_definitions import Analysis, Component, Repository
 from macaron.dependency_analyzer import DependencyAnalyzer, DependencyInfo
-from macaron.errors import CloneError, DuplicateError, InvalidPURLError, PURLNotFoundError, RepoCheckOutError
+from macaron.errors import (
+    CloneError,
+    DuplicateError,
+    InvalidPURLError,
+    MacaronError,
+    PURLNotFoundError,
+    RepoCheckOutError,
+)
 from macaron.output_reporter.reporter import FileReporter
 from macaron.output_reporter.results import Record, Report, SCMStatus
 from macaron.repo_finder import repo_finder
@@ -313,23 +320,18 @@ def run_single(
             provenance_payload = ProvenanceFinder().find_provenance(parsed_purl)
 
         # Create the analysis target.
-        msg = ""
         available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname]
         try:
             analysis_target = Analyzer.to_analysis_target(config, available_domains, parsed_purl, provenance_payload)
-        except InvalidPURLError as error:
-            logger.debug("Invalid input PURL: %s", error)
-            msg = "Invalid input PURL."
-            analysis_target = None
-
-        if not analysis_target or (not analysis_target.parsed_purl and not analysis_target.repo_path):
+        except InvalidAnalysisTargetError as error:
             return Record(
                 record_id=repo_id,
-                description=msg or "Cannot determine the analysis as PURL and/or repository path is not provided.",
+                description=str(error),
                 pre_config=config,
                 status=SCMStatus.ANALYSIS_FAILED,
             )
 
+        # Create the component.
         component = None
         try:
             component = self.add_component(analysis, analysis_target, existing_records)
@@ -624,8 +626,8 @@ def to_analysis_target(
 
         Raises
         ------
-        InvalidPURLError
-            If the PURL provided from the user is invalid.
+        InvalidAnalysisTargetError
+            Raised if a valid Analysis Target cannot be created.
         """
         repo_path_input: str = config.get_value("path")
         input_branch: str = config.get_value("branch")
@@ -633,7 +635,9 @@ def to_analysis_target(
 
         match (parsed_purl, repo_path_input):
             case (None, ""):
-                return Analyzer.AnalysisTarget(parsed_purl=None, repo_path="", branch="", digest="")
+                raise InvalidAnalysisTargetError(
+                    "Cannot determine the analysis target: PURL and repository path are missing."
+                )
 
             case (None, _):
                 # If only the repository path is provided, we will use the user-provided repository path to create the
@@ -689,7 +693,9 @@ def to_analysis_target(
                 )
 
             case _:
-                return Analyzer.AnalysisTarget(parsed_purl=None, repo_path="", branch="", digest="")
+                raise InvalidAnalysisTargetError(
+                    "Cannot determine the analysis target: PURL and repository path are missing."
+                )
 
     def get_analyze_ctx(self, component: Component) -> AnalyzeContext:
         """Return the analyze context for a target component.
@@ -996,3 +1002,7 @@ def __init__(self, *args: Any, context: AnalyzeContext | None = None, **kwargs:
         """
         super().__init__(*args, **kwargs)
         self.context: AnalyzeContext | None = context
+
+
+class InvalidAnalysisTargetError(MacaronError):
+    """When a valid Analysis Target cannot be constructed."""
diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
index 1e38486ae..e62185023 100644
--- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
@@ -201,12 +201,12 @@ def download_attestation_payload(self, url: str, download_path: str) -> bool:
 
         return False
 
-    def get_latest_version(self, namespace: str, name: str) -> str | None:
+    def get_latest_version(self, namespace: str | None, name: str) -> str | None:
         """Try to retrieve the latest version of a package from the registry.
 
         Parameters
         ----------
-        namespace: str
+        namespace: str | None
             The optional namespace of the package.
         name: str
             The name of the package.
@@ -231,10 +231,7 @@ def get_latest_version(self, namespace: str, name: str) -> str | None:
             return None
 
         json_data = json.loads(response.text)
-        try:
-            version = json_data["version"]
-        except KeyError:
-            version = ""
+        version = json_data.get("version")
         if not version:
             logger.debug("No version found in response from NPM server.")
             return None
diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index 44c6d2e84..e8efffd49 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -17,9 +17,10 @@
 
 
 @pytest.fixture(name="slsa_v1_gcb_1_provenance")
-def slsa_v1_gcb_1_provenance_() -> str:
+def slsa_v1_gcb_1_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v1 provenance using build type gcb and sourceToBuild."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v1",
                     "subject": [],
@@ -42,12 +43,14 @@ def slsa_v1_gcb_1_provenance_() -> str:
                     }
                 }
             """
+    )
 
 
 @pytest.fixture(name="slsa_v1_gcb_2_provenance")
-def slsa_v1_gcb_2_provenance_() -> str:
+def slsa_v1_gcb_2_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v1 provenance using build type gcb and configSource."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v1",
                     "subject": [],
@@ -72,12 +75,14 @@ def slsa_v1_gcb_2_provenance_() -> str:
                     }
                 }
             """
+    )
 
 
 @pytest.fixture(name="slsa_v1_github_provenance")
-def slsa_v1_github_provenance_() -> str:
+def slsa_v1_github_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v1 provenance using build type GitHub."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v1",
                     "subject": [],
@@ -105,12 +110,14 @@ def slsa_v1_github_provenance_() -> str:
                     }
                 }
             """
+    )
 
 
 @pytest.fixture(name="slsa_v02_provenance")
-def slsa_v02_provenance_() -> str:
+def slsa_v02_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v02 provenance."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
                     "subject": [],
@@ -127,12 +134,14 @@ def slsa_v02_provenance_() -> str:
                     }
                 }
             """
+    )
 
 
 @pytest.fixture(name="slsa_v01_provenance")
-def slsa_v01_provenance_() -> str:
+def slsa_v01_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v01 provenance."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
                     "subject": [],
@@ -155,153 +164,14 @@ def slsa_v01_provenance_() -> str:
                     }
                 }
             """
-
-
-@pytest.fixture(name="target_repository")
-def target_repository_() -> str:
-    """Return the target repository URL."""
-    return "https://github.com/oracle/macaron"
-
-
-@pytest.fixture(name="target_commit")
-def target_commit_() -> str:
-    """Return the target commit hash."""
-    return "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
-
-
-def test_slsa_v1_gcb_1(slsa_v1_gcb_1_provenance: str, target_repository: str, target_commit: str) -> None:
-    """Test SLSA v1 provenance with build type gcb and sourceToBuild."""
-    payload = json.loads(slsa_v1_gcb_1_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Set repository to an empty string.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], "")
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Remove repository key.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], None)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Add repository back.
-    _json_modify(
-        payload,
-        ["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"],
-        target_repository,
-    )
-    # Re-test provenance validity.
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Remove commit.
-    _json_modify(payload, ["predicate", "buildDefinition", "resolvedDependencies"], None)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-
-def test_slsa_v1_gcb_2(slsa_v1_gcb_2_provenance: str, target_repository: str, target_commit: str) -> None:
-    """Test SLSA v1 provenance with build type gcb and configSource."""
-    payload = json.loads(slsa_v1_gcb_2_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Set repository to an empty string.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], "")
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Remove repository key.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], None)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Re-add repository key with a bad value.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], "bad")
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-
-def test_slsa_v1_github(slsa_v1_github_provenance: str, target_repository: str, target_commit: str) -> None:
-    """Test SLSA v1 provenance with build type GitHub."""
-    payload = json.loads(slsa_v1_github_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Set repository to an empty string.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], "")
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Remove repository key.
-    _json_modify(payload, ["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], None)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-
-def test_slsa_v02(slsa_v02_provenance: str, target_repository: str, target_commit: str) -> None:
-    """Test SLSA v0.2 provenance."""
-    payload = json.loads(slsa_v02_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Set repository to an empty string.
-    _json_modify(payload, ["predicate", "invocation", "configSource", "uri"], "")
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Remove repository key.
-    _json_modify(payload, ["predicate", "invocation", "configSource", "uri"], None)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Re-add repository and re-validate.
-    _json_modify(
-        payload, ["predicate", "invocation", "configSource", "uri"], f"git+{target_repository}@refs/heads/main"
     )
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Remove commit.
-    _json_modify(payload, ["predicate", "invocation", "configSource", "digest", "sha1"], None)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-
-def test_slsa_v01(slsa_v01_provenance: str, target_repository: str, target_commit: str) -> None:
-    """Test SLSA v0.1 provenance."""
-    payload = json.loads(slsa_v01_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Set repository to an empty string.
-    materials = json_extract(payload, ["predicate", "materials"], list)
-    material_index = json_extract(payload, ["predicate", "recipe", "definedInMaterial"], int)
-    _json_modify(materials[material_index], ["uri"], "")
-    _json_modify(payload, ["predicate", "materials"], materials)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Remove repository.
-    _json_modify(materials[material_index], ["uri"], None)
-    _json_modify(payload, ["predicate", "materials"], materials)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
-
-    # Restore repository and re-validate.
-    _json_modify(materials[material_index], ["uri"], f"git+{target_repository}@refs/heads/main")
-    _json_modify(payload, ["predicate", "materials"], materials)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
-
-    # Set material index to an invalid value.
-    _json_modify(payload, ["predicate", "recipe", "definedInMaterial"], 10)
-    with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
 
 
 @pytest.fixture(name="witness_gitlab_provenance")
-def witness_gitlab_provenance_() -> str:
+def witness_gitlab_provenance_() -> dict[str, JsonType]:
     """Return a Witness v0.1 provenance with a GitLab attestation."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
                     "subject": [],
@@ -325,12 +195,14 @@ def witness_gitlab_provenance_() -> str:
                     }
                 }
             """
+    )
 
 
 @pytest.fixture(name="witness_github_provenance")
-def witness_github_provenance_() -> str:
+def witness_github_provenance_() -> dict[str, JsonType]:
     """Return a Witness v0.1 provenance with a GitHub attestation."""
-    return """
+    return _load_and_validate_josn(
+        """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
                     "subject": [],
@@ -354,51 +226,190 @@ def witness_github_provenance_() -> str:
                     }
                 }
             """
+    )
+
+
+@pytest.fixture(name="target_repository")
+def target_repository_() -> str:
+    """Return the target repository URL."""
+    return "https://github.com/oracle/macaron"
 
 
-def test_witness_gitlab(witness_gitlab_provenance: str) -> None:
-    """Test Witness v01 GitLab provenance."""
-    target_repository = "https://gitlab.com/tinyMediaManager/tinyMediaManager"
-    target_commit = "cf6080a92d1c748ba5f05ea16529e05e5c641a49"
-    payload = json.loads(witness_gitlab_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
+@pytest.fixture(name="target_commit")
+def target_commit_() -> str:
+    """Return the target commit hash."""
+    return "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+
 
-    # Set repository to an empty string.
-    attestations = json_extract(payload, ["predicate", "attestations"], list)
-    _json_modify(attestations[0], ["attestation", "projecturl"], "")
-    _json_modify(payload, ["attestation"], attestations)
+def test_slsa_v1_gcb_1_is_valid(
+    slsa_v1_gcb_1_provenance: dict[str, JsonType], target_repository: str, target_commit: str
+) -> None:
+    """Test valid SLSA v1 provenance with build type gcb and sourceToBuild."""
+    _perform_provenance_comparison(slsa_v1_gcb_1_provenance, target_repository, target_commit)
+
+
+@pytest.mark.parametrize(
+    ("keys", "new_value"),
+    [
+        (["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], ""),
+        (["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], None),
+        (["predicate", "buildDefinition", "externalParameters", "sourceToBuild", "repository"], "bad_url"),
+        (["predicate", "buildDefinition", "resolvedDependencies"], ""),
+        (["predicate", "buildDefinition", "resolvedDependencies"], None),
+    ],
+)
+def test_slsa_v1_gcb_1_is_invalid(
+    slsa_v1_gcb_1_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
+) -> None:
+    """Test invalidly modified SLSA v1 provenance with build type gcb and sourceToBuild."""
+    _json_modify(slsa_v1_gcb_1_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
+        _perform_provenance_comparison(slsa_v1_gcb_1_provenance, "", "")
+
+
+def test_slsa_v1_gcb_2_is_valid(
+    slsa_v1_gcb_2_provenance: dict[str, JsonType], target_repository: str, target_commit: str
+) -> None:
+    """Test valid SLSA v1 provenance with build type gcb and configSource."""
+    _perform_provenance_comparison(slsa_v1_gcb_2_provenance, target_repository, target_commit)
 
-    # Remove repository.
-    _json_modify(attestations[0], ["attestation", "projecturl"], None)
-    _json_modify(payload, ["attestation"], attestations)
+
+@pytest.mark.parametrize(
+    ("keys", "new_value"),
+    [
+        (["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], ""),
+        (["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], None),
+        (["predicate", "buildDefinition", "externalParameters", "configSource", "repository"], "bad_url"),
+    ],
+)
+def test_slsa_v1_gcb_2_is_invalid(
+    slsa_v1_gcb_2_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
+) -> None:
+    """Test invalidly modified SLSA v1 provenance with build type gcb and configSource."""
+    _json_modify(slsa_v1_gcb_2_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
+        _perform_provenance_comparison(slsa_v1_gcb_2_provenance, "", "")
+
 
-    # Restore repository and re-validate.
-    _json_modify(attestations[0], ["attestation", "projecturl"], target_repository)
-    _json_modify(payload, ["attestation"], attestations)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
+def test_slsa_v1_github_is_valid(
+    slsa_v1_github_provenance: dict[str, JsonType], target_repository: str, target_commit: str
+) -> None:
+    """Test valid SLSA v1 provenance with build type GitHub."""
+    _perform_provenance_comparison(slsa_v1_github_provenance, target_repository, target_commit)
 
-    # Set commit to an empty string.
-    _json_modify(attestations[1], ["attestation", "commithash"], "")
-    _json_modify(payload, ["attestation"], attestations)
+
+@pytest.mark.parametrize(
+    ("keys", "new_value"),
+    [
+        (["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], ""),
+        (["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], None),
+        (["predicate", "buildDefinition", "externalParameters", "workflow", "repository"], "bad_url"),
+    ],
+)
+def test_slsa_v1_github_is_invalid(
+    slsa_v1_github_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
+) -> None:
+    """Test invalidly modified SLSA v1 provenance with build type GitHub."""
+    _json_modify(slsa_v1_github_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
+        _perform_provenance_comparison(slsa_v1_github_provenance, "", "")
 
-    # Remove the Git attestation.
-    _json_modify(payload, ["attestation"], attestations[:1])
+
+def test_slsa_v02_is_valid(
+    slsa_v02_provenance: dict[str, JsonType], target_repository: str, target_commit: str
+) -> None:
+    """Test SLSA v0.2 provenance."""
+    _perform_provenance_comparison(slsa_v02_provenance, target_repository, target_commit)
+
+
+@pytest.mark.parametrize(
+    ("keys", "new_value"),
+    [
+        (["predicate", "invocation", "configSource", "uri"], ""),
+        (["predicate", "invocation", "configSource", "uri"], None),
+        (["predicate", "invocation", "configSource", "uri"], "bad_url"),
+        (["predicate", "invocation", "configSource", "digest", "sha1"], ""),
+        (["predicate", "invocation", "configSource", "digest", "sha1"], None),
+    ],
+)
+def test_slsa_v02_is_invalid(slsa_v02_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType) -> None:
+    """Test invalidly modified SLSA v0.2 provenance."""
+    _json_modify(slsa_v02_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
+        _perform_provenance_comparison(slsa_v02_provenance, "", "")
+
+
+def test_slsa_v01_is_valid(
+    slsa_v01_provenance: dict[str, JsonType], target_repository: str, target_commit: str
+) -> None:
+    """Test valid SLSA v0.1 provenance."""
+    _perform_provenance_comparison(slsa_v01_provenance, target_repository, target_commit)
+
+
+@pytest.mark.parametrize(
+    "new_value",
+    [
+        "",
+        None,
+    ],
+)
+def test_slsa_v01_is_invalid(slsa_v01_provenance: dict[str, JsonType], new_value: JsonType) -> None:
+    """Test invalidly modified SLSA v0.1 provenance."""
+    materials = json_extract(slsa_v01_provenance, ["predicate", "materials"], list)
+    material_index = json_extract(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], int)
+    _json_modify(materials[material_index], ["uri"], new_value)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(slsa_v01_provenance, "", "")
 
 
-def test_witness_github(witness_github_provenance: str, target_repository: str, target_commit: str) -> None:
-    """Test Witness v01 GitHub provenance."""
-    payload = json.loads(witness_github_provenance)
-    assert isinstance(payload, dict)
-    _perform_provenance_comparison(payload, target_repository, target_commit)
+def test_slsa_v01_invalid_material_index(slsa_v01_provenance: dict[str, JsonType]) -> None:
+    """Test the SLSA v0.1 provenance with an invalid materials index."""
+    _json_modify(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], 10)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(slsa_v01_provenance, "", "")
+
+
+def test_witness_gitlab_is_valid(witness_gitlab_provenance: dict[str, JsonType]) -> None:
+    """Test valid Witness v0.1 GitLab provenance."""
+    _perform_provenance_comparison(
+        witness_gitlab_provenance,
+        "https://gitlab.com/tinyMediaManager/tinyMediaManager",
+        "cf6080a92d1c748ba5f05ea16529e05e5c641a49",
+    )
+
+
+def test_witness_github_is_valid(
+    witness_github_provenance: dict[str, JsonType], target_repository: str, target_commit: str
+) -> None:
+    """Test valid Witness v0.1 GitHub provenance."""
+    _perform_provenance_comparison(witness_github_provenance, target_repository, target_commit)
+
+
+@pytest.mark.parametrize(
+    ("keys", "new_value", "attestation_index"),
+    [
+        (["attestation", "projecturl"], "", 0),
+        (["attestation", "projecturl"], None, 0),
+        (["attestation", "commithash"], "", 1),
+        (["attestation", "commithash"], None, 1),
+    ],
+)
+def test_witness_github_is_invalid(
+    witness_github_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType, attestation_index: int
+) -> None:
+    """Test invalidly modified Witness v0.1 GitHub provenance."""
+    attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
+    _json_modify(attestations[attestation_index], keys, new_value)
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(witness_github_provenance, "", "")
+
+
+def test_witness_github_remove_attestation(witness_github_provenance: dict[str, JsonType]) -> None:
+    """Test removing Git attestation from Witness V0.1 GitHub provenance."""
+    attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
+    _json_modify(witness_github_provenance, ["predicate", "attestations"], attestations[:1])
+    with pytest.raises(ProvenanceExtractionException):
+        _perform_provenance_comparison(witness_github_provenance, "", "")
 
 
 @pytest.mark.parametrize(
@@ -419,9 +430,8 @@ def test_invalid_type_payloads(type_: str, predicate_type: str) -> None:
         _perform_provenance_comparison(payload, "", "")
 
 
-def _perform_provenance_comparison(payload: JsonType, expected_repo: str, expected_commit: str) -> None:
+def _perform_provenance_comparison(payload: dict[str, JsonType], expected_repo: str, expected_commit: str) -> None:
     """Accept a provenance and extraction function, assert the extracted values match the expected ones."""
-    assert isinstance(payload, dict)
     provenance = validate_intoto_payload(payload)
     repo, commit = extract_repo_and_commit_from_provenance(provenance)
     assert expected_repo == repo
@@ -455,3 +465,10 @@ def _json_modify(entry: dict[str, JsonType], keys: list[str], new_value: JsonTyp
             if not isinstance(next_target, dict):
                 raise JsonExtractionException(f"Cannot extract value from non-dict type: {str(type(next_target))}")
             target = next_target
+
+
+def _load_and_validate_josn(payload: str) -> dict[str, JsonType]:
+    """Load payload as JSON and validate it is of type dict."""
+    json_payload = json.loads(payload)
+    assert isinstance(json_payload, dict)
+    return json_payload
diff --git a/tests/slsa_analyzer/test_analyzer.py b/tests/slsa_analyzer/test_analyzer.py
index 3d305590e..e5f840ba6 100644
--- a/tests/slsa_analyzer/test_analyzer.py
+++ b/tests/slsa_analyzer/test_analyzer.py
@@ -13,7 +13,7 @@
 
 from macaron.config.target_config import Configuration
 from macaron.errors import InvalidPURLError
-from macaron.slsa_analyzer.analyzer import Analyzer
+from macaron.slsa_analyzer.analyzer import Analyzer, InvalidAnalysisTargetError
 
 from ..macaron_testcase import MacaronTestCase
 
@@ -154,3 +154,9 @@ def test_resolve_analysis_target_invalid_purl(config: Configuration) -> None:
     """Test the resolve analysis target method with invalid inputs."""
     with pytest.raises(InvalidPURLError):
         Analyzer.parse_purl(config)
+
+
+def test_resolve_analysis_target_no_purl_or_repository() -> None:
+    """Test creation of an Analysis Target when no PURL or repository path is provided."""
+    with pytest.raises(InvalidAnalysisTargetError):
+        Analyzer.to_analysis_target(Configuration(), [], None)

From bafebfcf24c5fa83c0aed543996429034c3471a5 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Wed, 13 Mar 2024 20:13:43 +1000
Subject: [PATCH 10/25] chore: add type for npm latest version response to help
 mypy.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/slsa_analyzer/package_registry/npm_registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
index e62185023..7786d0e1b 100644
--- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py
@@ -231,13 +231,13 @@ def get_latest_version(self, namespace: str | None, name: str) -> str | None:
             return None
 
         json_data = json.loads(response.text)
-        version = json_data.get("version")
+        version: str | None = json_data.get("version")
         if not version:
             logger.debug("No version found in response from NPM server.")
             return None
 
         logger.debug("Found version for NPM artifact: %s", version)
-        return version if isinstance(version, str) else str(version)
+        return version
 
 
 class NPMAttestationAsset(NamedTuple):

From e5e80d901f52d3cfd94ada19576a8e49010f1ee0 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 14 Mar 2024 09:05:21 +1000
Subject: [PATCH 11/25] chore: remove duplicate test; update test for analysis
 target changes.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 tests/repo_finder/test_repo_finder.py | 65 ---------------------------
 tests/slsa_analyzer/test_analyzer.py  |  5 ---
 2 files changed, 70 deletions(-)

diff --git a/tests/repo_finder/test_repo_finder.py b/tests/repo_finder/test_repo_finder.py
index 03b86c4d5..ba0bc2b20 100644
--- a/tests/repo_finder/test_repo_finder.py
+++ b/tests/repo_finder/test_repo_finder.py
@@ -6,74 +6,9 @@
 from pathlib import Path
 
 import pytest
-from packageurl import PackageURL
 
 from macaron.config.defaults import load_defaults
-from macaron.config.target_config import Configuration
 from macaron.repo_finder.repo_finder_java import JavaRepoFinder
-from macaron.slsa_analyzer.analyzer import Analyzer
-
-
-@pytest.mark.parametrize(
-    ("config", "available_domains", "expect"),
-    [
-        (
-            Configuration({"purl": ""}),
-            ["github.com", "gitlab.com", "bitbucket.org"],
-            Analyzer.AnalysisTarget(parsed_purl=None, repo_path="", branch="", digest=""),
-        ),
-        (
-            Configuration({"purl": "pkg:github.com/apache/maven"}),
-            ["github.com", "gitlab.com", "bitbucket.org"],
-            Analyzer.AnalysisTarget(
-                parsed_purl=PackageURL.from_string("pkg:github.com/apache/maven"),
-                repo_path="https://github.com/apache/maven",
-                branch="",
-                digest="",
-            ),
-        ),
-        (
-            Configuration({"purl": "", "path": "https://github.com/apache/maven"}),
-            ["github.com", "gitlab.com", "bitbucket.org"],
-            Analyzer.AnalysisTarget(
-                parsed_purl=None, repo_path="https://github.com/apache/maven", branch="", digest=""
-            ),
-        ),
-        (
-            Configuration({"purl": "pkg:maven/apache/maven", "path": "https://github.com/apache/maven"}),
-            ["github.com", "gitlab.com", "bitbucket.org"],
-            Analyzer.AnalysisTarget(
-                parsed_purl=PackageURL.from_string("pkg:maven/apache/maven"),
-                repo_path="https://github.com/apache/maven",
-                branch="",
-                digest="",
-            ),
-        ),
-        (
-            Configuration(
-                {
-                    "purl": "pkg:maven/apache/maven",
-                    "path": "https://github.com/apache/maven",
-                    "branch": "master",
-                    "digest": "abcxyz",
-                }
-            ),
-            ["github.com", "gitlab.com", "bitbucket.org"],
-            Analyzer.AnalysisTarget(
-                parsed_purl=PackageURL.from_string("pkg:maven/apache/maven"),
-                repo_path="https://github.com/apache/maven",
-                branch="master",
-                digest="abcxyz",
-            ),
-        ),
-    ],
-)
-def test_resolve_analysis_target(
-    config: Configuration, available_domains: list[str], expect: Analyzer.AnalysisTarget
-) -> None:
-    """Test the resolve analysis target method with valid inputs."""
-    parsed_purl = Analyzer.parse_purl(config)
-    assert Analyzer.to_analysis_target(config, available_domains, parsed_purl) == expect
 
 
 @pytest.mark.parametrize(
diff --git a/tests/slsa_analyzer/test_analyzer.py b/tests/slsa_analyzer/test_analyzer.py
index e5f840ba6..18e6eae59 100644
--- a/tests/slsa_analyzer/test_analyzer.py
+++ b/tests/slsa_analyzer/test_analyzer.py
@@ -48,11 +48,6 @@ def test_resolve_local_path(self) -> None:
 @pytest.mark.parametrize(
     ("config", "available_domains", "expect"),
     [
-        (
-            Configuration({"purl": ""}),
-            ["github.com", "gitlab.com", "bitbucket.org"],
-            Analyzer.AnalysisTarget(parsed_purl=None, repo_path="", branch="", digest=""),
-        ),
         (
             Configuration({"purl": "pkg:github.com/apache/maven"}),
             ["github.com", "gitlab.com", "bitbucket.org"],

From 77a88bbb2554f148e6b0fa5aa9200ac3cac67d80 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 14 Mar 2024 12:38:57 +1000
Subject: [PATCH 12/25] chore: fix typo in _load_and_validate_json helper name.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 tests/repo_finder/test_provenance_extractor.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index e8efffd49..e98bee8bc 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -19,7 +19,7 @@
 @pytest.fixture(name="slsa_v1_gcb_1_provenance")
 def slsa_v1_gcb_1_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v1 provenance using build type gcb and sourceToBuild."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v1",
@@ -49,7 +49,7 @@ def slsa_v1_gcb_1_provenance_() -> dict[str, JsonType]:
 @pytest.fixture(name="slsa_v1_gcb_2_provenance")
 def slsa_v1_gcb_2_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v1 provenance using build type gcb and configSource."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v1",
@@ -81,7 +81,7 @@ def slsa_v1_gcb_2_provenance_() -> dict[str, JsonType]:
 @pytest.fixture(name="slsa_v1_github_provenance")
 def slsa_v1_github_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v1 provenance using build type GitHub."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v1",
@@ -116,7 +116,7 @@ def slsa_v1_github_provenance_() -> dict[str, JsonType]:
 @pytest.fixture(name="slsa_v02_provenance")
 def slsa_v02_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v02 provenance."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
@@ -140,7 +140,7 @@ def slsa_v02_provenance_() -> dict[str, JsonType]:
 @pytest.fixture(name="slsa_v01_provenance")
 def slsa_v01_provenance_() -> dict[str, JsonType]:
     """Return a valid SLSA v01 provenance."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
@@ -170,7 +170,7 @@ def slsa_v01_provenance_() -> dict[str, JsonType]:
 @pytest.fixture(name="witness_gitlab_provenance")
 def witness_gitlab_provenance_() -> dict[str, JsonType]:
     """Return a Witness v0.1 provenance with a GitLab attestation."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
@@ -201,7 +201,7 @@ def witness_gitlab_provenance_() -> dict[str, JsonType]:
 @pytest.fixture(name="witness_github_provenance")
 def witness_github_provenance_() -> dict[str, JsonType]:
     """Return a Witness v0.1 provenance with a GitHub attestation."""
-    return _load_and_validate_josn(
+    return _load_and_validate_json(
         """
                 {
                     "_type": "https://in-toto.io/Statement/v0.1",
@@ -467,7 +467,7 @@ def _json_modify(entry: dict[str, JsonType], keys: list[str], new_value: JsonTyp
             target = next_target
 
 
-def _load_and_validate_josn(payload: str) -> dict[str, JsonType]:
+def _load_and_validate_json(payload: str) -> dict[str, JsonType]:
     """Load payload as JSON and validate it is of type dict."""
     json_payload = json.loads(payload)
     assert isinstance(json_payload, dict)

From cfd9a3e7c8eb0659416f37c4310b8ae964df0a0d Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 14 Mar 2024 14:15:15 +1000
Subject: [PATCH 13/25] chore: update comment to reflect immediate proceedings
 only; refactor json_extract function.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       | 38 +++++++++++--------
 src/macaron/slsa_analyzer/analyzer.py         |  2 +-
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 409f45538..7914797d5 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -82,11 +82,15 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
         raise ProvenanceExtractionException("Indexed material list entry is invalid.")
 
     uri = json_extract(material, ["uri"], str)
+
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(material, ["digest"], dict)
     commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
 
+    if not commit:
+        raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
+
     return repo, commit
 
 
@@ -99,11 +103,16 @@ def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
     # The repository URL and commit are stored within the predicate -> invocation -> configSource object.
     # See https://slsa.dev/spec/v0.2/provenance
     uri = json_extract(predicate, ["invocation", "configSource", "uri"], str)
+    if not uri:
+        raise ProvenanceExtractionException("Failed to extract repository URL from provenance.")
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(predicate, ["invocation", "configSource", "digest"], dict)
     commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
 
+    if not commit:
+        raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
+
     return repo, commit
 
 
@@ -223,15 +232,15 @@ class JsonExtractionException(MacaronError):
 T = TypeVar("T", bound=JsonType)
 
 
-def json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) -> T:
+def json_extract(entry: JsonType, keys: list[str], type_: type[T]) -> T:
     """Return the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
 
-    The value must be truthy, and be of the passed type.
+    The value must be of the passed type.
 
     Parameters
     ----------
-    entry: dict[str, JsonType]
-        An entry point into the JSON structure.
+    entry: JsonType
+        An entry point into a JSON structure.
     keys: list[str]
         The list of depth-sequential keys within the JSON.
     type: type[T]
@@ -248,16 +257,15 @@ def json_extract(entry: dict[str, JsonType], keys: list[str], type_: type[T]) ->
         Raised if an error occurs while searching for or validating the value.
     """
     target = entry
+
     for index, key in enumerate(keys):
+        if not isinstance(target, dict):
+            raise JsonExtractionException(f"Expect the value .{'.'.join(keys[:index])} to be a dict.")
         if key not in target:
-            raise JsonExtractionException(f"JSON key not found: {key}")
-        next_target = target[key]
-        if index == len(keys) - 1:
-            if next_target and isinstance(next_target, type_):
-                return next_target
-        else:
-            if not isinstance(next_target, dict):
-                raise JsonExtractionException(f"Cannot extract value from non-dict type: {str(type(next_target))}")
-            target = next_target
-
-    raise JsonExtractionException(f"Failed to find '{' > '.join(keys)}' as type '{type_}' in JSON dictionary.")
+            raise JsonExtractionException(f"JSON key '{key}' not found in .{'.'.join(keys[:index])}.")
+        target = target[key]
+
+    if isinstance(target, type_):
+        return target
+
+    raise JsonExtractionException(f"Expect the value .{'.'.join(keys)} to be of type '{type_}'.")
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 00f0aca91..5c0a38758 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -671,7 +671,7 @@ def to_analysis_target(
                             digest=digest,
                         )
 
-                    # The commit was not found from provenance. Proceed with Repo and Commit Finder.
+                    # The commit was not found from provenance. Proceed with Repo Finder.
                     converted_repo_path = repo_finder.to_repo_path(parsed_purl, available_domains)
                     if converted_repo_path is None:
                         # Try to find repo from PURL

From 488f64a0baaa66c8ffad79a6821cc55883fd9b45 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 18 Mar 2024 11:02:07 +1000
Subject: [PATCH 14/25] chore: restrict in-toto digest set algorithms; refactor
 provenance extractor tests.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       |  1 -
 src/macaron/slsa_analyzer/analyzer.py         |  4 +-
 .../provenance/intoto/v01/__init__.py         | 22 +----
 .../provenance/intoto/v1/__init__.py          | 23 +----
 .../repo_finder/test_provenance_extractor.py  | 97 +++++++++----------
 5 files changed, 55 insertions(+), 92 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 7914797d5..cca65559d 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -204,7 +204,6 @@ def _extract_commit_from_digest_set(digest_set: dict[str, JsonType], valid_algor
 
     The DigestSet is an in-toto object that maps algorithm types to commit hashes (digests).
     """
-    # TODO decide on a preference for which algorithm to accept.
     if len(digest_set.keys()) > 1:
         logger.debug("DigestSet contains multiple algorithms: %s", digest_set.keys())
 
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 5c0a38758..c3604d080 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -586,9 +586,9 @@ def parse_purl(config: Configuration) -> PackageURL | None:
         # Therefore, their true types are ``str``, and an empty string indicates that the input value is not provided.
         # The purl might be a PackageURL type, a string, or None, which should be reduced down to an optional
         # PackageURL type.
-        if config.get_value("purl") is None or config.get_value("purl") == "":
-            return None
         purl = config.get_value("purl")
+        if purl is None or purl == "":
+            return None
         if isinstance(purl, PackageURL):
             return purl
         try:
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
index 1833e41be..fb8a83963 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
@@ -10,29 +10,11 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The full list of cryptographic algorithms supported in in-toto v0.1 provenance.
+# The list of cryptographic algorithms supported in in-toto v0.1 provenance.
 # These are used as keys within the digest set of the resource descriptors within the subject.
-# For v0.1 see: https://github.com/in-toto/attestation/blob/main/spec/v0.1.0/field_types.md#DigestSet
+# For the full v0.1 list see: https://github.com/in-toto/attestation/blob/main/spec/v0.1.0/field_types.md#DigestSet
 VALID_ALGORITHMS = [
-    "sha256",
-    "sha224",
-    "sha384",
-    "sha512",
-    "sha512_224",
-    "sha512_256",
-    "sha3_224",
-    "sha3_256",
-    "sha3_384",
-    "sha3_512",
-    "shake128",
-    "shake256",
-    "blake2b",
-    "blake2s",
-    "ripemd160",
-    "sm3",
-    "gost",
     "sha1",
-    "md5",
 ]
 
 
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
index 9f1b95eb7..c6cbf75cd 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
@@ -11,29 +11,12 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The full list of cryptographic algorithms supported in in-toto v1 provenance.
+# The list of cryptographic algorithms supported in in-toto v1 provenance.
 # These are used as keys within the digest set of the resource descriptors within the subject.
-# For v1 see: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
+# For the full v1 list see: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
 VALID_ALGORITHMS = [
-    "sha256",
-    "sha224",
-    "sha384",
-    "sha512",
-    "sha512_224",
-    "sha512_256",
-    "sha3_224",
-    "sha3_256",
-    "sha3_384",
-    "sha3_512",
-    "shake128",
-    "shake256",
-    "blake2b",
-    "blake2s",
-    "ripemd160",
-    "sm3",
-    "gost",
     "sha1",
-    "md5",
+    "gitCommit",  # This special git value is equivalent to SHA-1 or SHA-256. See the v1 spec for more information.
 ]
 
 
diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index e98bee8bc..2c502d891 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -7,7 +7,6 @@
 import pytest
 
 from macaron.repo_finder.provenance_extractor import (
-    JsonExtractionException,
     ProvenanceExtractionException,
     extract_repo_and_commit_from_provenance,
     json_extract,
@@ -157,7 +156,7 @@ def slsa_v01_provenance_() -> dict[str, JsonType]:
                             {
                                 "uri": "git+https://github.com/oracle/macaron@refs/heads/main",
                                 "digest": {
-                                    "sha256": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
+                                    "sha1": "51aa22a42ec1bffa71518041a6a6d42d40bf50f0"
                                 }
                             }
                         ]
@@ -245,7 +244,7 @@ def test_slsa_v1_gcb_1_is_valid(
     slsa_v1_gcb_1_provenance: dict[str, JsonType], target_repository: str, target_commit: str
 ) -> None:
     """Test valid SLSA v1 provenance with build type gcb and sourceToBuild."""
-    _perform_provenance_comparison(slsa_v1_gcb_1_provenance, target_repository, target_commit)
+    _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_1_provenance, target_repository, target_commit)
 
 
 @pytest.mark.parametrize(
@@ -262,16 +261,16 @@ def test_slsa_v1_gcb_1_is_invalid(
     slsa_v1_gcb_1_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type gcb and sourceToBuild."""
-    _json_modify(slsa_v1_gcb_1_provenance, keys, new_value)
+    assert _json_modify(slsa_v1_gcb_1_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(slsa_v1_gcb_1_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_1_provenance)
 
 
 def test_slsa_v1_gcb_2_is_valid(
     slsa_v1_gcb_2_provenance: dict[str, JsonType], target_repository: str, target_commit: str
 ) -> None:
     """Test valid SLSA v1 provenance with build type gcb and configSource."""
-    _perform_provenance_comparison(slsa_v1_gcb_2_provenance, target_repository, target_commit)
+    _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_2_provenance, target_repository, target_commit)
 
 
 @pytest.mark.parametrize(
@@ -286,16 +285,16 @@ def test_slsa_v1_gcb_2_is_invalid(
     slsa_v1_gcb_2_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type gcb and configSource."""
-    _json_modify(slsa_v1_gcb_2_provenance, keys, new_value)
+    assert _json_modify(slsa_v1_gcb_2_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(slsa_v1_gcb_2_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_2_provenance)
 
 
 def test_slsa_v1_github_is_valid(
     slsa_v1_github_provenance: dict[str, JsonType], target_repository: str, target_commit: str
 ) -> None:
     """Test valid SLSA v1 provenance with build type GitHub."""
-    _perform_provenance_comparison(slsa_v1_github_provenance, target_repository, target_commit)
+    _test_extract_repo_and_commit_from_provenance(slsa_v1_github_provenance, target_repository, target_commit)
 
 
 @pytest.mark.parametrize(
@@ -310,16 +309,16 @@ def test_slsa_v1_github_is_invalid(
     slsa_v1_github_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type GitHub."""
-    _json_modify(slsa_v1_github_provenance, keys, new_value)
+    assert _json_modify(slsa_v1_github_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(slsa_v1_github_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(slsa_v1_github_provenance)
 
 
 def test_slsa_v02_is_valid(
     slsa_v02_provenance: dict[str, JsonType], target_repository: str, target_commit: str
 ) -> None:
     """Test SLSA v0.2 provenance."""
-    _perform_provenance_comparison(slsa_v02_provenance, target_repository, target_commit)
+    _test_extract_repo_and_commit_from_provenance(slsa_v02_provenance, target_repository, target_commit)
 
 
 @pytest.mark.parametrize(
@@ -334,16 +333,16 @@ def test_slsa_v02_is_valid(
 )
 def test_slsa_v02_is_invalid(slsa_v02_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType) -> None:
     """Test invalidly modified SLSA v0.2 provenance."""
-    _json_modify(slsa_v02_provenance, keys, new_value)
+    assert _json_modify(slsa_v02_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(slsa_v02_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(slsa_v02_provenance)
 
 
 def test_slsa_v01_is_valid(
     slsa_v01_provenance: dict[str, JsonType], target_repository: str, target_commit: str
 ) -> None:
     """Test valid SLSA v0.1 provenance."""
-    _perform_provenance_comparison(slsa_v01_provenance, target_repository, target_commit)
+    _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance, target_repository, target_commit)
 
 
 @pytest.mark.parametrize(
@@ -357,21 +356,21 @@ def test_slsa_v01_is_invalid(slsa_v01_provenance: dict[str, JsonType], new_value
     """Test invalidly modified SLSA v0.1 provenance."""
     materials = json_extract(slsa_v01_provenance, ["predicate", "materials"], list)
     material_index = json_extract(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], int)
-    _json_modify(materials[material_index], ["uri"], new_value)
+    assert _json_modify(materials[material_index], ["uri"], new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(slsa_v01_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance)
 
 
 def test_slsa_v01_invalid_material_index(slsa_v01_provenance: dict[str, JsonType]) -> None:
     """Test the SLSA v0.1 provenance with an invalid materials index."""
-    _json_modify(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], 10)
+    assert _json_modify(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], 10)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(slsa_v01_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance)
 
 
 def test_witness_gitlab_is_valid(witness_gitlab_provenance: dict[str, JsonType]) -> None:
     """Test valid Witness v0.1 GitLab provenance."""
-    _perform_provenance_comparison(
+    _test_extract_repo_and_commit_from_provenance(
         witness_gitlab_provenance,
         "https://gitlab.com/tinyMediaManager/tinyMediaManager",
         "cf6080a92d1c748ba5f05ea16529e05e5c641a49",
@@ -382,7 +381,7 @@ def test_witness_github_is_valid(
     witness_github_provenance: dict[str, JsonType], target_repository: str, target_commit: str
 ) -> None:
     """Test valid Witness v0.1 GitHub provenance."""
-    _perform_provenance_comparison(witness_github_provenance, target_repository, target_commit)
+    _test_extract_repo_and_commit_from_provenance(witness_github_provenance, target_repository, target_commit)
 
 
 @pytest.mark.parametrize(
@@ -399,17 +398,17 @@ def test_witness_github_is_invalid(
 ) -> None:
     """Test invalidly modified Witness v0.1 GitHub provenance."""
     attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
-    _json_modify(attestations[attestation_index], keys, new_value)
+    assert _json_modify(attestations[attestation_index], keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(witness_github_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(witness_github_provenance)
 
 
 def test_witness_github_remove_attestation(witness_github_provenance: dict[str, JsonType]) -> None:
     """Test removing Git attestation from Witness V0.1 GitHub provenance."""
     attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
-    _json_modify(witness_github_provenance, ["predicate", "attestations"], attestations[:1])
+    assert _json_modify(witness_github_provenance, ["predicate", "attestations"], attestations[:1])
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(witness_github_provenance, "", "")
+        _test_extract_repo_and_commit_from_provenance(witness_github_provenance)
 
 
 @pytest.mark.parametrize(
@@ -423,14 +422,14 @@ def test_witness_github_remove_attestation(witness_github_provenance: dict[str,
 )
 def test_invalid_type_payloads(type_: str, predicate_type: str) -> None:
     """Test payloads with invalid type combinations."""
-    payload_text = '{ "_type": ' + f'"{type_}",' + ' "predicateType": ' + f'"{predicate_type}",'
-    payload_text = f"{payload_text}" + '"subject": [], "predicate": {} }'
-    payload = json.loads(payload_text)
+    payload: dict[str, JsonType] = {"_type": type_, "predicateType": predicate_type, "subject": [], "predicate": {}}
     with pytest.raises(ProvenanceExtractionException):
-        _perform_provenance_comparison(payload, "", "")
+        _test_extract_repo_and_commit_from_provenance(payload)
 
 
-def _perform_provenance_comparison(payload: dict[str, JsonType], expected_repo: str, expected_commit: str) -> None:
+def _test_extract_repo_and_commit_from_provenance(
+    payload: dict[str, JsonType], expected_repo: str = "", expected_commit: str = ""
+) -> None:
     """Accept a provenance and extraction function, assert the extracted values match the expected ones."""
     provenance = validate_intoto_payload(payload)
     repo, commit = extract_repo_and_commit_from_provenance(provenance)
@@ -438,7 +437,7 @@ def _perform_provenance_comparison(payload: dict[str, JsonType], expected_repo:
     assert expected_commit == commit
 
 
-def _json_modify(entry: dict[str, JsonType], keys: list[str], new_value: JsonType) -> None:
+def _json_modify(entry: JsonType, keys: list[str], new_value: JsonType) -> bool:
     """Modify the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
 
     The found value will be overwritten by the new_value parameter.
@@ -446,25 +445,25 @@ def _json_modify(entry: dict[str, JsonType], keys: list[str], new_value: JsonTyp
     If the final key does not exist, it will be created as new_value.
     """
     target = entry
-    for index, key in enumerate(keys):
+    last_target = None
+
+    for key in keys:
+        if not isinstance(target, dict):
+            return False
         if key not in target:
-            if index == len(keys) - 1:
-                # Add key.
-                target[key] = new_value
-                return
-            raise JsonExtractionException(f"JSON key not found: {key}")
-        next_target = target[key]
-        if index == len(keys) - 1:
-            if new_value is None:
-                # Remove value.
-                del target[key]
-            else:
-                # Replace value
-                target[key] = new_value
-        else:
-            if not isinstance(next_target, dict):
-                raise JsonExtractionException(f"Cannot extract value from non-dict type: {str(type(next_target))}")
-            target = next_target
+            return False
+        last_target = target
+        target = target[key]
+
+    if last_target is None:
+        return False
+
+    if new_value is None:
+        del last_target[keys[len(keys) - 1]]
+    else:
+        last_target[keys[len(keys) - 1]] = new_value
+
+    return True
 
 
 def _load_and_validate_json(payload: str) -> dict[str, JsonType]:

From 7d995cf7a99001fc905bb2b6d4ccf98c3f983a9e Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 18 Mar 2024 11:06:18 +1000
Subject: [PATCH 15/25] chore: improve digest set debug information.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/repo_finder/provenance_extractor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index cca65559d..0d84c3f64 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -212,7 +212,7 @@ def _extract_commit_from_digest_set(digest_set: dict[str, JsonType], valid_algor
             value = digest_set.get(key)
             if isinstance(value, str):
                 return value
-    raise ProvenanceExtractionException("No valid digest in digest set.")
+    raise ProvenanceExtractionException(f"No valid digest in digest set: {digest_set.keys()} not in {valid_algorithms}")
 
 
 def _clean_spdx(uri: str) -> str:

From 9a826389cd59e927460b3ac72992e6203be98a9b Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Mon, 18 Mar 2024 11:18:19 +1000
Subject: [PATCH 16/25] chore: separate SLSA extraction digest set algorithms
 from in-toto acceptance list.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../repo_finder/provenance_extractor.py       | 12 ++++++----
 .../provenance/intoto/v01/__init__.py         | 22 +++++++++++++++++--
 .../provenance/intoto/v1/__init__.py          | 22 +++++++++++++++++--
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 0d84c3f64..fee18a90c 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -6,7 +6,6 @@
 from typing import TypeVar
 
 from macaron.errors import MacaronError
-from macaron.slsa_analyzer.provenance import intoto
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
 from macaron.util import JsonType
 
@@ -17,6 +16,11 @@ class ProvenanceExtractionException(MacaronError):
     """When there is an error while extracting from provenance."""
 
 
+SLSA_V01_DIGEST_SET_ALGORITHMS = ["sha1"]
+SLSA_V02_DIGEST_SET_ALGORITHMS = ["sha1"]
+SLSA_V1_DIGEST_SET_ALGORITHMS = ["sha1", "gitCommit"]
+
+
 def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the passed provenance payload.
 
@@ -86,7 +90,7 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(material, ["digest"], dict)
-    commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
+    commit = _extract_commit_from_digest_set(digest_set, SLSA_V01_DIGEST_SET_ALGORITHMS)
 
     if not commit:
         raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
@@ -108,7 +112,7 @@ def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(predicate, ["invocation", "configSource", "digest"], dict)
-    commit = _extract_commit_from_digest_set(digest_set, intoto.v01.VALID_ALGORITHMS)
+    commit = _extract_commit_from_digest_set(digest_set, SLSA_V02_DIGEST_SET_ALGORITHMS)
 
     if not commit:
         raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
@@ -149,7 +153,7 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
         if url != repo:
             continue
         digest_set = json_extract(dep, ["digest"], dict)
-        commit = _extract_commit_from_digest_set(digest_set, intoto.v1.VALID_ALGORITHMS)
+        commit = _extract_commit_from_digest_set(digest_set, SLSA_V1_DIGEST_SET_ALGORITHMS)
 
     if not commit:
         raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
index fb8a83963..1833e41be 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
@@ -10,11 +10,29 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The list of cryptographic algorithms supported in in-toto v0.1 provenance.
+# The full list of cryptographic algorithms supported in in-toto v0.1 provenance.
 # These are used as keys within the digest set of the resource descriptors within the subject.
-# For the full v0.1 list see: https://github.com/in-toto/attestation/blob/main/spec/v0.1.0/field_types.md#DigestSet
+# For v0.1 see: https://github.com/in-toto/attestation/blob/main/spec/v0.1.0/field_types.md#DigestSet
 VALID_ALGORITHMS = [
+    "sha256",
+    "sha224",
+    "sha384",
+    "sha512",
+    "sha512_224",
+    "sha512_256",
+    "sha3_224",
+    "sha3_256",
+    "sha3_384",
+    "sha3_512",
+    "shake128",
+    "shake256",
+    "blake2b",
+    "blake2s",
+    "ripemd160",
+    "sm3",
+    "gost",
     "sha1",
+    "md5",
 ]
 
 
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
index c6cbf75cd..fc25bcd07 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
@@ -11,11 +11,29 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The list of cryptographic algorithms supported in in-toto v1 provenance.
+# The full list of cryptographic algorithms supported in in-toto v1 provenance.
 # These are used as keys within the digest set of the resource descriptors within the subject.
-# For the full v1 list see: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
+# For v1 see: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
 VALID_ALGORITHMS = [
+    "sha256",
+    "sha224",
+    "sha384",
+    "sha512",
+    "sha512_224",
+    "sha512_256",
+    "sha3_224",
+    "sha3_256",
+    "sha3_384",
+    "sha3_512",
+    "shake128",
+    "shake256",
+    "blake2b",
+    "blake2s",
+    "ripemd160",
+    "sm3",
+    "gost",
     "sha1",
+    "md5",
     "gitCommit",  # This special git value is equivalent to SHA-1 or SHA-256. See the v1 spec for more information.
 ]
 

From 673b5620fd21c30a74085e5f7361253fd5e97454 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 19 Mar 2024 09:24:27 +1000
Subject: [PATCH 17/25] chore: address PR feedback; simplify _json_modify test helper.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../provenance/witness/__init__.py            |  2 -
 .../repo_finder/test_provenance_extractor.py  | 40 +++++++------------
 2 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/src/macaron/slsa_analyzer/provenance/witness/__init__.py b/src/macaron/slsa_analyzer/provenance/witness/__init__.py
index cbe1afe8e..408fb31ca 100644
--- a/src/macaron/slsa_analyzer/provenance/witness/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/witness/__init__.py
@@ -136,8 +136,6 @@ def extract_witness_provenance_subjects(witness_payload: InTotoPayload) -> set[W
     dict[str, str]
         A dictionary in which each key is a subject name and each value is the corresponding SHA256 digest.
     """
-    # TODO: add support for in-toto v1 provenances.
-
     if isinstance(witness_payload, InTotoV01Payload):
         subjects = witness_payload.statement["subject"]
         subject_digests = set()
diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index 2c502d891..e2db85c94 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -261,7 +261,7 @@ def test_slsa_v1_gcb_1_is_invalid(
     slsa_v1_gcb_1_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type gcb and sourceToBuild."""
-    assert _json_modify(slsa_v1_gcb_1_provenance, keys, new_value)
+    _json_modify(slsa_v1_gcb_1_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_1_provenance)
 
@@ -285,7 +285,7 @@ def test_slsa_v1_gcb_2_is_invalid(
     slsa_v1_gcb_2_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type gcb and configSource."""
-    assert _json_modify(slsa_v1_gcb_2_provenance, keys, new_value)
+    _json_modify(slsa_v1_gcb_2_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_2_provenance)
 
@@ -309,7 +309,7 @@ def test_slsa_v1_github_is_invalid(
     slsa_v1_github_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type GitHub."""
-    assert _json_modify(slsa_v1_github_provenance, keys, new_value)
+    _json_modify(slsa_v1_github_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(slsa_v1_github_provenance)
 
@@ -333,7 +333,7 @@ def test_slsa_v02_is_valid(
 )
 def test_slsa_v02_is_invalid(slsa_v02_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType) -> None:
     """Test invalidly modified SLSA v0.2 provenance."""
-    assert _json_modify(slsa_v02_provenance, keys, new_value)
+    _json_modify(slsa_v02_provenance, keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(slsa_v02_provenance)
 
@@ -356,14 +356,14 @@ def test_slsa_v01_is_invalid(slsa_v01_provenance: dict[str, JsonType], new_value
     """Test invalidly modified SLSA v0.1 provenance."""
     materials = json_extract(slsa_v01_provenance, ["predicate", "materials"], list)
     material_index = json_extract(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], int)
-    assert _json_modify(materials[material_index], ["uri"], new_value)
+    _json_modify(materials[material_index], ["uri"], new_value)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance)
 
 
 def test_slsa_v01_invalid_material_index(slsa_v01_provenance: dict[str, JsonType]) -> None:
     """Test the SLSA v0.1 provenance with an invalid materials index."""
-    assert _json_modify(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], 10)
+    _json_modify(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], 10)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance)
 
@@ -398,7 +398,7 @@ def test_witness_github_is_invalid(
 ) -> None:
     """Test invalidly modified Witness v0.1 GitHub provenance."""
     attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
-    assert _json_modify(attestations[attestation_index], keys, new_value)
+    _json_modify(attestations[attestation_index], keys, new_value)
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(witness_github_provenance)
 
@@ -406,7 +406,7 @@ def test_witness_github_is_invalid(
 def test_witness_github_remove_attestation(witness_github_provenance: dict[str, JsonType]) -> None:
     """Test removing Git attestation from Witness V0.1 GitHub provenance."""
     attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
-    assert _json_modify(witness_github_provenance, ["predicate", "attestations"], attestations[:1])
+    _json_modify(witness_github_provenance, ["predicate", "attestations"], attestations[:1])
     with pytest.raises(ProvenanceExtractionException):
         _test_extract_repo_and_commit_from_provenance(witness_github_provenance)
 
@@ -440,28 +440,16 @@ def _test_extract_repo_and_commit_from_provenance(
 def _json_modify(entry: JsonType, keys: list[str], new_value: JsonType) -> bool:
     """Modify the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
 
-    The found value will be overwritten by the new_value parameter.
-    If new_value is None, the value will be removed.
-    If the final key does not exist, it will be created as new_value.
+    The found value will be overwritten by the `new_value` parameter.
+    If `new_value` is `None`, the value will be removed.
+    If the final key does not exist, it will be created as `new_value`.
     """
-    target = entry
-    last_target = None
-
-    for key in keys:
-        if not isinstance(target, dict):
-            return False
-        if key not in target:
-            return False
-        last_target = target
-        target = target[key]
-
-    if last_target is None:
-        return False
+    target: dict[str, JsonType] = json_extract(entry, keys[:-1], dict)
 
     if new_value is None:
-        del last_target[keys[len(keys) - 1]]
+        del target[keys[-1]]
     else:
-        last_target[keys[len(keys) - 1]] = new_value
+        target[keys[-1]] = new_value
 
     return True
 

From c6481f654ca86391d5d64bcd7662d5421251cfe4 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 19 Mar 2024 09:28:19 +1000
Subject: [PATCH 18/25] chore: make _json_modify return None instead of bool.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 tests/repo_finder/test_provenance_extractor.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index e2db85c94..aa485f892 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -437,7 +437,7 @@ def _test_extract_repo_and_commit_from_provenance(
     assert expected_commit == commit
 
 
-def _json_modify(entry: JsonType, keys: list[str], new_value: JsonType) -> bool:
+def _json_modify(entry: JsonType, keys: list[str], new_value: JsonType) -> None:
     """Modify the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
 
     The found value will be overwritten by the `new_value` parameter.
@@ -451,8 +451,6 @@ def _json_modify(entry: JsonType, keys: list[str], new_value: JsonType) -> bool:
     else:
         target[keys[-1]] = new_value
 
-    return True
-
 
 def _load_and_validate_json(payload: str) -> dict[str, JsonType]:
     """Load payload as JSON and validate it is of type dict."""

From c48c6508d687a210dff3a5a92cc017dd41e52e10 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 19 Mar 2024 16:05:39 +1000
Subject: [PATCH 19/25] chore: unify digest set validation across in-toto
 versions.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
index 1833e41be..52d30c81d 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
@@ -186,7 +186,9 @@ def is_valid_digest_set(digest: dict[str, JsonType]) -> TypeGuard[dict[str, str]
         ``True`` if the digest set is valid according to the spec, in which case its type
         is narrowed to a ``dict[str, str]``; ``False`` otherwise.
     """
-    for value in digest.values():
-        if not isinstance(value, str):
+    for key in digest:
+        if key not in VALID_ALGORITHMS:
+            return False
+        if not isinstance(digest[key], str):
             return False
     return True

From 2267b185bfbfa5c4fd01a4ed1fef8c0df0000570 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 21 Mar 2024 09:16:40 +1000
Subject: [PATCH 20/25] chore: specify Git in SLSA digest set algorithm list.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/repo_finder/provenance_extractor.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index fee18a90c..048e1c09c 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -16,9 +16,9 @@ class ProvenanceExtractionException(MacaronError):
     """When there is an error while extracting from provenance."""
 
 
-SLSA_V01_DIGEST_SET_ALGORITHMS = ["sha1"]
-SLSA_V02_DIGEST_SET_ALGORITHMS = ["sha1"]
-SLSA_V1_DIGEST_SET_ALGORITHMS = ["sha1", "gitCommit"]
+SLSA_V01_DIGEST_SET_GIT_ALGORITHMS = ["sha1"]
+SLSA_V02_DIGEST_SET_GIT_ALGORITHMS = ["sha1"]
+SLSA_V1_DIGEST_SET_GIT_ALGORITHMS = ["sha1", "gitCommit"]
 
 
 def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str, str]:
@@ -90,7 +90,7 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(material, ["digest"], dict)
-    commit = _extract_commit_from_digest_set(digest_set, SLSA_V01_DIGEST_SET_ALGORITHMS)
+    commit = _extract_commit_from_digest_set(digest_set, SLSA_V01_DIGEST_SET_GIT_ALGORITHMS)
 
     if not commit:
         raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
@@ -112,7 +112,7 @@ def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(predicate, ["invocation", "configSource", "digest"], dict)
-    commit = _extract_commit_from_digest_set(digest_set, SLSA_V02_DIGEST_SET_ALGORITHMS)
+    commit = _extract_commit_from_digest_set(digest_set, SLSA_V02_DIGEST_SET_GIT_ALGORITHMS)
 
     if not commit:
         raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
@@ -153,7 +153,7 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
         if url != repo:
             continue
         digest_set = json_extract(dep, ["digest"], dict)
-        commit = _extract_commit_from_digest_set(digest_set, SLSA_V1_DIGEST_SET_ALGORITHMS)
+        commit = _extract_commit_from_digest_set(digest_set, SLSA_V1_DIGEST_SET_GIT_ALGORITHMS)
 
     if not commit:
         raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")

From b545b67b38459c9aa59d4bbdc3365870060f1f34 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 21 Mar 2024 10:45:20 +1000
Subject: [PATCH 21/25] chore: remove algorithm validation in digest set.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../provenance/intoto/v01/__init__.py         | 27 ------------------
 .../provenance/intoto/v1/__init__.py          | 28 -------------------
 2 files changed, 55 deletions(-)

diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
index 52d30c81d..95fc3b304 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
@@ -10,31 +10,6 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The full list of cryptographic algorithms supported in in-toto v0.1 provenance.
-# These are used as keys within the digest set of the resource descriptors within the subject.
-# For v0.1 see: https://github.com/in-toto/attestation/blob/main/spec/v0.1.0/field_types.md#DigestSet
-VALID_ALGORITHMS = [
-    "sha256",
-    "sha224",
-    "sha384",
-    "sha512",
-    "sha512_224",
-    "sha512_256",
-    "sha3_224",
-    "sha3_256",
-    "sha3_384",
-    "sha3_512",
-    "shake128",
-    "shake256",
-    "blake2b",
-    "blake2s",
-    "ripemd160",
-    "sm3",
-    "gost",
-    "sha1",
-    "md5",
-]
-
 
 class InTotoV01Statement(TypedDict):
     """An in-toto version 0.1 statement.
@@ -187,8 +162,6 @@ def is_valid_digest_set(digest: dict[str, JsonType]) -> TypeGuard[dict[str, str]
         is narrowed to a ``dict[str, str]``; ``False`` otherwise.
     """
     for key in digest:
-        if key not in VALID_ALGORITHMS:
-            return False
         if not isinstance(digest[key], str):
             return False
     return True
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
index fc25bcd07..3ffe08bd6 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
@@ -11,32 +11,6 @@
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.util import JsonType
 
-# The full list of cryptographic algorithms supported in in-toto v1 provenance.
-# These are used as keys within the digest set of the resource descriptors within the subject.
-# For v1 see: https://github.com/in-toto/attestation/blob/main/spec/v1/digest_set.md
-VALID_ALGORITHMS = [
-    "sha256",
-    "sha224",
-    "sha384",
-    "sha512",
-    "sha512_224",
-    "sha512_256",
-    "sha3_224",
-    "sha3_256",
-    "sha3_384",
-    "sha3_512",
-    "shake128",
-    "shake256",
-    "blake2b",
-    "blake2s",
-    "ripemd160",
-    "sm3",
-    "gost",
-    "sha1",
-    "md5",
-    "gitCommit",  # This special git value is equivalent to SHA-1 or SHA-256. See the v1 spec for more information.
-]
-
 
 class InTotoV1Statement(TypedDict):
     """An in-toto version 1 statement.
@@ -191,8 +165,6 @@ def is_valid_digest_set(digest: JsonType) -> bool:
     if not isinstance(digest, dict):
         return False
     for key in digest:
-        if key not in VALID_ALGORITHMS:
-            return False
         if not isinstance(digest[key], str):
             return False
     return True

From 26de80605e50223cf56512be9ef8760dbb178cb5 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 21 Mar 2024 16:46:27 +1000
Subject: [PATCH 22/25] chore: Move JSON utility function; Move errors to error
 script.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/errors.py                         |  8 ++
 .../repo_finder/provenance_extractor.py       | 90 +++++--------------
 src/macaron/slsa_analyzer/analyzer.py         |  8 +-
 .../repo_finder/test_provenance_extractor.py  | 26 +++---
 4 files changed, 43 insertions(+), 89 deletions(-)

diff --git a/src/macaron/errors.py b/src/macaron/errors.py
index 5e892e1a6..a98a3bef5 100644
--- a/src/macaron/errors.py
+++ b/src/macaron/errors.py
@@ -58,3 +58,11 @@ class InvalidHTTPResponseError(MacaronError):
 
 class CheckRegistryError(MacaronError):
     """The Check Registry Error class."""
+
+
+class ProvenanceError(MacaronError):
+    """When there is an error while extracting from provenance."""
+
+
+class JsonError(MacaronError):
+    """When there is an error while extracting from JSON."""
diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index 048e1c09c..c30376a34 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -3,19 +3,15 @@
 
 """This module contains methods for extracting repository and commit metadata from provenance files."""
 import logging
-from typing import TypeVar
 
-from macaron.errors import MacaronError
+from macaron.errors import JsonError, ProvenanceError
+from macaron.json_tools import json_extract
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
 from macaron.util import JsonType
 
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-class ProvenanceExtractionException(MacaronError):
-    """When there is an error while extracting from provenance."""
-
-
 SLSA_V01_DIGEST_SET_GIT_ALGORITHMS = ["sha1"]
 SLSA_V02_DIGEST_SET_GIT_ALGORITHMS = ["sha1"]
 SLSA_V1_DIGEST_SET_GIT_ALGORITHMS = ["sha1", "gitCommit"]
@@ -36,7 +32,7 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
 
     Raises
     ------
-    ProvenanceExtractionException
+    ProvenanceError
         If the extraction process fails for any reason.
     """
     repo = ""
@@ -53,9 +49,9 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
                 repo, commit = _extract_from_slsa_v01(payload)
             if predicate_type == "https://witness.testifysec.com/attestation-collection/v0.1":
                 repo, commit = _extract_from_witness_provenance(payload)
-    except JsonExtractionException as error:
+    except JsonError as error:
         logger.debug(error)
-        raise ProvenanceExtractionException("JSON exception while extracting from provenance.") from error
+        raise ProvenanceError("JSON exception while extracting from provenance.") from error
 
     if not repo or not commit:
         msg = (
@@ -63,7 +59,7 @@ def extract_repo_and_commit_from_provenance(payload: InTotoPayload) -> tuple[str
             f"predicate_type {predicate_type}, in-toto {str(type(payload))}."
         )
         logger.debug(msg)
-        raise ProvenanceExtractionException(msg)
+        raise ProvenanceError(msg)
 
     logger.debug("Extracted repo and commit from provenance: %s, %s", repo, commit)
     return repo, commit
@@ -73,17 +69,17 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the slsa v01 provenance payload."""
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        raise ProvenanceExtractionException("No predicate in payload statement.")
+        raise ProvenanceError("No predicate in payload statement.")
 
     # The repository URL and commit are stored inside an entry in the list of predicate -> materials.
     # In predicate -> recipe -> definedInMaterial we find the list index that points to the correct entry.
     list_index = json_extract(predicate, ["recipe", "definedInMaterial"], int)
     material_list = json_extract(predicate, ["materials"], list)
     if list_index >= len(material_list):
-        raise ProvenanceExtractionException("Material list index outside of material list bounds.")
+        raise ProvenanceError("Material list index outside of material list bounds.")
     material = material_list[list_index]
     if not material or not isinstance(material, dict):
-        raise ProvenanceExtractionException("Indexed material list entry is invalid.")
+        raise ProvenanceError("Indexed material list entry is invalid.")
 
     uri = json_extract(material, ["uri"], str)
 
@@ -93,7 +89,7 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str, str]:
     commit = _extract_commit_from_digest_set(digest_set, SLSA_V01_DIGEST_SET_GIT_ALGORITHMS)
 
     if not commit:
-        raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
+        raise ProvenanceError("Failed to extract commit hash from provenance.")
 
     return repo, commit
 
@@ -102,20 +98,20 @@ def _extract_from_slsa_v02(payload: InTotoV01Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the slsa v02 provenance payload."""
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        raise ProvenanceExtractionException("No predicate in payload statement.")
+        raise ProvenanceError("No predicate in payload statement.")
 
     # The repository URL and commit are stored within the predicate -> invocation -> configSource object.
     # See https://slsa.dev/spec/v0.2/provenance
     uri = json_extract(predicate, ["invocation", "configSource", "uri"], str)
     if not uri:
-        raise ProvenanceExtractionException("Failed to extract repository URL from provenance.")
+        raise ProvenanceError("Failed to extract repository URL from provenance.")
     repo = _clean_spdx(uri)
 
     digest_set = json_extract(predicate, ["invocation", "configSource", "digest"], dict)
     commit = _extract_commit_from_digest_set(digest_set, SLSA_V02_DIGEST_SET_GIT_ALGORITHMS)
 
     if not commit:
-        raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
+        raise ProvenanceError("Failed to extract commit hash from provenance.")
 
     return repo, commit
 
@@ -124,7 +120,7 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
     """Extract the repository and commit metadata from the slsa v1 provenance payload."""
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        raise ProvenanceExtractionException("No predicate in payload statement.")
+        raise ProvenanceError("No predicate in payload statement.")
 
     build_def = json_extract(predicate, ["buildDefinition"], dict)
     build_type = json_extract(build_def, ["buildType"], str)
@@ -134,13 +130,13 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
     if build_type == "https://slsa-framework.github.io/gcb-buildtypes/triggered-build/v1":
         try:
             repo = json_extract(build_def, ["externalParameters", "sourceToBuild", "repository"], str)
-        except JsonExtractionException:
+        except JsonError:
             repo = json_extract(build_def, ["externalParameters", "configSource", "repository"], str)
     if build_type == "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1":
         repo = json_extract(build_def, ["externalParameters", "workflow", "repository"], str)
 
     if not repo:
-        raise ProvenanceExtractionException("Failed to extract repository URL from provenance.")
+        raise ProvenanceError("Failed to extract repository URL from provenance.")
 
     # Extract the commit hash.
     commit = ""
@@ -156,7 +152,7 @@ def _extract_from_slsa_v1(payload: InTotoV1Payload) -> tuple[str, str]:
         commit = _extract_commit_from_digest_set(digest_set, SLSA_V1_DIGEST_SET_GIT_ALGORITHMS)
 
     if not commit:
-        raise ProvenanceExtractionException("Failed to extract commit hash from provenance.")
+        raise ProvenanceError("Failed to extract commit hash from provenance.")
 
     return repo, commit
 
@@ -179,7 +175,7 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
     """
     predicate: dict[str, JsonType] | None = payload.statement.get("predicate")
     if not predicate:
-        raise ProvenanceExtractionException("No predicate in payload statement.")
+        raise ProvenanceError("No predicate in payload statement.")
 
     attestations = json_extract(predicate, ["attestations"], list)
     commit = ""
@@ -198,7 +194,7 @@ def _extract_from_witness_provenance(payload: InTotoV01Payload) -> tuple[str, st
             repo = json_extract(entry, ["attestation", "projecturl"], str)
 
     if not commit or not repo:
-        raise ProvenanceExtractionException("Could not extract repo and commit from provenance.")
+        raise ProvenanceError("Could not extract repo and commit from provenance.")
 
     return repo, commit
 
@@ -216,7 +212,7 @@ def _extract_commit_from_digest_set(digest_set: dict[str, JsonType], valid_algor
             value = digest_set.get(key)
             if isinstance(value, str):
                 return value
-    raise ProvenanceExtractionException(f"No valid digest in digest set: {digest_set.keys()} not in {valid_algorithms}")
+    raise ProvenanceError(f"No valid digest in digest set: {digest_set.keys()} not in {valid_algorithms}")
 
 
 def _clean_spdx(uri: str) -> str:
@@ -226,49 +222,3 @@ def _clean_spdx(uri: str) -> str:
     """
     url, _, _ = uri.lstrip("git+").rpartition("@")
     return url
-
-
-class JsonExtractionException(MacaronError):
-    """When there is an error while extracting from JSON."""
-
-
-T = TypeVar("T", bound=JsonType)
-
-
-def json_extract(entry: JsonType, keys: list[str], type_: type[T]) -> T:
-    """Return the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
-
-    The value must be of the passed type.
-
-    Parameters
-    ----------
-    entry: JsonType
-        An entry point into a JSON structure.
-    keys: list[str]
-        The list of depth-sequential keys within the JSON.
-    type: type[T]
-        The type to check the value against and return it as.
-
-    Returns
-    -------
-    T:
-        The found value as the type of the type parameter.
-
-    Raises
-    ------
-    JsonExtractionException
-        Raised if an error occurs while searching for or validating the value.
-    """
-    target = entry
-
-    for index, key in enumerate(keys):
-        if not isinstance(target, dict):
-            raise JsonExtractionException(f"Expect the value .{'.'.join(keys[:index])} to be a dict.")
-        if key not in target:
-            raise JsonExtractionException(f"JSON key '{key}' not found in .{'.'.join(keys[:index])}.")
-        target = target[key]
-
-    if isinstance(target, type_):
-        return target
-
-    raise JsonExtractionException(f"Expect the value .{'.'.join(keys)} to be of type '{type_}'.")
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index c3604d080..a8e90112d 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -26,6 +26,7 @@
     DuplicateError,
     InvalidPURLError,
     MacaronError,
+    ProvenanceError,
     PURLNotFoundError,
     RepoCheckOutError,
 )
@@ -33,10 +34,7 @@
 from macaron.output_reporter.results import Record, Report, SCMStatus
 from macaron.repo_finder import repo_finder
 from macaron.repo_finder.commit_finder import find_commit
-from macaron.repo_finder.provenance_extractor import (
-    ProvenanceExtractionException,
-    extract_repo_and_commit_from_provenance,
-)
+from macaron.repo_finder.provenance_extractor import extract_repo_and_commit_from_provenance
 from macaron.repo_finder.provenance_finder import ProvenanceFinder
 from macaron.slsa_analyzer import git_url
 from macaron.slsa_analyzer.analyze_context import AnalyzeContext
@@ -660,7 +658,7 @@ def to_analysis_target(
                         # Try to find repository and commit via provenance.
                         try:
                             repo, digest = extract_repo_and_commit_from_provenance(provenance_payload)
-                        except ProvenanceExtractionException as error:
+                        except ProvenanceError as error:
                             logger.debug("Failed to extract repo and commit from provenance: %s", error)
 
                     if repo and digest:
diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index aa485f892..1ee27aa4e 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -6,11 +6,9 @@
 
 import pytest
 
-from macaron.repo_finder.provenance_extractor import (
-    ProvenanceExtractionException,
-    extract_repo_and_commit_from_provenance,
-    json_extract,
-)
+from macaron.errors import ProvenanceError
+from macaron.json_tools import json_extract
+from macaron.repo_finder.provenance_extractor import extract_repo_and_commit_from_provenance
 from macaron.slsa_analyzer.provenance.intoto import validate_intoto_payload
 from macaron.util import JsonType
 
@@ -262,7 +260,7 @@ def test_slsa_v1_gcb_1_is_invalid(
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type gcb and sourceToBuild."""
     _json_modify(slsa_v1_gcb_1_provenance, keys, new_value)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_1_provenance)
 
 
@@ -286,7 +284,7 @@ def test_slsa_v1_gcb_2_is_invalid(
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type gcb and configSource."""
     _json_modify(slsa_v1_gcb_2_provenance, keys, new_value)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(slsa_v1_gcb_2_provenance)
 
 
@@ -310,7 +308,7 @@ def test_slsa_v1_github_is_invalid(
 ) -> None:
     """Test invalidly modified SLSA v1 provenance with build type GitHub."""
     _json_modify(slsa_v1_github_provenance, keys, new_value)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(slsa_v1_github_provenance)
 
 
@@ -334,7 +332,7 @@ def test_slsa_v02_is_valid(
 def test_slsa_v02_is_invalid(slsa_v02_provenance: dict[str, JsonType], keys: list[str], new_value: JsonType) -> None:
     """Test invalidly modified SLSA v0.2 provenance."""
     _json_modify(slsa_v02_provenance, keys, new_value)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(slsa_v02_provenance)
 
 
@@ -357,14 +355,14 @@ def test_slsa_v01_is_invalid(slsa_v01_provenance: dict[str, JsonType], new_value
     materials = json_extract(slsa_v01_provenance, ["predicate", "materials"], list)
     material_index = json_extract(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], int)
     _json_modify(materials[material_index], ["uri"], new_value)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance)
 
 
 def test_slsa_v01_invalid_material_index(slsa_v01_provenance: dict[str, JsonType]) -> None:
     """Test the SLSA v0.1 provenance with an invalid materials index."""
     _json_modify(slsa_v01_provenance, ["predicate", "recipe", "definedInMaterial"], 10)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(slsa_v01_provenance)
 
 
@@ -399,7 +397,7 @@ def test_witness_github_is_invalid(
     """Test invalidly modified Witness v0.1 GitHub provenance."""
     attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
     _json_modify(attestations[attestation_index], keys, new_value)
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(witness_github_provenance)
 
 
@@ -407,7 +405,7 @@ def test_witness_github_remove_attestation(witness_github_provenance: dict[str,
     """Test removing Git attestation from Witness V0.1 GitHub provenance."""
     attestations = json_extract(witness_github_provenance, ["predicate", "attestations"], list)
     _json_modify(witness_github_provenance, ["predicate", "attestations"], attestations[:1])
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(witness_github_provenance)
 
 
@@ -423,7 +421,7 @@ def test_witness_github_remove_attestation(witness_github_provenance: dict[str,
 def test_invalid_type_payloads(type_: str, predicate_type: str) -> None:
     """Test payloads with invalid type combinations."""
     payload: dict[str, JsonType] = {"_type": type_, "predicateType": predicate_type, "subject": [], "predicate": {}}
-    with pytest.raises(ProvenanceExtractionException):
+    with pytest.raises(ProvenanceError):
         _test_extract_repo_and_commit_from_provenance(payload)
 
 

From f8badc09396f22cf3628078b21039d0ddc7db1d7 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Thu, 21 Mar 2024 18:03:29 +1000
Subject: [PATCH 23/25] chore: move InvalidAnalysisTargetError to errors.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 src/macaron/errors.py                 | 4 ++++
 src/macaron/slsa_analyzer/analyzer.py | 6 +-----
 tests/slsa_analyzer/test_analyzer.py  | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/macaron/errors.py b/src/macaron/errors.py
index a98a3bef5..f05540b6d 100644
--- a/src/macaron/errors.py
+++ b/src/macaron/errors.py
@@ -66,3 +66,7 @@ class ProvenanceError(MacaronError):
 
 class JsonError(MacaronError):
     """When there is an error while extracting from JSON."""
+
+
+class InvalidAnalysisTargetError(MacaronError):
+    """When a valid Analysis Target cannot be constructed."""
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index a8e90112d..1687045b1 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -24,8 +24,8 @@
 from macaron.errors import (
     CloneError,
     DuplicateError,
+    InvalidAnalysisTargetError,
     InvalidPURLError,
-    MacaronError,
     ProvenanceError,
     PURLNotFoundError,
     RepoCheckOutError,
@@ -1000,7 +1000,3 @@ def __init__(self, *args: Any, context: AnalyzeContext | None = None, **kwargs:
         """
         super().__init__(*args, **kwargs)
         self.context: AnalyzeContext | None = context
-
-
-class InvalidAnalysisTargetError(MacaronError):
-    """When a valid Analysis Target cannot be constructed."""
diff --git a/tests/slsa_analyzer/test_analyzer.py b/tests/slsa_analyzer/test_analyzer.py
index 18e6eae59..f4e68f321 100644
--- a/tests/slsa_analyzer/test_analyzer.py
+++ b/tests/slsa_analyzer/test_analyzer.py
@@ -12,8 +12,8 @@
 from packageurl import PackageURL
 
 from macaron.config.target_config import Configuration
-from macaron.errors import InvalidPURLError
-from macaron.slsa_analyzer.analyzer import Analyzer, InvalidAnalysisTargetError
+from macaron.errors import InvalidAnalysisTargetError, InvalidPURLError
+from macaron.slsa_analyzer.analyzer import Analyzer
 
 from ..macaron_testcase import MacaronTestCase
 

From 64fb176ee6040ac2a1a5870232325e59d1248790 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Fri, 22 Mar 2024 09:14:17 +1000
Subject: [PATCH 24/25] chore: add integration test for provenance extractor;
 add json_tools script.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 scripts/dev_scripts/integration_tests.sh   |  9 ++++
 src/macaron/json_tools.py                  | 50 ++++++++++++++++++++++
 tests/e2e/defaults/disable_repo_finder.ini |  5 +++
 3 files changed, 64 insertions(+)
 create mode 100644 src/macaron/json_tools.py
 create mode 100644 tests/e2e/defaults/disable_repo_finder.ini

diff --git a/scripts/dev_scripts/integration_tests.sh b/scripts/dev_scripts/integration_tests.sh
index c0828fa83..8d85b8b75 100755
--- a/scripts/dev_scripts/integration_tests.sh
+++ b/scripts/dev_scripts/integration_tests.sh
@@ -99,6 +99,15 @@ if [[ -z "$NO_NPM_TEST" ]]; then
     $RUN_MACARON analyze -purl pkg:npm/@sigstore/mock@0.1.0 -rp https://github.com/sigstore/sigstore-js -b main -d ebdcfdfbdfeb9c9aeee6df53674ef230613629f5 --skip-deps || log_fail
 
     check_or_update_expected_output $COMPARE_JSON_OUT $JSON_RESULT $JSON_EXPECTED || log_fail
+
+    echo -e "\n----------------------------------------------------------------------------------"
+    echo "semver@7.6.0: Extracting repository URL and commit from provenance while Repo Finder is disabled."
+    echo -e "----------------------------------------------------------------------------------\n"
+    JSON_EXPECTED=$WORKSPACE/tests/e2e/expected_results/purl/npm/semver/semver.json
+    JSON_RESULT=$WORKSPACE/output/reports/npm/semver/semver.json
+    $RUN_MACARON -dp tests/e2e/defaults/disable_repo_finder.ini analyze -purl pkg:npm/semver@7.6.0 || log_fail
+
+    check_or_update_expected_output $COMPARE_JSON_OUT $JSON_RESULT $JSON_EXPECTED || log_fail
 fi
 
 echo -e "\n----------------------------------------------------------------------------------"
diff --git a/src/macaron/json_tools.py b/src/macaron/json_tools.py
new file mode 100644
index 000000000..64ad2cfd5
--- /dev/null
+++ b/src/macaron/json_tools.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""This module provides utility functions for JSON data."""
+
+from typing import TypeVar
+
+from macaron.errors import JsonError
+from macaron.util import JsonType
+
+T = TypeVar("T", bound=JsonType)
+
+
+def json_extract(entry: JsonType, keys: list[str], type_: type[T]) -> T:
+    """Return the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
+
+    The value must be of the passed type.
+
+    Parameters
+    ----------
+    entry: JsonType
+        An entry point into a JSON structure.
+    keys: list[str]
+        The list of depth-sequential keys within the JSON.
+    type_: type[T]
+        The type to check the value against and return it as.
+
+    Returns
+    -------
+    T:
+        The found value as the type of the type parameter.
+
+    Raises
+    ------
+    JsonError
+        Raised if an error occurs while searching for or validating the value.
+    """
+    target = entry
+
+    for index, key in enumerate(keys):
+        if not isinstance(target, dict):
+            raise JsonError(f"Expect the value .{'.'.join(keys[:index])} to be a dict.")
+        if key not in target:
+            raise JsonError(f"JSON key '{key}' not found in .{'.'.join(keys[:index])}.")
+        target = target[key]
+
+    if isinstance(target, type_):
+        return target
+
+    raise JsonError(f"Expect the value .{'.'.join(keys)} to be of type '{type_}'.")
diff --git a/tests/e2e/defaults/disable_repo_finder.ini b/tests/e2e/defaults/disable_repo_finder.ini
new file mode 100644
index 000000000..ec4fd9216
--- /dev/null
+++ b/tests/e2e/defaults/disable_repo_finder.ini
@@ -0,0 +1,5 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+[repofinder]
+find_repos = False

From 2e4678981b3f3a5ecf9bb397035f5802454a71cf Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Fri, 22 Mar 2024 09:16:42 +1000
Subject: [PATCH 25/25] chore: add integration test expected result.

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 .../purl/npm/semver/semver.json               | 334 ++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 tests/e2e/expected_results/purl/npm/semver/semver.json

diff --git a/tests/e2e/expected_results/purl/npm/semver/semver.json b/tests/e2e/expected_results/purl/npm/semver/semver.json
new file mode 100644
index 000000000..9fa549cba
--- /dev/null
+++ b/tests/e2e/expected_results/purl/npm/semver/semver.json
@@ -0,0 +1,334 @@
+{
+    "metadata": {
+        "timestamps": "2024-03-22 09:02:56",
+        "has_passing_check": true,
+        "run_checks": [
+            "mcn_provenance_available_1",
+            "mcn_provenance_expectation_1",
+            "mcn_provenance_witness_level_one_1",
+            "mcn_trusted_builder_level_three_1",
+            "mcn_build_as_code_1",
+            "mcn_build_script_1",
+            "mcn_build_service_1",
+            "mcn_infer_artifact_pipeline_1",
+            "mcn_provenance_level_three_1",
+            "mcn_version_control_system_1"
+        ],
+        "check_tree": {
+            "mcn_provenance_available_1": {
+                "mcn_provenance_level_three_1": {},
+                "mcn_provenance_expectation_1": {},
+                "mcn_provenance_witness_level_one_1": {}
+            },
+            "mcn_version_control_system_1": {
+                "mcn_trusted_builder_level_three_1": {
+                    "mcn_build_as_code_1": {
+                        "mcn_build_service_1": {
+                            "mcn_build_script_1": {}
+                        },
+                        "mcn_infer_artifact_pipeline_1": {}
+                    }
+                }
+            }
+        }
+    },
+    "target": {
+        "info": {
+            "full_name": "pkg:npm/semver@7.6.0",
+            "local_cloned_path": "git_repos/github_com/npm/node-semver",
+            "remote_path": "https://github.com/npm/node-semver",
+            "branch": null,
+            "commit_hash": "377f709718053a477ed717089c4403c4fec332a1",
+            "commit_date": "2024-02-05T09:03:38-08:00"
+        },
+        "provenances": {
+            "is_inferred": false,
+            "content": {
+                "github_actions": [
+                    {
+                        "_type": "https://in-toto.io/Statement/v0.1",
+                        "subject": [],
+                        "predicateType": "https://slsa.dev/provenance/v0.2",
+                        "predicate": {
+                            "builder": {
+                                "id": "<URI>"
+                            },
+                            "buildType": "<URI>",
+                            "invocation": {
+                                "configSource": {
+                                    "uri": "<URI>",
+                                    "digest": {
+                                        "sha1": "<STING>"
+                                    },
+                                    "entryPoint": "<STRING>"
+                                },
+                                "parameters": {},
+                                "environment": {}
+                            },
+                            "buildConfig": {
+                                "jobID": "<STRING>",
+                                "stepID": "<STRING>"
+                            },
+                            "metadata": {
+                                "buildInvocationId": "<STRING>",
+                                "buildStartedOn": "<TIMESTAMP>",
+                                "buildFinishedOn": "<TIMESTAMP>",
+                                "completeness": {
+                                    "parameters": "false",
+                                    "environment": "false",
+                                    "materials": "false"
+                                },
+                                "reproducible": "false"
+                            },
+                            "materials": [
+                                {
+                                    "uri": "<URI>",
+                                    "digest": {}
+                                }
+                            ]
+                        }
+                    }
+                ],
+                "npm Registry": [
+                    {
+                        "_type": "https://in-toto.io/Statement/v1",
+                        "subject": [
+                            {
+                                "name": "pkg:npm/semver@7.6.0",
+                                "digest": {
+                                    "sha512": "127c1786b9705cc93d80abb9fdf971e6cbff6a7e7b024469946de14caebc5bb1510cdfa4f8e5818fae4cefbd7d3a403cd972c1c6b717d0a4878fe5f908e84e56"
+                                }
+                            }
+                        ],
+                        "predicateType": "https://slsa.dev/provenance/v1",
+                        "predicate": {
+                            "buildDefinition": {
+                                "buildType": "https://slsa-framework.github.io/github-actions-buildtypes/workflow/v1",
+                                "externalParameters": {
+                                    "workflow": {
+                                        "ref": "refs/heads/main",
+                                        "repository": "https://github.com/npm/node-semver",
+                                        "path": ".github/workflows/release.yml"
+                                    }
+                                },
+                                "internalParameters": {
+                                    "github": {
+                                        "event_name": "push",
+                                        "repository_id": "1357199",
+                                        "repository_owner_id": "6078720"
+                                    }
+                                },
+                                "resolvedDependencies": [
+                                    {
+                                        "uri": "git+https://github.com/npm/node-semver@refs/heads/main",
+                                        "digest": {
+                                            "gitCommit": "377f709718053a477ed717089c4403c4fec332a1"
+                                        }
+                                    }
+                                ]
+                            },
+                            "runDetails": {
+                                "builder": {
+                                    "id": "https://github.com/actions/runner/github-hosted"
+                                },
+                                "metadata": {
+                                    "invocationId": "https://github.com/npm/node-semver/actions/runs/7788106733/attempts/1"
+                                }
+                            }
+                        }
+                    }
+                ]
+            }
+        },
+        "checks": {
+            "summary": {
+                "DISABLED": 0,
+                "FAILED": 4,
+                "PASSED": 5,
+                "SKIPPED": 0,
+                "UNKNOWN": 1
+            },
+            "results": [
+                {
+                    "check_id": "mcn_provenance_expectation_1",
+                    "check_description": "Check whether the SLSA provenance for the produced artifact conforms to the expected value.",
+                    "slsa_requirements": [
+                        "Provenance conforms with expectations - SLSA Level 3"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "UNKNOWN"
+                },
+                {
+                    "check_id": "mcn_build_as_code_1",
+                    "check_description": "The build definition and configuration executed by the build service is verifiably derived from text file definitions stored in a version control system.",
+                    "slsa_requirements": [
+                        "Build as code - SLSA Level 3"
+                    ],
+                    "justification": [
+                        "build_tool_name: npm",
+                        "ci_service_name: github_actions",
+                        "deploy_command: [\"npm\", \"publish\", \"--provenance\", \"--tag=\\\"$1\\\"\"]",
+                        {
+                            "build_trigger": "https://github.com/npm/node-semver/blob/377f709718053a477ed717089c4403c4fec332a1/.github/workflows/release-integration.yml"
+                        }
+                    ],
+                    "result_type": "PASSED"
+                },
+                {
+                    "check_id": "mcn_build_script_1",
+                    "check_description": "Check if the target repo has a valid build script.",
+                    "slsa_requirements": [
+                        "Scripted Build - SLSA Level 1"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "PASSED"
+                },
+                {
+                    "check_id": "mcn_build_service_1",
+                    "check_description": "Check if the target repo has a valid build service.",
+                    "slsa_requirements": [
+                        "Build service - SLSA Level 2"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "PASSED"
+                },
+                {
+                    "check_id": "mcn_provenance_available_1",
+                    "check_description": "Check whether the target has intoto provenance.",
+                    "slsa_requirements": [
+                        "Provenance - Available - SLSA Level 1",
+                        "Provenance content - Identifies build instructions - SLSA Level 1",
+                        "Provenance content - Identifies artifacts - SLSA Level 1",
+                        "Provenance content - Identifies builder - SLSA Level 1"
+                    ],
+                    "justification": [
+                        "asset_name: semver",
+                        {
+                            "asset_url": "https://registry.npmjs.org/-/npm/v1/attestations/semver@7.6.0"
+                        }
+                    ],
+                    "result_type": "PASSED"
+                },
+                {
+                    "check_id": "mcn_version_control_system_1",
+                    "check_description": "Check whether the target repo uses a version control system.",
+                    "slsa_requirements": [
+                        "Version controlled - SLSA Level 2"
+                    ],
+                    "justification": [
+                        {
+                            "git_repo": "https://github.com/npm/node-semver"
+                        }
+                    ],
+                    "result_type": "PASSED"
+                },
+                {
+                    "check_id": "mcn_infer_artifact_pipeline_1",
+                    "check_description": "Detects potential pipelines from which an artifact is published.",
+                    "slsa_requirements": [
+                        "Build as code - SLSA Level 3"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "FAILED"
+                },
+                {
+                    "check_id": "mcn_provenance_level_three_1",
+                    "check_description": "Check whether the target has SLSA provenance level 3.",
+                    "slsa_requirements": [
+                        "Provenance - Non falsifiable - SLSA Level 3",
+                        "Provenance content - Includes all build parameters - SLSA Level 3",
+                        "Provenance content - Identifies entry point - SLSA Level 3",
+                        "Provenance content - Identifies source code - SLSA Level 2"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "FAILED"
+                },
+                {
+                    "check_id": "mcn_provenance_witness_level_one_1",
+                    "check_description": "Check whether the target has a level-1 witness provenance.",
+                    "slsa_requirements": [
+                        "Provenance - Available - SLSA Level 1",
+                        "Provenance content - Identifies build instructions - SLSA Level 1",
+                        "Provenance content - Identifies artifacts - SLSA Level 1",
+                        "Provenance content - Identifies builder - SLSA Level 1"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "FAILED"
+                },
+                {
+                    "check_id": "mcn_trusted_builder_level_three_1",
+                    "check_description": "Check whether the target uses a trusted SLSA level 3 builder.",
+                    "slsa_requirements": [
+                        "Hermetic - SLSA Level 4",
+                        "Isolated - SLSA Level 3",
+                        "Parameterless - SLSA Level 4",
+                        "Ephemeral environment - SLSA Level 3"
+                    ],
+                    "justification": [
+                        "Not Available."
+                    ],
+                    "result_type": "FAILED"
+                }
+            ]
+        }
+    },
+    "dependencies": {
+        "analyzed_deps": 0,
+        "unique_dep_repos": 0,
+        "checks_summary": [
+            {
+                "check_id": "mcn_provenance_available_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_provenance_expectation_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_provenance_witness_level_one_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_trusted_builder_level_three_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_build_as_code_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_build_script_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_build_service_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_infer_artifact_pipeline_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_provenance_level_three_1",
+                "num_deps_pass": 0
+            },
+            {
+                "check_id": "mcn_version_control_system_1",
+                "num_deps_pass": 0
+            }
+        ],
+        "dep_status": []
+    }
+}