diff --git a/dev-requirements.in b/dev-requirements.in
index e079f8a6..12136494 100644
--- a/dev-requirements.in
+++ b/dev-requirements.in
@@ -1 +1,2 @@
 pytest
+pytest-mock
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 396ad16b..8a075ef7 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -23,6 +23,12 @@ pluggy==1.4.0 \
 pytest==8.0.0 \
     --hash=sha256:249b1b0864530ba251b7438274c4d251c58d868edaaec8762893ad4a0d71c36c \
     --hash=sha256:50fb9cbe836c3f20f0dfa99c565201fb75dc54c8d76373cd1bde06b06657bdb6
+    # via
+    #   -r dev-requirements.in
+    #   pytest-mock
+pytest-mock==3.12.0 \
+    --hash=sha256:0972719a7263072da3a21c7f4773069bcc7486027d7e8e1f81d98a47e701bc4f \
+    --hash=sha256:31a40f038c22cad32287bb43932054451ff5583ff094bca6f675df2f8bc1a6e9
     # via -r dev-requirements.in
 tomli==2.0.1 \
     --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \
diff --git a/sbom.py b/sbom.py
index 43a2e9da..dac85678 100644
--- a/sbom.py
+++ b/sbom.py
@@ -12,12 +12,16 @@
 
 import datetime
 import hashlib
+import io
 import json
 import os
 import re
 import subprocess
 import sys
 import tarfile
+import zipfile
+from urllib.request import urlopen
+import typing
 
 
 def spdx_id(value: str) -> str:
@@ -132,6 +136,208 @@ def recursive_sort_in_place(value):
     recursive_sort_in_place(sbom_data)
 
 
+def fetch_package_metadata_from_pypi(project: str, version: str, filename: str | None = None) -> tuple[str, str]:
+    """
+    Fetches the download location and SHA256 checksum of a release artifact
+    from PyPI using the JSON API for the given project and version.
+    If we're given a filename then we match with that, otherwise we use wheels.
+
+    Returns a tuple of (download URL, SHA256 checksum) for the matching artifact.
+
+    Raises ValueError if the request fails, the response can't be parsed
+    or is missing expected fields, or no artifact matches.
+    """
+    # Get the package download URL from PyPI.
+    try:
+        # Close the connection as soon as the response has been read.
+        with urlopen(f"https://pypi.org/pypi/{project}/{version}/json") as response:
+            raw_text = response.read()
+        release_metadata = json.loads(raw_text)
+        url: dict[str, typing.Any]
+
+        # Look for a matching artifact filename so we can
+        # report its download URL and remote checksum.
+        for url in release_metadata["urls"]:
+            # pip can only use Python-only dependencies, so there's
+            # no risk of picking the 'incorrect' wheel here.
+            if (
+                (filename is None and url["packagetype"] == "bdist_wheel")
+                or (filename is not None and url["filename"] == filename)
+            ):
+                break
+        else:
+            raise ValueError(f"No matching filename on PyPI for '{filename}'")
+
+        # Successfully found the download URL for the matching artifact.
+        download_url = url["url"]
+        checksum_sha256 = url["digests"]["sha256"]
+        return download_url, checksum_sha256
+
+    # KeyError means the response was missing a field we rely on.
+    except (OSError, ValueError, KeyError) as e:
+        raise ValueError(f"Couldn't fetch metadata for project '{project}' from PyPI: {e}") from e
+
+
+def remove_pip_from_sbom(sbom_data: dict[str, typing.Any]) -> None:
+    """
+    Removes pip and its dependencies from the SBOM data, in place.
+    This is only necessary if there's potential we get
+    pip SBOM data from the CPython source SBOM.
+
+    Only direct 'DEPENDS_ON' relationships from pip are followed
+    when looking for dependencies to remove.
+    """
+    sbom_pip_spdx_id = spdx_id("SPDXRef-PACKAGE-pip")
+    sbom_spdx_ids_to_remove = {sbom_pip_spdx_id}
+
+    # Find all package SPDXIDs that pip depends on.
+    for sbom_relationship in sbom_data["relationships"]:
+        if (
+            sbom_relationship["relationshipType"] == "DEPENDS_ON"
+            and sbom_relationship["spdxElementId"] == sbom_pip_spdx_id
+        ):
+            sbom_spdx_ids_to_remove.add(sbom_relationship["relatedSpdxElement"])
+
+    # Remove all the packages and every relationship that references
+    # them from either side so no dangling SPDXIDs are left behind.
+    sbom_data["packages"] = [
+        sbom_package for sbom_package in sbom_data["packages"]
+        if sbom_package["SPDXID"] not in sbom_spdx_ids_to_remove
+    ]
+    sbom_data["relationships"] = [
+        sbom_relationship for sbom_relationship in sbom_data["relationships"]
+        if sbom_relationship["spdxElementId"] not in sbom_spdx_ids_to_remove
+        and sbom_relationship["relatedSpdxElement"] not in sbom_spdx_ids_to_remove
+    ]
+
+
+def create_pip_sbom_from_wheel(
+    sbom_data: dict[str, typing.Any],
+    pip_wheel_filename: str,
+    pip_wheel_bytes: bytes
+) -> None:
+    """
+    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
+    automatable to discover the metadata we need like the version and checksums
+    so let's do that on behalf of our friends at the PyPA. This function also
+    discovers vendored packages within pip and fetches their metadata.
+
+    'sbom_data' is modified in place: any existing pip entries are replaced
+    with packages and relationships built from the given wheel.
+
+    Raises ValueError if the wheel's checksum doesn't match PyPI, if
+    PyPI metadata can't be fetched, or if 'vendor.txt' can't be parsed.
+    """
+    # Remove pip from the SBOM in case it's included in the CPython source code SBOM.
+    remove_pip_from_sbom(sbom_data)
+
+    # Wheel filename format puts the version right after the project name.
+    pip_version = pip_wheel_filename.split("-")[1]
+    pip_checksum_sha256 = hashlib.sha256(pip_wheel_bytes).hexdigest()
+
+    pip_download_url, pip_actual_sha256 = fetch_package_metadata_from_pypi(
+        project="pip",
+        version=pip_version,
+        filename=pip_wheel_filename,
+    )
+    if pip_actual_sha256 != pip_checksum_sha256:
+        raise ValueError("pip wheel checksum doesn't match PyPI")
+
+    # Parse 'pip/_vendor/vendor.txt' from the wheel for sub-dependencies.
+    with zipfile.ZipFile(io.BytesIO(pip_wheel_bytes)) as whl:
+        vendor_txt_data = whl.read("pip/_vendor/vendor.txt").decode()
+
+    # With this version regex we're assuming that pip isn't using pre-releases.
+    # If any version doesn't match we get a failure below, so we're safe doing this.
+    version_pin_re = re.compile(r"^([a-zA-Z0-9_.-]+)==([0-9.]*[0-9])$")
+    sbom_pip_dependency_spdx_ids = set()
+    for line in vendor_txt_data.splitlines():
+        line = line.partition("#")[0].strip()  # Strip comments and whitespace.
+        if not line:  # Skip empty lines.
+            continue
+
+        # Non-empty lines we must be able to match. Raise explicitly
+        # instead of asserting so the check survives 'python -O'.
+        match = version_pin_re.match(line)
+        if match is None:
+            raise ValueError(f"Unparseable line in vendor.txt: {line!r}")
+
+        # Parse out and normalize the project name.
+        project_name, project_version = match.groups()
+        project_name = project_name.lower()
+
+        # Fetch the metadata from PyPI
+        project_download_url, project_checksum_sha256 = (
+            fetch_package_metadata_from_pypi(project_name, project_version)
+        )
+
+        # Update our SBOM data with what we received from PyPI.
+        sbom_project_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{project_name}")
+        sbom_pip_dependency_spdx_ids.add(sbom_project_spdx_id)
+        sbom_data["packages"].append({
+            "SPDXID": sbom_project_spdx_id,
+            "name": project_name,
+            "versionInfo": project_version,
+            "downloadLocation": project_download_url,
+            "checksums": [
+                {"algorithm": "SHA256", "checksumValue": project_checksum_sha256}
+            ],
+            "externalRefs": [
+                {
+                    "referenceCategory": "PACKAGE_MANAGER",
+                    "referenceLocator": f"pkg:pypi/{project_name}@{project_version}",
+                    "referenceType": "purl",
+                },
+            ],
+            "primaryPackagePurpose": "SOURCE",
+            "licenseConcluded": "NOASSERTION",
+        })
+
+    # Now we add pip to the SBOM and dependency relationships
+    sbom_pip_spdx_id = spdx_id("SPDXRef-PACKAGE-pip")
+    sbom_data["packages"].append(
+        {
+            "SPDXID": sbom_pip_spdx_id,
+            "name": "pip",
+            "versionInfo": pip_version,
+            "originator": "Organization: Python Packaging Authority",
+            "licenseConcluded": "NOASSERTION",
+            "downloadLocation": pip_download_url,
+            "checksums": [
+                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
+            ],
+            "externalRefs": [
+                {
+                    "referenceCategory": "SECURITY",
+                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
+                    "referenceType": "cpe23Type",
+                },
+                {
+                    "referenceCategory": "PACKAGE_MANAGER",
+                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
+                    "referenceType": "purl",
+                },
+            ],
+            "primaryPackagePurpose": "SOURCE",
+        }
+    )
+    for sbom_dep_spdx_id in sorted(sbom_pip_dependency_spdx_ids):
+        sbom_data["relationships"].append({
+            "spdxElementId": sbom_pip_spdx_id,
+            "relatedSpdxElement": sbom_dep_spdx_id,
+            "relationshipType": "DEPENDS_ON"
+        })
+
+    # Finally, CPython depends on pip.
+    sbom_data["relationships"].append(
+        {
+            "spdxElementId": "SPDXRef-PACKAGE-cpython",
+            "relatedSpdxElement": sbom_pip_spdx_id,
+            "relationshipType": "DEPENDS_ON",
+        }
+    )
+
+
 def create_sbom_for_source_tarball(tarball_path: str):
     """Stitches together an SBOM for a source tarball"""
     tarball_name = os.path.basename(tarball_path)
@@ -163,9 +369,9 @@ def create_sbom_for_source_tarball(tarball_path: str):
             "Tarball doesn't contain an SBOM at 'Misc/sbom.spdx.json'"
         ) from None
     sbom_bytes = tarball.extractfile(sbom_tarball_member).read()
+    sbom_data = json.loads(sbom_bytes)
 
-    sbom = json.loads(sbom_bytes)
-    sbom.update({
+    sbom_data.update({
         "SPDXID": "SPDXRef-DOCUMENT",
         "spdxVersion": "SPDX-2.3",
         "name": "CPython SBOM",
@@ -211,18 +417,36 @@ def create_sbom_for_source_tarball(tarball_path: str):
     }
 
     # The top-level CPython package depends on every vendored sub-package.
-    for sbom_package in sbom["packages"]:
-        sbom["relationships"].append({
+    for sbom_package in sbom_data["packages"]:
+        sbom_data["relationships"].append({
             "spdxElementId": sbom_cpython_package["SPDXID"],
             "relatedSpdxElement": sbom_package["SPDXID"],
             "relationshipType": "DEPENDS_ON",
         })
 
-    sbom["packages"].append(sbom_cpython_package)
+    sbom_data["packages"].append(sbom_cpython_package)
+
+    # Find the pip wheel in ensurepip in the tarball
+    for member in tarball.getmembers():
+        match = re.match(rf"^Python-{cpython_version}/Lib/ensurepip/_bundled/(pip-.*\.whl)$", member.name)
+        if match is not None:
+            pip_wheel_filename = match.group(1)
+            pip_wheel_bytes = tarball.extractfile(member).read()
+            break
+    else:
+        raise ValueError("Could not find pip wheel in 'Lib/ensurepip/_bundled/...'")
+
+    # Now add pip to the SBOM. We do this after the above step to avoid
+    # CPython being dependent on packages that pip is dependent on.
+    create_pip_sbom_from_wheel(
+        sbom_data=sbom_data,
+        pip_wheel_filename=pip_wheel_filename,
+        pip_wheel_bytes=pip_wheel_bytes
+    )
 
     # Extract all currently known files from the SBOM with their checksums.
     known_sbom_files = {}
-    for sbom_file in sbom["files"]:
+    for sbom_file in sbom_data["files"]:
         sbom_filename = sbom_file["fileName"]
 
         # Look for the expected SHA256 checksum.
@@ -267,7 +491,7 @@ def create_sbom_for_source_tarball(tarball_path: str):
         # If this is a new file, then it's a part of the 'CPython' SBOM package.
         else:
             sbom_file_spdx_id = spdx_id(f"SPDXRef-FILE-{member_name_no_prefix}")
-            sbom["files"].append(
+            sbom_data["files"].append(
                 {
                     "SPDXID": sbom_file_spdx_id,
                     "fileName": member_name_no_prefix,
@@ -283,7 +507,7 @@ def create_sbom_for_source_tarball(tarball_path: str):
                     ],
                 }
             )
-            sbom["relationships"].append(
+            sbom_data["relationships"].append(
                 {
                     "spdxElementId": sbom_cpython_package["SPDXID"],
                     "relatedSpdxElement": sbom_file_spdx_id,
@@ -300,7 +524,7 @@ def create_sbom_for_source_tarball(tarball_path: str):
         )
 
     # Final relationship, this SBOM describes the CPython package.
-    sbom["relationships"].append(
+    sbom_data["relationships"].append(
         {
             "spdxElementId": "SPDXRef-DOCUMENT",
             "relatedSpdxElement": sbom_cpython_package["SPDXID"],
@@ -310,17 +534,17 @@ def create_sbom_for_source_tarball(tarball_path: str):
 
     # Apply the 'supplier' tag to every package since we're shipping
     # the package in the tarball itself. Originator field is used for maintainers.
-    for sbom_package in sbom["packages"]:
+    for sbom_package in sbom_data["packages"]:
         sbom_package["supplier"] = "Organization: Python Software Foundation"
         sbom_package["filesAnalyzed"] = True
 
     # Calculate the 'packageVerificationCode' values for files in packages.
-    calculate_package_verification_codes(sbom)
+    calculate_package_verification_codes(sbom_data)
 
     # Normalize SBOM structures for reproducibility.
-    normalize_sbom_data(sbom)
+    normalize_sbom_data(sbom_data)
 
-    return sbom
+    return sbom_data
 
 
 def main() -> None:
diff --git a/tests/test_sbom.py b/tests/test_sbom.py
index f79b8c46..427e895c 100644
--- a/tests/test_sbom.py
+++ b/tests/test_sbom.py
@@ -1,6 +1,9 @@
-import pytest
+import json
 import random
 import hashlib
+
+import pytest
+
 import sbom
 
 
@@ -58,3 +61,60 @@ def test_normalization():
         "a": [1, 2, 3, {"b": ["c", 4, ["2", 7, True, {}]]}],
         "b": [["a", 1, 2], ["b", 1, 2]]
     }
+
+
+def test_fetch_project_metadata_from_pypi(mocker):
+    """Checks artifact selection against a mocked PyPI JSON API response."""
+
+    mock_urlopen = mocker.patch("sbom.urlopen")
+    # urlopen() is used as a context manager, so the response object
+    # is whatever '__enter__' returns.
+    mock_response = mock_urlopen.return_value.__enter__.return_value
+
+    # This is only a partial response using the information
+    # that this function uses.
+    mock_response.read.return_value = json.dumps({
+        "urls": [
+            {
+                "digests": {
+                    "blake2b_256": "94596638090c25e9bc4ce0c42817b5a234e183872a1129735a9330c472cc2056",
+                    "md5": "1331aabb4d1a2677f493effeebda3605",
+                    "sha256": "ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"
+                },
+                "filename": "pip-24.0.tar.gz",
+                "packagetype": "sdist",
+                "url": "https://files.pythonhosted.org/packages/.../pip-24.0.tar.gz",
+            },
+            {
+                "digests": {
+                    "blake2b_256": "8a6a19e9fe04fca059ccf770861c7d5721ab4c2aebc539889e97c7977528a53b",
+                    "md5": "74e3c5e4082113b1239ca0e9abfd1e82",
+                    "sha256": "ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"
+                },
+                "filename": "pip-24.0-py3-none-any.whl",
+                "packagetype": "bdist_wheel",
+                "url": "https://files.pythonhosted.org/packages/.../pip-24.0-py3-none-any.whl",
+            }
+        ]
+    }).encode()
+
+    # Default filename is the wheel
+    download_url, checksum_sha256 = sbom.fetch_package_metadata_from_pypi(
+        project="pip",
+        version="24.0",
+    )
+
+    mock_urlopen.assert_called_once_with("https://pypi.org/pypi/pip/24.0/json")
+    assert download_url == "https://files.pythonhosted.org/packages/.../pip-24.0-py3-none-any.whl"
+    assert checksum_sha256 == "ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"
+
+    # If we ask for the sdist (which we don't do normally)
+    # then it'll be returned instead.
+    download_url, checksum_sha256 = sbom.fetch_package_metadata_from_pypi(
+        project="pip",
+        version="24.0",
+        filename="pip-24.0.tar.gz"
+    )
+
+    assert download_url == "https://files.pythonhosted.org/packages/.../pip-24.0.tar.gz"
+    assert checksum_sha256 == "ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"