11"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
2-
2+ import os
33import re
44import hashlib
55import json
66import glob
77import pathlib
88import subprocess
9+ import sys
910import typing
11+ from urllib .request import urlopen
12+
13+ CPYTHON_ROOT_DIR = pathlib .Path (__file__ ).parent .parent .parent
1014
1115# Before adding a new entry to this list, double check that
1216# the license expression is a valid SPDX license expression:
@@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple):
4347# values to 'exclude' if we create new files within tracked
4448# directories that aren't sourced from third-party packages.
4549PACKAGE_TO_FILES = {
50+ # NOTE: pip's entry in this structure is automatically generated in
51+ # the 'discover_pip_sbom_package()' function below.
4652 "mpdecimal" : PackageFiles (
4753 include = ["Modules/_decimal/libmpdec/**" ]
4854 ),
4955 "expat" : PackageFiles (
5056 include = ["Modules/expat/**" ]
5157 ),
52- "pip" : PackageFiles (
53- include = ["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl" ]
54- ),
5558 "macholib" : PackageFiles (
5659 include = ["Lib/ctypes/macholib/**" ],
5760 exclude = [
@@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
106109 return sorted ([line .split ()[- 1 ] for line in git_check_ignore_lines if line .startswith ("::" )])
107110
108111
def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
    """Regenerate pip's SBOM package entry from the wheel bundled in ensurepip.

    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA.

    Side effects:
      * ``PACKAGE_TO_FILES["pip"]`` is set to a ``PackageFiles`` entry that
        includes the discovered wheel.
      * Any package named "pip" in ``sbom_data["packages"]`` is dropped and a
        freshly generated entry is appended in its place.

    The process exits with status 1 when there isn't exactly one file starting
    with "pip-" in 'Lib/ensurepip/_bundled', or when the PyPI lookup fails
    (network error, unexpected response, no matching artifact filename, or a
    SHA-256 mismatch between the local wheel and the PyPI artifact).
    """
    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"

    # Find the hopefully one pip wheel in the bundled directory.
    pip_wheels = [
        wheel_filename
        for wheel_filename in os.listdir(ensurepip_bundled_dir)
        if wheel_filename.startswith("pip-")
    ]
    if len(pip_wheels) != 1:
        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
        sys.exit(1)
    pip_wheel_filename = pip_wheels[0]

    # Add the wheel filename to the list of files so the SBOM file
    # and relationship generator can work its magic on the wheel too.
    # Item assignment mutates the module-level dict in place, so no
    # 'global' declaration is required.
    PACKAGE_TO_FILES["pip"] = PackageFiles(
        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
    )

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(
        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
    ).hexdigest()

    # Get pip's download location from PyPI. Check that the checksum is correct too.
    try:
        # Use the response as a context manager so the HTTP connection is
        # always released, even if reading the body fails.
        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
            raw_text = response.read()
        pip_release_metadata = json.loads(raw_text)
        url: dict[str, typing.Any]

        # Look for a matching artifact filename and then check
        # its remote checksum to the local one.
        for url in pip_release_metadata["urls"]:
            if url["filename"] == pip_wheel_filename:
                break
        else:
            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
        if url["digests"]["sha256"] != pip_checksum_sha256:
            raise ValueError("Local pip checksum doesn't match artifact on PyPI")

        # Successfully found the download URL for the matching artifact.
        pip_download_url = url["url"]

    # OSError covers network failures, ValueError covers invalid JSON and the
    # explicit mismatches above, KeyError covers a response missing the
    # fields this function reads.
    except (OSError, ValueError, KeyError) as e:
        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
        sys.exit(1)

    # Remove pip from the existing SBOM packages if it's there
    # and then overwrite its entry with our own generated one.
    sbom_data["packages"] = [
        sbom_package
        for sbom_package in sbom_data["packages"]
        if sbom_package["name"] != "pip"
    ]
    sbom_data["packages"].append(
        {
            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "MIT",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
199+
200+
109201def main () -> None :
110- root_dir = pathlib .Path (__file__ ).parent .parent .parent
111- sbom_path = root_dir / "Misc/sbom.spdx.json"
202+ sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
112203 sbom_data = json .loads (sbom_path .read_bytes ())
113204
114- # Make a bunch of assertions about the SBOM data to ensure it's consistent.
205+ # Insert pip's SBOM metadata from the wheel.
206+ discover_pip_sbom_package (sbom_data )
207+
208+ # Ensure all packages in this tool are represented also in the SBOM file.
115209 assert {package ["name" ] for package in sbom_data ["packages" ]} == set (PACKAGE_TO_FILES )
210+
211+ # Make a bunch of assertions about the SBOM data to ensure it's consistent.
116212 for package in sbom_data ["packages" ]:
117213
118214 # Properties and ID must be properly formed.
@@ -138,17 +234,17 @@ def main() -> None:
138234 for include in sorted (files .include ):
139235
140236 # Find all the paths and then filter them through .gitignore.
141- paths = glob .glob (include , root_dir = root_dir , recursive = True )
237+ paths = glob .glob (include , root_dir = CPYTHON_ROOT_DIR , recursive = True )
142238 paths = filter_gitignored_paths (paths )
143239 assert paths , include # Make sure that every value returns something!
144240
145241 for path in paths :
146242 # Skip directories and excluded files
147- if not (root_dir / path ).is_file () or path in exclude :
243+ if not (CPYTHON_ROOT_DIR / path ).is_file () or path in exclude :
148244 continue
149245
150246 # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
151- data = (root_dir / path ).read_bytes ()
247+ data = (CPYTHON_ROOT_DIR / path ).read_bytes ()
152248 checksum_sha1 = hashlib .sha1 (data ).hexdigest ()
153249 checksum_sha256 = hashlib .sha256 (data ).hexdigest ()
154250
0 commit comments