From 1a0eab11fec431f46799eb7e03cceb7af6ac3842 Mon Sep 17 00:00:00 2001 From: Dustin Ingram Date: Tue, 13 Feb 2024 00:09:04 +0000 Subject: [PATCH 1/3] Add a metadata backfill task --- tests/unit/packaging/test_tasks.py | 97 ++++++++++++++++++++++++++++++ warehouse/packaging/__init__.py | 4 ++ warehouse/packaging/tasks.py | 67 +++++++++++++++++++++ 3 files changed, 168 insertions(+) diff --git a/tests/unit/packaging/test_tasks.py b/tests/unit/packaging/test_tasks.py index 45c4c62ef424..e7d006196bb1 100644 --- a/tests/unit/packaging/test_tasks.py +++ b/tests/unit/packaging/test_tasks.py @@ -10,10 +10,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import builtins import tempfile from contextlib import contextmanager from itertools import product +from pathlib import Path import pretend import pytest @@ -24,8 +26,11 @@ import warehouse.packaging.tasks from warehouse.accounts.models import WebAuthn +from warehouse.metrics.interfaces import IMetricsService +from warehouse.packaging.interfaces import IFileStorage from warehouse.packaging.models import Description from warehouse.packaging.tasks import ( + backfill_metadata, check_file_cache_tasks_outstanding, compute_2fa_metrics, compute_packaging_metrics, @@ -888,3 +893,95 @@ def test_compute_2fa_metrics(db_request, monkeypatch): pretend.call("warehouse.2fa.total_users_with_webauthn_enabled", 1), pretend.call("warehouse.2fa.total_users_with_two_factor_enabled", 2), ] + + +def test_backfill_metadata(db_request, monkeypatch, metrics): + project = ProjectFactory() + release1 = ReleaseFactory(project=project) + release2 = ReleaseFactory(project=project) + FileFactory(release=release1, packagetype="sdist") + FileFactory( + release=release1, + packagetype="bdist_wheel", + metadata_file_sha256_digest="d34db33f", + ) + FileFactory(release=release2, packagetype="sdist") + backfillable_file = FileFactory( + release=release2, packagetype="bdist_wheel", 
metadata_file_sha256_digest=None + ) + + metadata_contents = b"some\nmetadata\ncontents" + stub_dist = pretend.stub( + _dist=pretend.stub(_files={Path("METADATA"): metadata_contents}) + ) + stub_session = pretend.stub() + dist_from_wheel_url = pretend.call_recorder( + lambda project_name, file_url, session: stub_dist + ) + monkeypatch.setattr( + warehouse.packaging.tasks, "dist_from_wheel_url", dist_from_wheel_url + ) + monkeypatch.setattr(warehouse.packaging.tasks, "PipSession", lambda: stub_session) + archive_service = pretend.stub( + store=pretend.call_recorder(lambda path_out, path_in, meta: None), + ) + db_request.find_service = pretend.call_recorder( + lambda iface, name=None, context=None: { + IFileStorage: { + "archive": archive_service, + "metrics": metrics, + }, + IMetricsService: {None: metrics}, + }[iface][name] + ) + + @contextmanager + def mock_temporary_directory(): + yield "/tmp/wutang" + + monkeypatch.setattr(tempfile, "TemporaryDirectory", mock_temporary_directory) + + mock_write = pretend.call_recorder(lambda value: None) + + @contextmanager + def mock_open(filename, perms): + yield pretend.stub(write=mock_write) + + monkeypatch.setattr(builtins, "open", mock_open) + + backfill_metadata(db_request) + + assert dist_from_wheel_url.calls == [ + pretend.call( + project.normalized_name, + f"https://test-files.pythonhosted.org/packages/{backfillable_file.path}", + stub_session, + ) + ] + + assert backfillable_file.metadata_file_sha256_digest == ( + "e85ce4c9e2d2eddba19c396ed04470efaa2a9c2a6b3c6463e6876a41e55d828d" + ) + assert backfillable_file.metadata_file_blake2_256_digest == ( + "39cc629504be4087d48889e8666392bd379b91e1826e269cd8467bb29298da82" + ) + assert archive_service.store.calls == [ + pretend.call( + backfillable_file.metadata_path, + f"/tmp/wutang/{backfillable_file.filename}.metadata", + meta={ + "project": project.normalized_name, + "version": release2.version, + "package-type": backfillable_file.packagetype, + "python-version": 
backfillable_file.python_version, + }, + ), + ] + + assert metrics.increment.calls == [ + pretend.call("warehouse.packaging.metadata_backfill.files"), + pretend.call("warehouse.packaging.metadata_backfill.tasks"), + ] + assert metrics.gauge.calls == [ + pretend.call("warehouse.packaging.metadata_backfill.remaining", 0) + ] diff --git a/warehouse/packaging/__init__.py b/warehouse/packaging/__init__.py index a219800967b8..dd92f01825a0 100644 --- a/warehouse/packaging/__init__.py +++ b/warehouse/packaging/__init__.py @@ -27,6 +27,7 @@ from warehouse.packaging.models import File, Project, Release, Role from warehouse.packaging.services import project_service_factory from warehouse.packaging.tasks import ( + backfill_metadata, check_file_cache_tasks_outstanding, compute_2fa_metrics, compute_packaging_metrics, @@ -193,3 +194,6 @@ def includeme(config): # TODO: restore this # if config.get_settings().get("warehouse.release_files_table"): # config.add_periodic_task(crontab(minute=0), sync_bigquery_release_files) + + # Backfill wheel metadata + config.add_periodic_task(crontab(minute="*/5"), backfill_metadata) diff --git a/warehouse/packaging/tasks.py b/warehouse/packaging/tasks.py index 7a43e6c3d50f..98b5cefe8be0 100644 --- a/warehouse/packaging/tasks.py +++ b/warehouse/packaging/tasks.py @@ -11,13 +11,19 @@ # limitations under the License. 
import datetime +import hashlib import logging +import os import tempfile from collections import namedtuple from itertools import product +from pathlib import Path from google.cloud.bigquery import LoadJobConfig +from pip._internal.network.lazy_wheel import dist_from_wheel_url +from pip._internal.network.session import PipSession +from sqlalchemy import desc from sqlalchemy.orm import joinedload from warehouse import tasks @@ -54,6 +60,67 @@ def sync_file_to_cache(request, file_id): file.cached = True +@tasks.task(ignore_result=True, acks_late=True) +def backfill_metadata(request): + metrics = request.find_service(IMetricsService, context=None) + is_pypi = request.registry.settings.get("warehouse.domain") == "pypi.org" + subdomain = "files" if is_pypi else "test-files" + base_url = f"https://{subdomain}.pythonhosted.org/packages" + + storage = request.find_service(IFileStorage, name="archive") + session = PipSession() + + # Get all wheel files without metadata in reverse chronological order + files_without_metadata = ( + request.db.query(File) + .filter(File.packagetype == "bdist_wheel") + .filter(File.metadata_file_sha256_digest.is_(None)) + .order_by(desc(File.upload_time)) + ) + + with tempfile.TemporaryDirectory() as tmpdir: + for file_ in files_without_metadata.yield_per(100): + # Use pip to download just the metadata of the wheel + file_url = f"{base_url}/{file_.path}" + lazy_dist = dist_from_wheel_url( + file_.release.project.normalized_name, file_url, session + ) + wheel_metadata_contents = lazy_dist._dist._files[Path("METADATA")] + + # Write the metadata to a temporary file + temporary_filename = os.path.join(tmpdir, file_.filename) + ".metadata" + with open(temporary_filename, "wb") as fp: + fp.write(wheel_metadata_contents) + + # Hash the metadata and add it to the File instance + file_.metadata_file_sha256_digest = ( + hashlib.sha256(wheel_metadata_contents).hexdigest().lower() + ) + file_.metadata_file_blake2_256_digest = ( + 
hashlib.blake2b(wheel_metadata_contents, digest_size=256 // 8) + .hexdigest() + .lower() + ) + + # Store the metadata file via our object storage backend + storage.store( + file_.metadata_path, + temporary_filename, + meta={ + "project": file_.release.project.normalized_name, + "version": file_.release.version, + "package-type": file_.packagetype, + "python-version": file_.python_version, + }, + ) + metrics.increment("warehouse.packaging.metadata_backfill.files") + metrics.increment("warehouse.packaging.metadata_backfill.tasks") + metrics.gauge( + "warehouse.packaging.metadata_backfill.remaining", + files_without_metadata.count(), + ) + + @tasks.task(ignore_result=True, acks_late=True) def compute_packaging_metrics(request): counts = dict( From 8454b27868524848a25119a08de99bfc0d50b899 Mon Sep 17 00:00:00 2001 From: Dustin Ingram Date: Tue, 13 Feb 2024 00:35:25 +0000 Subject: [PATCH 2/3] Use files.url --- tests/unit/packaging/test_tasks.py | 6 +++++- warehouse/packaging/tasks.py | 6 ++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/unit/packaging/test_tasks.py b/tests/unit/packaging/test_tasks.py index e7d006196bb1..dcc49776775d 100644 --- a/tests/unit/packaging/test_tasks.py +++ b/tests/unit/packaging/test_tasks.py @@ -949,12 +949,16 @@ def mock_open(filename, perms): monkeypatch.setattr(builtins, "open", mock_open) + db_request.registry.settings[ + "files.url" + ] = "https://files.example.com/packages/{path}" + backfill_metadata(db_request) assert dist_from_wheel_url.calls == [ pretend.call( project.normalized_name, - f"https://test-files.pythonhosted.org/packages/{backfillable_file.path}", + f"https://files.example.com/packages/{backfillable_file.path}", stub_session, ) ] diff --git a/warehouse/packaging/tasks.py b/warehouse/packaging/tasks.py index 98b5cefe8be0..bab27dbf5b19 100644 --- a/warehouse/packaging/tasks.py +++ b/warehouse/packaging/tasks.py @@ -63,9 +63,7 @@ def sync_file_to_cache(request, file_id): 
@tasks.task(ignore_result=True, acks_late=True) def backfill_metadata(request): metrics = request.find_service(IMetricsService, context=None) - is_pypi = request.registry.settings.get("warehouse.domain") == "pypi.org" - subdomain = "files" if is_pypi else "test-files" - base_url = f"https://{subdomain}.pythonhosted.org/packages" + base_url = request.registry.settings.get("files.url") storage = request.find_service(IFileStorage, name="archive") session = PipSession() @@ -81,7 +79,7 @@ def backfill_metadata(request): with tempfile.TemporaryDirectory() as tmpdir: for file_ in files_without_metadata.yield_per(100): # Use pip to download just the metadata of the wheel - file_url = f"{base_url}/{file_.path}" + file_url = base_url.format(path=file_.path) lazy_dist = dist_from_wheel_url( file_.release.project.normalized_name, file_url, session ) From c8a6a88d9b846d952eaab80858f7401efafe1c22 Mon Sep 17 00:00:00 2001 From: Dustin Ingram Date: Tue, 13 Feb 2024 16:41:24 +0000 Subject: [PATCH 3/3] Also write metadata file to cache --- tests/unit/packaging/test_tasks.py | 37 ++++++++++++++++++------------ warehouse/packaging/tasks.py | 16 +++++++++++-- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/tests/unit/packaging/test_tasks.py b/tests/unit/packaging/test_tasks.py index dcc49776775d..0b188493a708 100644 --- a/tests/unit/packaging/test_tasks.py +++ b/tests/unit/packaging/test_tasks.py @@ -922,14 +922,17 @@ def test_backfill_metadata(db_request, monkeypatch, metrics): warehouse.packaging.tasks, "dist_from_wheel_url", dist_from_wheel_url ) monkeypatch.setattr(warehouse.packaging.tasks, "PipSession", lambda: stub_session) - archive_service = pretend.stub( + archive_storage = pretend.stub( + store=pretend.call_recorder(lambda path_out, path_in, meta: None), + ) + cache_storage = pretend.stub( store=pretend.call_recorder(lambda path_out, path_in, meta: None), ) db_request.find_service = pretend.call_recorder( lambda iface, name=None, context=None: { IFileStorage: 
{ - "archive": archive_service, - "metrics": metrics, + "archive": archive_storage, + "cache": cache_storage, }, IMetricsService: {None: metrics}, }[iface][name] @@ -969,18 +972,22 @@ def mock_open(filename, perms): assert backfillable_file.metadata_file_blake2_256_digest == ( "39cc629504be4087d48889e8666392bd379b91e1826e269cd8467bb29298da82" ) - assert archive_service.store.calls == [ - pretend.call( - backfillable_file.metadata_path, - f"/tmp/wutang/{backfillable_file.filename}.metadata", - meta={ - "project": project.normalized_name, - "version": release2.version, - "package-type": backfillable_file.packagetype, - "python-version": backfillable_file.python_version, - }, - ), - ] + assert ( + archive_storage.store.calls + == cache_storage.store.calls + == [ + pretend.call( + backfillable_file.metadata_path, + f"/tmp/wutang/{backfillable_file.filename}.metadata", + meta={ + "project": project.normalized_name, + "version": release2.version, + "package-type": backfillable_file.packagetype, + "python-version": backfillable_file.python_version, + }, + ), + ] + ) assert metrics.increment.calls == [ pretend.call("warehouse.packaging.metadata_backfill.files"), diff --git a/warehouse/packaging/tasks.py b/warehouse/packaging/tasks.py index bab27dbf5b19..bf548f3bb73b 100644 --- a/warehouse/packaging/tasks.py +++ b/warehouse/packaging/tasks.py @@ -65,7 +65,8 @@ def backfill_metadata(request): metrics = request.find_service(IMetricsService, context=None) base_url = request.registry.settings.get("files.url") - storage = request.find_service(IFileStorage, name="archive") + archive_storage = request.find_service(IFileStorage, name="archive") + cache_storage = request.find_service(IFileStorage, name="cache") session = PipSession() # Get all wheel files without metadata in reverse chronological order @@ -101,7 +102,18 @@ def backfill_metadata(request): ) # Store the metadata file via our object storage backend - storage.store( + archive_storage.store( + file_.metadata_path, + 
temporary_filename, + meta={ + "project": file_.release.project.normalized_name, + "version": file_.release.version, + "package-type": file_.packagetype, + "python-version": file_.python_version, + }, + ) + # Write it to our storage cache as well + cache_storage.store( file_.metadata_path, temporary_filename, meta={