48 changes: 36 additions & 12 deletions tests/unit/packaging/test_tasks.py
@@ -32,6 +32,7 @@
from warehouse.packaging.models import Description
from warehouse.packaging.tasks import (
backfill_metadata,
backfill_metadata_individual,
check_file_cache_tasks_outstanding,
compute_2fa_metrics,
compute_packaging_metrics,
@@ -911,6 +912,38 @@ def test_backfill_metadata(db_request, monkeypatch, metrics):
release=release2, packagetype="bdist_wheel", metadata_file_sha256_digest=None
)

delay = pretend.call_recorder(lambda x: None)
db_request.task = pretend.call_recorder(lambda x: pretend.stub(delay=delay))
db_request.registry.settings["metadata_backfill.batch_size"] = 500

backfill_metadata(db_request)

assert db_request.task.calls == [pretend.call(backfill_metadata_individual)]
assert delay.calls == [pretend.call(backfillable_file.id)]

assert metrics.increment.calls == [
pretend.call("warehouse.packaging.metadata_backfill.tasks"),
]
assert metrics.gauge.calls == [
pretend.call("warehouse.packaging.metadata_backfill.remaining", 1)
]


def test_backfill_metadata_individual(db_request, monkeypatch, metrics):
project = ProjectFactory()
release1 = ReleaseFactory(project=project)
release2 = ReleaseFactory(project=project)
FileFactory(release=release1, packagetype="sdist")
FileFactory(
release=release1,
packagetype="bdist_wheel",
metadata_file_sha256_digest="d34db33f",
)
FileFactory(release=release2, packagetype="sdist")
backfillable_file = FileFactory(
release=release2, packagetype="bdist_wheel", metadata_file_sha256_digest=None
)

metadata_contents = b"some\nmetadata\ncontents"
stub_dist = pretend.stub(
_dist=pretend.stub(_files={Path("METADATA"): metadata_contents})
@@ -957,7 +990,7 @@ def mock_open(filename, perms):
"files.url"
] = "https://files.example.com/packages/{path}"

backfill_metadata(db_request)
backfill_metadata_individual(pretend.stub(), db_request, backfillable_file.id)

assert dist_from_wheel_url.calls == [
pretend.call(
@@ -992,10 +1025,6 @@ def mock_open(filename, perms):

assert metrics.increment.calls == [
pretend.call("warehouse.packaging.metadata_backfill.files"),
pretend.call("warehouse.packaging.metadata_backfill.tasks"),
]
assert metrics.gauge.calls == [
pretend.call("warehouse.packaging.metadata_backfill.remaining", 0)
]


@@ -1027,12 +1056,7 @@ def test_backfill_metadata_file_unbackfillable(db_request, monkeypatch, metrics)

assert backfillable_file.metadata_file_unbackfillable is False

backfill_metadata(db_request)
backfill_metadata_individual(pretend.stub(), db_request, backfillable_file.id)

assert backfillable_file.metadata_file_unbackfillable is True
assert metrics.increment.calls == [
pretend.call("warehouse.packaging.metadata_backfill.tasks"),
]
assert metrics.gauge.calls == [
pretend.call("warehouse.packaging.metadata_backfill.remaining", 0)
]
assert metrics.increment.calls == []
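
Aside: the tests above stub warehouse's task plumbing with the `pretend` library rather than running a real queue. A minimal self-contained sketch of that pattern (the task name and file id below are placeholders, not values from this PR):

import pretend

# Record calls to .delay() without executing anything.
delay = pretend.call_recorder(lambda *args: None)
# request.task(fn) returns a stub whose .delay is the recorder above.
task = pretend.call_recorder(lambda fn: pretend.stub(delay=delay))

task("backfill_metadata_individual").delay("some-file-id")

assert task.calls == [pretend.call("backfill_metadata_individual")]
assert delay.calls == [pretend.call("some-file-id")]

This is why the dispatcher test can assert on exactly which sub-tasks were scheduled, and with which file ids, without any worker running.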
1 change: 1 addition & 0 deletions tests/unit/test_config.py
@@ -252,6 +252,7 @@ def __init__(self):
"oidc.backend": "warehouse.oidc.services.OIDCPublisherService",
"warehouse.organizations.max_undecided_organization_applications": 3,
"reconcile_file_storages.batch_size": 100,
"metadata_backfill.batch_size": 500,
"gcloud.service_account_info": {},
}
if environment == config.Environment.development:
7 changes: 7 additions & 0 deletions warehouse/config.py
@@ -326,6 +326,13 @@ def configure(settings=None):
coercer=int,
default=100,
)
maybe_set(
settings,
"metadata_backfill.batch_size",
"METADATA_BACKFILL_BATCH_SIZE",
coercer=int,
default=500,
)
maybe_set_compound(settings, "billing", "backend", "BILLING_BACKEND")
maybe_set_compound(settings, "files", "backend", "FILES_BACKEND")
maybe_set_compound(settings, "archive_files", "backend", "ARCHIVE_FILES_BACKEND")
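
Aside: `maybe_set` is warehouse's helper for seeding a setting from an environment variable; its real implementation lives elsewhere in warehouse/config.py and is not part of this diff. A rough sketch of its assumed behavior, inferred only from the call sites above:

import os

def maybe_set(settings, name, envvar, coercer=None, default=None):
    # Assumed behavior: prefer the environment variable (coerced if a
    # coercer is given), otherwise fall back to the default, and never
    # clobber a value already present in settings.
    if envvar in os.environ:
        value = os.environ[envvar]
        settings.setdefault(name, coercer(value) if coercer is not None else value)
    elif default is not None:
        settings.setdefault(name, default)

Under that reading, METADATA_BACKFILL_BATCH_SIZE=250 in the environment yields the integer 250 for `settings["metadata_backfill.batch_size"]`, and an unset variable falls back to the default of 500.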
122 changes: 65 additions & 57 deletions warehouse/packaging/tasks.py
@@ -64,11 +64,7 @@ def sync_file_to_cache(request, file_id):
@tasks.task(ignore_result=True, acks_late=True)
def backfill_metadata(request):
metrics = request.find_service(IMetricsService, context=None)
base_url = request.registry.settings.get("files.url")

archive_storage = request.find_service(IFileStorage, name="archive")
cache_storage = request.find_service(IFileStorage, name="cache")
session = PipSession()
batch_size = request.registry.settings["metadata_backfill.batch_size"]

# Get all wheel files without metadata in reverse chronological order
files_without_metadata = (
@@ -78,65 +74,77 @@
.filter(File.metadata_file_unbackfillable.isnot(True))
.order_by(desc(File.upload_time))
)
for file_ in files_without_metadata.yield_per(100).limit(batch_size):
request.task(backfill_metadata_individual).delay(file_.id)

with tempfile.TemporaryDirectory() as tmpdir:
for file_ in files_without_metadata.yield_per(100).limit(500):
# Use pip to download just the metadata of the wheel
file_url = base_url.format(path=file_.path)
try:
lazy_dist = dist_from_wheel_url(
file_.release.project.normalized_name, file_url, session
)
wheel_metadata_contents = lazy_dist._dist._files[Path("METADATA")]
except UnsupportedWheel:
file_.metadata_file_unbackfillable = True
continue

# Write the metadata to a temporary file
temporary_filename = os.path.join(tmpdir, file_.filename) + ".metadata"
with open(temporary_filename, "wb") as fp:
fp.write(wheel_metadata_contents)

# Hash the metadata and add it to the File instance
file_.metadata_file_sha256_digest = (
hashlib.sha256(wheel_metadata_contents).hexdigest().lower()
)
file_.metadata_file_blake2_256_digest = (
hashlib.blake2b(wheel_metadata_contents, digest_size=256 // 8)
.hexdigest()
.lower()
)

# Store the metadata file via our object storage backend
archive_storage.store(
file_.metadata_path,
temporary_filename,
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
"package-type": file_.packagetype,
"python-version": file_.python_version,
},
)
# Write it to our storage cache as well
cache_storage.store(
file_.metadata_path,
temporary_filename,
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
"package-type": file_.packagetype,
"python-version": file_.python_version,
},
)
metrics.increment("warehouse.packaging.metadata_backfill.files")
metrics.increment("warehouse.packaging.metadata_backfill.tasks")
metrics.increment("warehouse.packaging.metadata_backfill.tasks")
metrics.gauge(
"warehouse.packaging.metadata_backfill.remaining",
files_without_metadata.count(),
)


@tasks.task(ignore_result=True, acks_late=True)
def backfill_metadata_individual(_task, request, file_id):
file_ = request.db.get(File, file_id)
base_url = request.registry.settings.get("files.url")
file_url = base_url.format(path=file_.path)
metrics = request.find_service(IMetricsService, context=None)
cache_storage = request.find_service(IFileStorage, name="cache")
archive_storage = request.find_service(IFileStorage, name="archive")
session = PipSession()

# Use pip to download just the metadata of the wheel
try:
lazy_dist = dist_from_wheel_url(
file_.release.project.normalized_name, file_url, session
)
wheel_metadata_contents = lazy_dist._dist._files[Path("METADATA")]
except UnsupportedWheel:
file_.metadata_file_unbackfillable = True
return

# Hash the metadata and add it to the File instance
file_.metadata_file_sha256_digest = (
hashlib.sha256(wheel_metadata_contents).hexdigest().lower()
)
file_.metadata_file_blake2_256_digest = (
hashlib.blake2b(wheel_metadata_contents, digest_size=256 // 8)
.hexdigest()
.lower()
)

with tempfile.TemporaryDirectory() as tmpdir:
# Write the metadata to a temporary file
temporary_filename = os.path.join(tmpdir, file_.filename) + ".metadata"
with open(temporary_filename, "wb") as fp:
fp.write(wheel_metadata_contents)

# Store the metadata file via our object storage backend
cache_storage.store(
file_.metadata_path,
temporary_filename,
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
"package-type": file_.packagetype,
"python-version": file_.python_version,
},
)
# Write it to our archive storage as well
archive_storage.store(
file_.metadata_path,
temporary_filename,
meta={
"project": file_.release.project.normalized_name,
"version": file_.release.version,
"package-type": file_.packagetype,
"python-version": file_.python_version,
},
)
metrics.increment("warehouse.packaging.metadata_backfill.files")


@tasks.task(ignore_result=True, acks_late=True)
def compute_packaging_metrics(request):
counts = dict(
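
Aside: the heart of the new per-file task is pip's lazy-wheel support, which uses HTTP range requests to read just the zip central directory and the METADATA member instead of downloading the whole wheel. A standalone sketch of that fetch, using the same pip-internal imports this module relies on (pip internals, so subject to change between pip releases; the project name and URL are hypothetical, and the `_dist._files` access mirrors the task above):

import hashlib
from pathlib import Path

from pip._internal.exceptions import UnsupportedWheel
from pip._internal.network.lazy_wheel import dist_from_wheel_url
from pip._internal.network.session import PipSession

# Hypothetical wheel; any server honoring Range requests should work.
project = "sampleproject"
url = "https://files.example.com/packages/sampleproject-1.0-py3-none-any.whl"

try:
    # Fetch only enough of the wheel to expose its metadata.
    lazy_dist = dist_from_wheel_url(project, url, PipSession())
    metadata = lazy_dist._dist._files[Path("METADATA")]
except UnsupportedWheel:
    # Mirrors the task's error handling for wheels that cannot be read lazily.
    print("wheel metadata could not be fetched lazily")
else:
    # The same two digests the task records on the File row.
    print(hashlib.sha256(metadata).hexdigest().lower())
    print(hashlib.blake2b(metadata, digest_size=256 // 8).hexdigest().lower())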