diff --git a/dev/environment b/dev/environment
index 90362c7b78fb..592bb96cc194 100644
--- a/dev/environment
+++ b/dev/environment
@@ -31,6 +31,7 @@ HIBP_API_KEY="something-not-real"
 DOCS_URL="https://pythonhosted.org/{project}/"
 FILES_BACKEND=warehouse.packaging.services.LocalFileStorage path=/var/opt/warehouse/packages/ url=http://localhost:9001/packages/{path}
+ARCHIVE_FILES_BACKEND=warehouse.packaging.services.LocalArchiveFileStorage path=/var/opt/warehouse/packages-archive/ url=http://localhost:9001/packages-archive/{path}
 SIMPLE_BACKEND=warehouse.packaging.services.LocalSimpleStorage path=/var/opt/warehouse/simple/ url=http://localhost:9001/simple/{path}
 DOCS_BACKEND=warehouse.packaging.services.LocalDocsStorage path=/var/opt/warehouse/docs/
 SPONSORLOGOS_BACKEND=warehouse.admin.services.LocalSponsorLogoStorage path=/var/opt/warehouse/sponsorlogos/
diff --git a/docker-compose.yml b/docker-compose.yml
index 5c86d3c7f6d6..99fae7287094 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,6 +3,7 @@ version: '3.9'
 volumes:
   simple:
   packages:
+  packages-archive:
   sponsorlogos:
   policies:
   vault:
@@ -98,6 +99,7 @@ services:
       - ./pyproject.toml:/opt/warehouse/src/pyproject.toml:z
       - .coveragerc:/opt/warehouse/src/.coveragerc:z
       - packages:/var/opt/warehouse/packages
+      - packages-archive:/var/opt/warehouse/packages-archive
       - sponsorlogos:/var/opt/warehouse/sponsorlogos
       - policies:/var/opt/warehouse/policies
       - simple:/var/opt/warehouse/simple
@@ -128,6 +130,7 @@ services:
     command: python -m http.server 9001
     volumes:
       - packages:/var/opt/warehouse/packages
+      - packages-archive:/var/opt/warehouse/packages-archive
      - sponsorlogos:/var/opt/warehouse/sponsorlogos
       - simple:/var/opt/warehouse/simple
     ports:
@@ -139,10 +142,13 @@
     command: hupper -m celery -A warehouse worker -B -S redbeat.RedBeatScheduler -l info
     volumes:
       - ./warehouse:/opt/warehouse/src/warehouse:z
+      - packages:/var/opt/warehouse/packages
+      - packages-archive:/var/opt/warehouse/packages-archive
     env_file: dev/environment
     environment:
       C_FORCE_ROOT: "1"
       FILES_BACKEND: "warehouse.packaging.services.LocalFileStorage path=/var/opt/warehouse/packages/ url=http://files:9001/packages/{path}"
+      ARCHIVE_FILES_BACKEND: "warehouse.packaging.services.LocalArchiveFileStorage path=/var/opt/warehouse/packages-archive/ url=http://files:9001/packages-archive/{path}"
       SIMPLE_BACKEND: "warehouse.packaging.services.LocalSimpleStorage path=/var/opt/warehouse/simple/ url=http://files:9001/simple/{path}"
 
   static:
diff --git a/pyproject.toml b/pyproject.toml
index 76d2a4619bc2..c64c2952e196 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ exclude = ["warehouse/locale/.*", "warehouse/migrations/versions.*"]
 module = [
     "automat.*",
     "bpython.*",  # https://github.com/bpython/bpython/issues/892
+    "b2sdk.*",  # https://github.com/Backblaze/b2-sdk-python/issues/148
     "celery.app.backends.*",
     "celery.backends.redis.*",
     "citext.*",
diff --git a/requirements/main.in b/requirements/main.in
index 273c2325e46b..369ab4dd165f 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -1,6 +1,7 @@
 alembic>=0.7.0
 Automat
 argon2-cffi
+b2sdk
 Babel
 bcrypt
 boto3
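The new `ARCHIVE_FILES_BACKEND` line in `dev/environment` follows Warehouse's compound-setting convention: the first whitespace-separated token is the dotted path of the storage class, and each following `key=value` token becomes a setting namespaced under the backend's base name. A minimal, hedged sketch of that parsing (the helper name here is hypothetical; the real work is done by `maybe_set_compound` in `warehouse/config.py`, shown later in this diff):

```python
def parse_compound_backend(value, base="archive_files"):
    # First token: the dotted service class; remaining tokens: key=value
    # pairs that land under "<base>.<key>" in the settings mapping.
    kind, *pairs = value.split()
    settings = {f"{base}.backend": kind}
    for pair in pairs:
        key, _, val = pair.partition("=")
        settings[f"{base}.{key}"] = val
    return settings

settings = parse_compound_backend(
    "warehouse.packaging.services.LocalArchiveFileStorage "
    "path=/var/opt/warehouse/packages-archive/ "
    "url=http://localhost:9001/packages-archive/{path}"
)
assert settings["archive_files.backend"].endswith("LocalArchiveFileStorage")
assert settings["archive_files.path"] == "/var/opt/warehouse/packages-archive/"
```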
diff --git a/requirements/main.txt b/requirements/main.txt
index b7a2fa725640..f27099194cbf 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -39,6 +39,10 @@ argon2-cffi-bindings==21.2.0 \
     --hash=sha256:f1152ac548bd5b8bcecfb0b0371f082037e47128653df2e8ba6e914d384f3c3e \
     --hash=sha256:f9f8b450ed0547e3d473fdc8612083fd08dd2120d6ac8f73828df9b7d45bb351
     # via argon2-cffi
+arrow==1.2.3 \
+    --hash=sha256:3934b30ca1b9f292376d9db15b19446088d12ec58629bc3f0da28fd55fb633a1 \
+    --hash=sha256:5a49ab92e3b7b71d96cd6bfcc4df14efefc9dfa96ea19045815914a6ab6b1fe2
+    # via b2sdk
 asn1crypto==1.5.1 \
     --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \
     --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67
@@ -55,6 +59,9 @@ automat==22.10.0 \
     --hash=sha256:c3164f8742b9dc440f3682482d32aaff7bb53f71740dd018533f9de286b64180 \
     --hash=sha256:e56beb84edad19dcc11d30e8d9b895f75deeb5ef5e96b84a467066b3b84bb04e
     # via -r requirements/main.in
+b2sdk==1.20.0 \
+    --hash=sha256:b394d9fbdada1a4ffc0837cd6c930351f5fccc24cd0af23e41edd850d67fb687
+    # via -r requirements/main.in
 babel==2.12.1 \
     --hash=sha256:b4246fb7677d3b98f501a39d43396d3cafdc8eadb045f4a31be01863f655c610 \
     --hash=sha256:cc2d99999cd01d44420ae725a21c9e3711b3aadc7976d6147f622d8581963455
@@ -843,6 +850,10 @@ limits==3.3.1 \
     --hash=sha256:df8685b1aff349b5199628ecdf41a9f339a35233d8e4fcd9c3e10002e4419b45 \
     --hash=sha256:dfc59ed5b4847e33a33b88ec16033bed18ce444ce6a76287a4e054db9a683861
     # via -r requirements/main.in
+logfury==1.0.1 \
+    --hash=sha256:130a5daceab9ad534924252ddf70482aa2c96662b3a3825a7d30981d03b76a26 \
+    --hash=sha256:b4f04be1701a1df644afc3384d6167d64c899f8036b7eefc3b6c570c6a9b290b
+    # via b2sdk
 lxml==4.9.2 \
     --hash=sha256:01d36c05f4afb8f7c20fd9ed5badca32a2029b93b1750f571ccc0b142531caf7 \
     --hash=sha256:04876580c050a8c5341d706dd464ff04fd597095cc8c023252566a8826505726 \
@@ -1327,6 +1338,7 @@ python-dateutil==2.8.2 \
     --hash=sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86 \
     --hash=sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9
     # via
+    #   arrow
     #   botocore
     #   celery-redbeat
     #   elasticsearch-dsl
@@ -1358,6 +1370,7 @@ requests==2.28.2 \
     --hash=sha256:98b1b2782e3c6c4904938b84c0eb932721069dfdb9134313beff7c83c2df24bf
     # via
     #   -r requirements/main.in
+    #   b2sdk
     #   datadog
     #   forcediphttpsadapter
     #   google-api-core
@@ -1475,6 +1488,10 @@ text-unidecode==1.3 \
     --hash=sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8 \
     --hash=sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93
     # via python-slugify
+tqdm==4.65.0 \
+    --hash=sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5 \
+    --hash=sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671
+    # via b2sdk
 transaction==3.1.0 \
     --hash=sha256:65d0b1ea92dbe7c4e3b237fb6bd8b41dea23d7459e7bdd8c3880bffdaf912fa4 \
     --hash=sha256:8376a959aa71821df1bdd7d066858a3f9f34b7f5f1c0a0e1efbd11d626895449
diff --git a/tests/conftest.py b/tests/conftest.py
index f61a9686ab2e..ec5e1cc2c3f9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -89,6 +89,7 @@ def metrics_timing(*args, **kwargs):
 def metrics():
     return pretend.stub(
         event=pretend.call_recorder(lambda *args, **kwargs: None),
+        gauge=pretend.call_recorder(lambda *args, **kwargs: None),
         increment=pretend.call_recorder(lambda *args, **kwargs: None),
         histogram=pretend.call_recorder(lambda *args, **kwargs: None),
         timing=pretend.call_recorder(lambda *args, **kwargs: None),
@@ -173,6 +174,13 @@ def pyramid_request(pyramid_services, jinja, remote_addr):
 
     dummy_request.registry.registerUtility(jinja, IJinja2Environment, name=".jinja2")
 
+    dummy_request._task_stub = pretend.stub(
+        delay=pretend.call_recorder(lambda *a, **kw: None)
+    )
+    dummy_request.task = pretend.call_recorder(
+        lambda *a, **kw: dummy_request._task_stub
+    )
+
     def localize(message, **kwargs):
         ts = TranslationString(message, **kwargs)
         return ts.interpolate()
@@ -264,6 +272,7 @@ def app_config(database):
         "ratelimit.url": "memory://",
         "elasticsearch.url": "https://localhost/warehouse",
         "files.backend": "warehouse.packaging.services.LocalFileStorage",
+        "archive_files.backend": "warehouse.packaging.services.LocalArchiveFileStorage",
         "simple.backend": "warehouse.packaging.services.LocalSimpleStorage",
         "docs.backend": "warehouse.packaging.services.LocalDocsStorage",
         "sponsorlogos.backend": "warehouse.admin.services.LocalSponsorLogoStorage",
@@ -273,6 +282,7 @@ def app_config(database):
            "warehouse.malware.services.PrinterMalwareCheckService"
         ),
         "files.url": "http://localhost:7000/",
+        "archive_files.url": "http://localhost:7000/archive",
         "sessions.secret": "123456",
         "sessions.url": "redis://localhost:0/",
         "statuspage.url": "https://2p66nmmycsj3.statuspage.io",
diff --git a/tests/unit/forklift/test_legacy.py b/tests/unit/forklift/test_legacy.py
index 240e3a18cdc0..6bbd2fee0a1a 100644
--- a/tests/unit/forklift/test_legacy.py
+++ b/tests/unit/forklift/test_legacy.py
@@ -48,7 +48,10 @@
     Release,
     Role,
 )
-from warehouse.packaging.tasks import update_bigquery_release_files
+from warehouse.packaging.tasks import (
+    sync_file_to_archive,
+    update_bigquery_release_files,
+)
 from warehouse.utils.security_policy import AuthenticationMethod
 
 from ...common.db.accounts import EmailFactory, UserFactory
@@ -1426,17 +1429,12 @@ def storage_service_store(path, file_path, *, meta):
             "warehouse.release_files_table": "example.pypi.distributions"
         }
 
-        update_bigquery = pretend.stub(
-            delay=pretend.call_recorder(lambda *a, **kw: None)
-        )
-        db_request.task = pretend.call_recorder(lambda *a, **kw: update_bigquery)
-
         resp = legacy.file_upload(db_request)
 
         assert resp.status_code == 200
         assert db_request.find_service.calls == [
             pretend.call(IMetricsService, context=None),
-            pretend.call(IFileStorage),
+            pretend.call(IFileStorage, name="primary"),
         ]
         assert len(storage_service.store.calls) == 2 if has_signature else 1
         assert storage_service.store.calls[0] == pretend.call(
@@ -1508,7 +1506,10 @@ def storage_service_store(path, file_path, *, meta):
             )
         ]
 
-        assert db_request.task.calls == [pretend.call(update_bigquery_release_files)]
+        assert db_request.task.calls == [
+            pretend.call(update_bigquery_release_files),
+            pretend.call(sync_file_to_archive),
+        ]
 
         assert metrics.increment.calls == [
             pretend.call("warehouse.upload.attempt"),
@@ -2567,6 +2568,47 @@ def test_upload_fails_with_unsafe_filename(
         assert resp.status_code == 400
         assert resp.status == "400 Cannot upload a file with '/' or '\\' in the name."
 
+    @pytest.mark.parametrize("character", [*(chr(x) for x in range(32)), chr(127)])
+    def test_upload_fails_with_disallowed_in_filename(
+        self, pyramid_config, db_request, character
+    ):
+
+        user = UserFactory.create()
+        pyramid_config.testing_securitypolicy(identity=user)
+        db_request.user = user
+        EmailFactory.create(user=user)
+        project = ProjectFactory.create()
+        release = ReleaseFactory.create(project=project, version="1.0")
+        RoleFactory.create(user=user, project=project)
+
+        filename = f"{project.name}{character}-{release.version}.tar.wat"
+
+        db_request.POST = MultiDict(
+            {
+                "metadata_version": "1.2",
+                "name": project.name,
+                "version": release.version,
+                "filetype": "sdist",
+                "md5_digest": "nope!",
+                "content": pretend.stub(
+                    filename=filename,
+                    file=io.BytesIO(b"a" * (legacy.MAX_FILESIZE + 1)),
+                    type="application/tar",
+                ),
+            }
+        )
+
+        with pytest.raises(HTTPBadRequest) as excinfo:
+            legacy.file_upload(db_request)
+
+        resp = excinfo.value
+
+        assert resp.status_code == 400
+        assert resp.status == (
+            "400 Cannot upload a file with non-printable characters (ordinals 0-31) "
+            "or the DEL character (ordinal 127) in the name."
+        )
+
     def test_upload_fails_without_user_permission(self, pyramid_config, db_request):
         user1 = UserFactory.create()
         EmailFactory.create(user=user1)
@@ -2786,7 +2828,7 @@ def storage_service_store(path, file_path, *, meta):
 
         assert resp.status_code == 200
         assert db_request.find_service.calls == [
             pretend.call(IMetricsService, context=None),
-            pretend.call(IFileStorage),
+            pretend.call(IFileStorage, name="primary"),
         ]
         assert storage_service.store.calls == [
             pretend.call(
@@ -2898,7 +2940,7 @@ def storage_service_store(path, file_path, *, meta):
 
         assert resp.status_code == 200
         assert db_request.find_service.calls == [
             pretend.call(IMetricsService, context=None),
-            pretend.call(IFileStorage),
+            pretend.call(IFileStorage, name="primary"),
         ]
         assert storage_service.store.calls == [
             pretend.call(
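The conftest change above gives every `pyramid_request` a recording `task` stub, which is what lets the upload tests assert on dispatched Celery tasks without a broker. A minimal sketch of the pattern using the `pretend` library (the string task name here is a hypothetical stand-in; the real tests pass the actual task functions):

```python
import pretend

task_stub = pretend.stub(delay=pretend.call_recorder(lambda *a, **kw: None))
request = pretend.stub(task=pretend.call_recorder(lambda *a, **kw: task_stub))

def upload_view(request):
    # stand-in for the tail of file_upload: dispatch the archive sync task
    request.task("sync_file_to_archive").delay("file-id")

upload_view(request)
assert request.task.calls == [pretend.call("sync_file_to_archive")]
assert task_stub.delay.calls == [pretend.call("file-id")]
```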
diff --git a/tests/unit/packaging/test_init.py b/tests/unit/packaging/test_init.py
index d2861e8bef97..5d00a6aefe55 100644
--- a/tests/unit/packaging/test_init.py
+++ b/tests/unit/packaging/test_init.py
@@ -28,6 +28,7 @@
 from warehouse.packaging.models import File, Project, Release, Role
 from warehouse.packaging.services import project_service_factory
 from warehouse.packaging.tasks import (  # sync_bigquery_release_files,
+    check_file_archive_tasks_outstanding,
     compute_2fa_mandate,
     update_description_html,
 )
@@ -49,6 +50,7 @@ def key_factory(keystring, iterate_on=None, if_attr_exists=None):
     monkeypatch.setattr(packaging, "key_factory", key_factory)
     settings = {
         "files.backend": "foo.bar",
+        "archive_files.backend": "peas.carrots",
         "simple.backend": "bread.butter",
         "docs.backend": "wu.tang",
         "warehouse.packaging.project_create_user_ratelimit_string": "20 per hour",
@@ -73,7 +75,8 @@ def key_factory(keystring, iterate_on=None, if_attr_exists=None):
     packaging.includeme(config)
 
     assert config.register_service_factory.calls == [
-        pretend.call(storage_class.create_service, IFileStorage),
+        pretend.call(storage_class.create_service, IFileStorage, name="primary"),
+        pretend.call(storage_class.create_service, IFileStorage, name="archive"),
         pretend.call(storage_class.create_service, ISimpleStorage),
         pretend.call(storage_class.create_service, IDocsStorage),
         pretend.call(
@@ -169,6 +172,10 @@ def key_factory(keystring, iterate_on=None, if_attr_exists=None):
         in config.add_periodic_task.calls
     )
 
+    assert (
+        pretend.call(crontab(minute="*/1"), check_file_archive_tasks_outstanding)
+        in config.add_periodic_task.calls
+    )
     assert (
         pretend.call(crontab(minute="*/5"), update_description_html)
         in config.add_periodic_task.calls
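The registration asserts above depend on named service lookups: the same `IFileStorage` interface is registered twice, once per backend role. A toy sketch of that dispatch shape (not Pyramid's actual registry implementation, just the contract the tests assume):

```python
class ToyRegistry:
    def __init__(self):
        self.factories = {}

    def register_service_factory(self, factory, iface, name=""):
        self.factories[(iface, name)] = factory

    def find_service(self, iface, name=""):
        # lookup keyed on (interface, name), mirroring
        # request.find_service(IFileStorage, name="primary")
        return self.factories[(iface, name)]()

registry = ToyRegistry()
registry.register_service_factory(lambda: "primary storage", "IFileStorage", name="primary")
registry.register_service_factory(lambda: "archive storage", "IFileStorage", name="archive")
assert registry.find_service("IFileStorage", name="archive") == "archive storage"
```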
diff --git a/tests/unit/packaging/test_services.py b/tests/unit/packaging/test_services.py
index dc300ccde570..522cc5272cb9 100644
--- a/tests/unit/packaging/test_services.py
+++ b/tests/unit/packaging/test_services.py
@@ -13,6 +13,7 @@
 import io
 import os.path
 
+import b2sdk.v2
 import boto3.session
 import botocore.exceptions
 import pretend
@@ -24,12 +25,15 @@
 from warehouse.packaging.interfaces import IDocsStorage, IFileStorage, ISimpleStorage
 from warehouse.packaging.services import (
+    B2FileStorage,
     GCSFileStorage,
     GCSSimpleStorage,
     GenericLocalBlobStorage,
+    LocalArchiveFileStorage,
     LocalDocsStorage,
     LocalFileStorage,
     LocalSimpleStorage,
+    S3ArchiveFileStorage,
     S3DocsStorage,
     S3FileStorage,
     project_service_factory,
@@ -76,6 +80,22 @@ def test_stores_file(self, tmpdir):
         with open(os.path.join(storage_dir, "foo/bar.txt"), "rb") as fp:
             assert fp.read() == b"Test File!"
 
+    def test_stores_and_gets_metadata(self, tmpdir):
+        filename = str(tmpdir.join("testfile.txt"))
+        with open(filename, "wb") as fp:
+            fp.write(b"Test File!")
+
+        storage_dir = str(tmpdir.join("storage"))
+        storage = LocalFileStorage(storage_dir)
+        storage.store("foo/bar.txt", filename, meta={"foo": "bar", "wu": "tang"})
+
+        with open(os.path.join(storage_dir, "foo/bar.txt"), "rb") as fp:
+            assert fp.read() == b"Test File!"
+        with open(os.path.join(storage_dir, "foo/bar.txt.meta"), "rb") as fp:
+            assert fp.read() == b'{"foo": "bar", "wu": "tang"}'
+
+        assert storage.get_metadata("foo/bar.txt") == {"foo": "bar", "wu": "tang"}
+
     def test_stores_two_files(self, tmpdir):
         filename1 = str(tmpdir.join("testfile1.txt"))
         with open(filename1, "wb") as fp:
@@ -97,6 +117,18 @@ def test_stores_two_files(self, tmpdir):
             assert fp.read() == b"Second Test File!"
 
 
+class TestLocalArchiveFileStorage:
+    def test_verify_service(self):
+        assert verifyClass(IFileStorage, LocalArchiveFileStorage)
+
+    def test_create_service(self):
+        request = pretend.stub(
+            registry=pretend.stub(settings={"archive_files.path": "/the/one/two/"})
+        )
+        storage = LocalArchiveFileStorage.create_service(None, request)
+        assert storage.base == "/the/one/two/"
+
+
 class TestLocalDocsStorage:
     def test_verify_service(self):
         assert verifyClass(IDocsStorage, LocalDocsStorage)
@@ -202,6 +234,130 @@ def test_stores_two_files(self, tmpdir):
             assert fp.read() == b"Second Test File!"
 
 
+class TestB2FileStorage:
+    def test_verify_service(self):
+        assert verifyClass(IFileStorage, B2FileStorage)
+
+    def test_basic_init(self):
+        bucket = pretend.stub()
+        prefix = "segakcap"
+        storage = B2FileStorage(bucket, prefix=prefix)
+        assert storage.bucket is bucket
+        assert storage.prefix == "segakcap"
+
+    def test_create_service(self):
+        bucket_stub = pretend.stub()
+        mock_b2_api = pretend.stub(
+            get_bucket_by_name=pretend.call_recorder(lambda bucket_name: bucket_stub)
+        )
+
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: mock_b2_api),
+            registry=pretend.stub(settings={"files.bucket": "froblob"}),
+        )
+        storage = B2FileStorage.create_service(None, request)
+
+        assert request.find_service.calls == [pretend.call(name="b2.api")]
+        assert storage.bucket == bucket_stub
+        assert mock_b2_api.get_bucket_by_name.calls == [pretend.call("froblob")]
+
+    def test_gets_file(self):
+        bucket_stub = pretend.stub(
+            download_file_by_name=pretend.call_recorder(
+                lambda path: pretend.stub(
+                    save=lambda file_obj: file_obj.write(b"my contents")
+                )
+            )
+        )
+        mock_b2_api = pretend.stub(get_bucket_by_name=lambda bucket_name: bucket_stub)
+
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: mock_b2_api),
+            registry=pretend.stub(settings={"files.bucket": "froblob"}),
+        )
+        storage = B2FileStorage.create_service(None, request)
+
+        file_object = storage.get("file.txt")
+
+        assert file_object.read() == b"my contents"
+        assert bucket_stub.download_file_by_name.calls == [pretend.call("file.txt")]
+
+    def test_gets_metadata(self):
+        bucket_stub = pretend.stub(
+            get_file_info_by_name=pretend.call_recorder(
+                lambda path: pretend.stub(file_info={"foo": "bar", "wu": "tang"})
+            )
+        )
+        mock_b2_api = pretend.stub(get_bucket_by_name=lambda bucket_name: bucket_stub)
+
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: mock_b2_api),
+            registry=pretend.stub(settings={"files.bucket": "froblob"}),
+        )
+        storage = B2FileStorage.create_service(None, request)
+
+        metadata = storage.get_metadata("file.txt")
+
+        assert metadata == {"foo": "bar", "wu": "tang"}
+        assert bucket_stub.get_file_info_by_name.calls == [pretend.call("file.txt")]
+
+    def test_raises_when_key_non_existent(self):
+        def raiser(path):
+            raise b2sdk.v2.exception.FileNotPresent()
+
+        bucket_stub = pretend.stub(download_file_by_name=raiser)
+        mock_b2_api = pretend.stub(get_bucket_by_name=lambda bucket_name: bucket_stub)
+
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: mock_b2_api),
+            registry=pretend.stub(settings={"files.bucket": "froblob"}),
+        )
+        storage = B2FileStorage.create_service(None, request)
+
+        with pytest.raises(FileNotFoundError):
+            storage.get("file.txt")
+
+    def test_get_metadata_raises_when_key_non_existent(self):
+        def raiser(path):
+            raise b2sdk.v2.exception.FileNotPresent()
+
+        bucket_stub = pretend.stub(get_file_info_by_name=raiser)
+        mock_b2_api = pretend.stub(get_bucket_by_name=lambda bucket_name: bucket_stub)
+
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: mock_b2_api),
+            registry=pretend.stub(settings={"files.bucket": "froblob"}),
+        )
+        storage = B2FileStorage.create_service(None, request)
+
+        with pytest.raises(FileNotFoundError):
+            storage.get_metadata("file.txt")
+
+    def test_stores_file(self, tmpdir):
+        filename = str(tmpdir.join("testfile.txt"))
+        with open(filename, "wb") as fp:
+            fp.write(b"Test File!")
+
+        bucket_stub = pretend.stub(
+            upload_local_file=pretend.call_recorder(
+                lambda local_file=None, file_name=None, file_infos=None: None
+            )
+        )
+        mock_b2_api = pretend.stub(get_bucket_by_name=lambda bucket_name: bucket_stub)
+
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: mock_b2_api),
+            registry=pretend.stub(settings={"files.bucket": "froblob"}),
+        )
+        storage = B2FileStorage.create_service(None, request)
+
+        storage.store("foo/bar.txt", filename)
+
+        assert bucket_stub.upload_local_file.calls == [
+            pretend.call(local_file=filename, file_name="foo/bar.txt", file_infos=None)
+        ]
+
+
 class TestS3FileStorage:
     def test_verify_service(self):
         assert verifyClass(IFileStorage, S3FileStorage)
@@ -234,6 +390,16 @@ def test_gets_file(self):
         assert file_object.read() == b"my contents"
         assert bucket.Object.calls == [pretend.call("file.txt")]
 
+    def test_gets_metadata(self):
+        s3key = pretend.stub(metadata={"foo": "bar", "wu": "tang"})
+        bucket = pretend.stub(Object=pretend.call_recorder(lambda path: s3key))
+        storage = S3FileStorage(bucket)
+
+        metadata = storage.get_metadata("file.txt")
+
+        assert metadata == {"foo": "bar", "wu": "tang"}
+        assert bucket.Object.calls == [pretend.call("file.txt")]
+
     def test_raises_when_key_non_existent(self):
         def raiser():
             raise botocore.exceptions.ClientError(
@@ -249,6 +415,18 @@ def raiser():
 
         assert bucket.Object.calls == [pretend.call("file.txt")]
 
+    def test_get_metadata_raises_when_key_non_existent(self):
+        def raiser(*a, **kw):
+            raise botocore.exceptions.ClientError(
+                {"Error": {"Code": "NoSuchKey", "Message": "No Key!"}}, "some operation"
+            )
+
+        bucket = pretend.stub(Object=raiser)
+        storage = S3FileStorage(bucket)
+
+        with pytest.raises(FileNotFoundError):
+            storage.get_metadata("file.txt")
+
     def test_passes_up_error_when_not_no_such_key(self):
         def raiser():
             raise botocore.exceptions.ClientError(
@@ -263,6 +441,19 @@ def raiser():
         with pytest.raises(botocore.exceptions.ClientError):
             storage.get("file.txt")
 
+    def test_get_metadata_passes_up_error_when_not_no_such_key(self):
+        def raiser(*a, **kw):
+            raise botocore.exceptions.ClientError(
+                {"Error": {"Code": "SomeOtherError", "Message": "Who Knows!"}},
+                "some operation",
+            )
+
+        bucket = pretend.stub(Object=raiser)
+        storage = S3FileStorage(bucket)
+
+        with pytest.raises(botocore.exceptions.ClientError):
+            storage.get_metadata("file.txt")
+
     def test_stores_file(self, tmpdir):
         filename = str(tmpdir.join("testfile.txt"))
         with open(filename, "wb") as fp:
@@ -337,6 +528,24 @@ def test_hashed_path_without_prefix(self):
 
         assert bucket.Object.calls == [pretend.call("ab/file.txt")]
 
 
+class TestS3ArchiveFileStorage:
+    def test_verify_service(self):
+        assert verifyClass(IFileStorage, S3ArchiveFileStorage)
+
+    def test_create_service(self):
+        session = boto3.session.Session(
+            aws_access_key_id="foo", aws_secret_access_key="bar"
+        )
+        request = pretend.stub(
+            find_service=pretend.call_recorder(lambda name: session),
+            registry=pretend.stub(settings={"archive_files.bucket": "froblob"}),
+        )
+        storage = S3ArchiveFileStorage.create_service(None, request)
+
+        assert request.find_service.calls == [pretend.call(name="aws.session")]
+        assert storage.bucket.name == "froblob"
+
+
 class TestGCSFileStorage:
     def test_verify_service(self):
         assert verifyClass(IFileStorage, GCSFileStorage)
@@ -365,6 +574,12 @@ def test_gets_file_raises(self):
         with pytest.raises(NotImplementedError):
             storage.get("file.txt")
 
+    def test_get_metadata_raises(self):
+        storage = GCSFileStorage(pretend.stub())
+
+        with pytest.raises(NotImplementedError):
+            storage.get_metadata("file.txt")
+
     def test_stores_file(self, tmpdir):
         filename = str(tmpdir.join("testfile.txt"))
         with open(filename, "wb") as fp:
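The local-storage tests above pin down a simple convention that the `LocalFileStorage` changes later in this diff implement: metadata rides along as a JSON sidecar file at `<path>.meta`. A self-contained sketch of that behavior (class name is hypothetical; it mirrors, rather than reproduces, the real `GenericLocalBlobStorage`):

```python
import json
import os
import shutil
import tempfile

class LocalBlobSketch:
    def __init__(self, base):
        self.base = base

    def store(self, path, file_path, *, meta=None):
        destination = os.path.join(self.base, path)
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        shutil.copyfile(file_path, destination)
        if meta is not None:
            # metadata lives next to the blob as a JSON sidecar
            with open(destination + ".meta", "w") as fp:
                json.dump(meta, fp)

    def get_metadata(self, path):
        with open(os.path.join(self.base, path) + ".meta") as fp:
            return json.load(fp)

with tempfile.TemporaryDirectory() as tmp:
    src = os.path.join(tmp, "src.txt")
    with open(src, "w") as fp:
        fp.write("Test File!")
    storage = LocalBlobSketch(os.path.join(tmp, "store"))
    storage.store("foo/bar.txt", src, meta={"wu": "tang"})
    assert storage.get_metadata("foo/bar.txt") == {"wu": "tang"}
```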
diff --git a/tests/unit/packaging/test_tasks.py b/tests/unit/packaging/test_tasks.py
index 365ebf554577..58850369c84a 100644
--- a/tests/unit/packaging/test_tasks.py
+++ b/tests/unit/packaging/test_tasks.py
@@ -10,6 +10,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
+
+from contextlib import contextmanager
 from itertools import product
 
 import pretend
@@ -23,9 +26,11 @@
 from warehouse.accounts.models import WebAuthn
 from warehouse.packaging.models import Description
 from warehouse.packaging.tasks import (
+    check_file_archive_tasks_outstanding,
     compute_2fa_mandate,
     compute_2fa_metrics,
     sync_bigquery_release_files,
+    sync_file_to_archive,
     update_bigquery_release_files,
     update_description_html,
 )
@@ -43,6 +48,61 @@
 )
 
 
+@pytest.mark.parametrize("archived", [True, False])
+def test_sync_file_to_archive(db_request, monkeypatch, archived):
+    file = FileFactory(archived=archived)
+    primary_stub = pretend.stub(
+        get_metadata=pretend.call_recorder(lambda path: {"fizz": "buzz"}),
+        get=pretend.call_recorder(
+            lambda path: pretend.stub(read=lambda: b"my content")
+        ),
+    )
+    archive_stub = pretend.stub(
+        store=pretend.call_recorder(lambda filename, path, meta=None: None)
+    )
+    db_request.find_service = pretend.call_recorder(
+        lambda iface, name=None: {"primary": primary_stub, "archive": archive_stub}[
+            name
+        ]
+    )
+
+    @contextmanager
+    def mock_named_temporary_file():
+        yield pretend.stub(
+            name="/tmp/wutang",
+            write=lambda bites: None,
+            flush=lambda: None,
+        )
+
+    monkeypatch.setattr(tempfile, "NamedTemporaryFile", mock_named_temporary_file)
+
+    sync_file_to_archive(db_request, file.id)
+
+    assert file.archived
+
+    if not archived:
+        assert primary_stub.get_metadata.calls == [pretend.call(file.path)]
+        assert primary_stub.get.calls == [pretend.call(file.path)]
+        assert archive_stub.store.calls == [
+            pretend.call(file.path, "/tmp/wutang", meta={"fizz": "buzz"})
+        ]
+    else:
+        assert primary_stub.get_metadata.calls == []
+        assert primary_stub.get.calls == []
+        assert archive_stub.store.calls == []
+
+
+def test_check_file_archive_tasks_outstanding(db_request, metrics):
+    [FileFactory(archived=True) for _ in range(12)]
+    [FileFactory(archived=False) for _ in range(3)]
+
+    check_file_archive_tasks_outstanding(db_request)
+
+    assert metrics.gauge.calls == [
+        pretend.call("warehouse.packaging.files.not_archived", 3)
+    ]
+
+
 def test_update_description_html(monkeypatch, db_request):
     current_version = "24.0"
     previous_version = "23.0"
diff --git a/tests/unit/test_b2.py b/tests/unit/test_b2.py
new file mode 100644
index 000000000000..7c3b829e25ce
--- /dev/null
+++ b/tests/unit/test_b2.py
@@ -0,0 +1,50 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import b2sdk.v2
+import pretend
+
+from warehouse import b2
+
+
+def test_b2_api_factory(monkeypatch):
+    mock_in_memory_account_info = pretend.call_recorder(lambda: "InMemoryAccountInfo")
+    monkeypatch.setattr(b2sdk.v2, "InMemoryAccountInfo", mock_in_memory_account_info)
+    mock_b2_api = pretend.stub(
+        authorize_account=pretend.call_recorder(lambda mode, key_id, key: None)
+    )
+    mock_b2_api_class = pretend.call_recorder(lambda account_info: mock_b2_api)
+    monkeypatch.setattr(b2sdk.v2, "B2Api", mock_b2_api_class)
+
+    request = pretend.stub(
+        registry=pretend.stub(
+            settings={"b2.application_key_id": "key_id", "b2.application_key": "key"}
+        )
+    )
+
+    assert b2.b2_api_factory(None, request) is mock_b2_api
+    assert mock_b2_api_class.calls == [pretend.call("InMemoryAccountInfo")]
+    assert mock_b2_api.authorize_account.calls == [
+        pretend.call("production", "key_id", "key")
+    ]
+
+
+def test_includeme():
+    config = pretend.stub(
+        register_service_factory=pretend.call_recorder(lambda factory, name: None)
+    )
+
+    b2.includeme(config)
+
+    assert config.register_service_factory.calls == [
+        pretend.call(b2.b2_api_factory, name="b2.api")
+    ]
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 4eb788f00e9c..aba882805015 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -348,6 +348,7 @@ def __init__(self):
         pretend.call(".policy"),
         pretend.call(".search"),
         pretend.call(".aws"),
+        pretend.call(".b2"),
         pretend.call(".gcloud"),
         pretend.call(".sessions"),
         pretend.call(".cache.http"),
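For context on the `warehouse/b2.py` factory that follows: the b2sdk v2 flow is account-info object, then `B2Api`, then `authorize_account`, then bucket lookup. A hedged usage sketch with placeholder credentials and bucket name (nothing here comes from the project's configuration):

```python
import b2sdk.v2

account_info = b2sdk.v2.InMemoryAccountInfo()  # credentials held only in memory
b2_api = b2sdk.v2.B2Api(account_info)
b2_api.authorize_account("production", "<application_key_id>", "<application_key>")
bucket = b2_api.get_bucket_by_name("<bucket-name>")  # placeholder bucket name
```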
diff --git a/warehouse/b2.py b/warehouse/b2.py
new file mode 100644
index 000000000000..3b6da1271249
--- /dev/null
+++ b/warehouse/b2.py
@@ -0,0 +1,27 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import b2sdk.v2
+
+
+def b2_api_factory(context, request):
+    b2_api = b2sdk.v2.B2Api(b2sdk.v2.InMemoryAccountInfo())
+    b2_api.authorize_account(
+        "production",
+        request.registry.settings["b2.application_key_id"],
+        request.registry.settings["b2.application_key"],
+    )
+    return b2_api
+
+
+def includeme(config):
+    config.register_service_factory(b2_api_factory, name="b2.api")
diff --git a/warehouse/config.py b/warehouse/config.py
index e302a8a62c7c..6707138d43eb 100644
--- a/warehouse/config.py
+++ b/warehouse/config.py
@@ -162,6 +162,8 @@ def configure(settings=None):
     maybe_set(settings, "aws.key_id", "AWS_ACCESS_KEY_ID")
     maybe_set(settings, "aws.secret_key", "AWS_SECRET_ACCESS_KEY")
     maybe_set(settings, "aws.region", "AWS_REGION")
+    maybe_set(settings, "b2.application_key_id", "B2_APPLICATION_KEY_ID")
+    maybe_set(settings, "b2.application_key", "B2_APPLICATION_KEY")
     maybe_set(settings, "gcloud.credentials", "GCLOUD_CREDENTIALS")
     maybe_set(settings, "gcloud.project", "GCLOUD_PROJECT")
     maybe_set(
@@ -231,6 +233,7 @@ def configure(settings=None):
     )
     maybe_set_compound(settings, "billing", "backend", "BILLING_BACKEND")
     maybe_set_compound(settings, "files", "backend", "FILES_BACKEND")
+    maybe_set_compound(settings, "archive_files", "backend", "ARCHIVE_FILES_BACKEND")
     maybe_set_compound(settings, "simple", "backend", "SIMPLE_BACKEND")
     maybe_set_compound(settings, "docs", "backend", "DOCS_BACKEND")
     maybe_set_compound(settings, "sponsorlogos", "backend", "SPONSORLOGOS_BACKEND")
@@ -554,8 +557,9 @@ def configure(settings=None):
 
     config.include(".search")
 
-    # Register the support for AWS and Google Cloud
+    # Register the support for AWS, Backblaze, and Google Cloud
     config.include(".aws")
+    config.include(".b2")
     config.include(".gcloud")
 
     # Register our session support
diff --git a/warehouse/forklift/legacy.py b/warehouse/forklift/legacy.py
index 6fd0c321de5b..a74adbc49862 100644
--- a/warehouse/forklift/legacy.py
+++ b/warehouse/forklift/legacy.py
@@ -61,7 +61,10 @@
     Project,
     Release,
 )
-from warehouse.packaging.tasks import update_bigquery_release_files
+from warehouse.packaging.tasks import (
+    sync_file_to_archive,
+    update_bigquery_release_files,
+)
 from warehouse.rate_limiting.interfaces import RateLimiterException
 from warehouse.utils import http, readme
 from warehouse.utils.project import PROJECT_NAME_RE, validate_project_name
@@ -613,6 +616,40 @@ def full_validate(self):
     )
 
 
+def _validate_filename(filename):
+    # Our object storage does not tolerate some specific characters
+    # ref: https://www.backblaze.com/b2/docs/files.html#file-names
+    #
+    # Also, it's hard to imagine a use case for them that isn't ✨malicious✨
+    # or completely by mistake.
+    disallowed = [*(chr(x) for x in range(32)), chr(127)]
+    if [char for char in filename if char in disallowed]:
+        raise _exc_with_message(
+            HTTPBadRequest,
+            (
+                "Cannot upload a file with "
+                "non-printable characters (ordinals 0-31) "
+                "or the DEL character (ordinal 127) "
+                "in the name."
+            ),
+        )
+
+    # Make sure that the filename does not contain any path separators.
+    if "/" in filename or "\\" in filename:
+        raise _exc_with_message(
+            HTTPBadRequest, "Cannot upload a file with '/' or '\\' in the name."
+        )
+
+    # Make sure the filename ends with an allowed extension.
+    if _dist_file_re.search(filename) is None:
+        raise _exc_with_message(
+            HTTPBadRequest,
+            "Invalid file extension: Use .egg, .tar.gz, .whl or .zip "
+            "extension. See https://www.python.org/dev/peps/pep-0527 "
+            "for more information.",
+        )
+
+
 _safe_zipnames = re.compile(r"(purelib|platlib|headers|scripts|data).+", re.I)
 # .tar uncompressed, .tar.gz .tgz, .tar.bz2 .tbz2
 _tar_filenames_re = re.compile(r"\.(?:tar$|t(?:ar\.)?(?P<z_type>gz|bz2)$)")
@@ -1133,20 +1170,8 @@ def file_upload(request):
     # Pull the filename out of our POST data.
     filename = request.POST["content"].filename
 
-    # Make sure that the filename does not contain any path separators.
-    if "/" in filename or "\\" in filename:
-        raise _exc_with_message(
-            HTTPBadRequest, "Cannot upload a file with '/' or '\\' in the name."
-        )
-
-    # Make sure the filename ends with an allowed extension.
-    if _dist_file_re.search(filename) is None:
-        raise _exc_with_message(
-            HTTPBadRequest,
-            "Invalid file extension: Use .egg, .tar.gz, .whl or .zip "
-            "extension. See https://www.python.org/dev/peps/pep-0527 "
-            "for more information.",
-        )
+    # Ensure the filename doesn't contain any characters that are too 🌶️spicy🥵
+    _validate_filename(filename)
 
     # Make sure that our filename matches the project that it is being uploaded
     # to.
@@ -1383,7 +1408,7 @@ def file_upload(request):
         # this won't take affect until after a commit has happened, for
         # now we'll just ignore it and save it before the transaction is
         # committed.
-        storage = request.find_service(IFileStorage)
+        storage = request.find_service(IFileStorage, name="primary")
         storage.store(
             file_.path,
             os.path.join(tmpdir, filename),
@@ -1457,6 +1482,9 @@ def file_upload(request):
     # Log a successful upload
     metrics.increment("warehouse.upload.ok", tags=[f"filetype:{form.filetype.data}"])
 
+    # Dispatch our archive task to sync this as soon as possible
+    request.task(sync_file_to_archive).delay(file_.id)
+
     return Response()
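`_validate_filename` rejects control characters before the existing separator and extension checks run. The character rule reduces to a small predicate, sketched here stand-alone (the function name is hypothetical):

```python
DISALLOWED = {*(chr(x) for x in range(32)), chr(127)}  # C0 controls plus DEL

def filename_has_disallowed_characters(filename: str) -> bool:
    return any(char in DISALLOWED for char in filename)

assert filename_has_disallowed_characters("proj\x00-1.0.tar.gz")
assert filename_has_disallowed_characters("proj\x7f-1.0.tar.gz")
assert not filename_has_disallowed_characters("proj-1.0.tar.gz")
```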
+""" +add archived column to files + +Revision ID: d142f435bb39 +Revises: 665b6f8fd9ac +Create Date: 2023-04-11 10:11:22.602965 +""" + +import sqlalchemy as sa + +from alembic import op + +revision = "d142f435bb39" +down_revision = "665b6f8fd9ac" + + +def upgrade(): + op.add_column( + "release_files", + sa.Column( + "archived", + sa.Boolean(), + server_default=sa.text("false"), + nullable=False, + comment="If True, the object has been archived to our archival bucket.", + ), + ) + op.create_index( + "release_files_archived_idx", "release_files", ["archived"], unique=False + ) + + +def downgrade(): + op.drop_index("release_files_archived_idx", table_name="release_files") + op.drop_column("release_files", "archived") diff --git a/warehouse/packaging/__init__.py b/warehouse/packaging/__init__.py index ad323e50889e..ec39b9c5fe27 100644 --- a/warehouse/packaging/__init__.py +++ b/warehouse/packaging/__init__.py @@ -27,6 +27,7 @@ from warehouse.packaging.models import File, Project, Release, Role from warehouse.packaging.services import project_service_factory from warehouse.packaging.tasks import ( + check_file_archive_tasks_outstanding, compute_2fa_mandate, compute_2fa_metrics, update_description_html, @@ -62,7 +63,16 @@ def includeme(config): # Register whatever file storage backend has been configured for storing # our package files. files_storage_class = config.maybe_dotted(config.registry.settings["files.backend"]) - config.register_service_factory(files_storage_class.create_service, IFileStorage) + config.register_service_factory( + files_storage_class.create_service, IFileStorage, name="primary" + ) + + archive_files_storage_class = config.maybe_dotted( + config.registry.settings["archive_files.backend"] + ) + config.register_service_factory( + archive_files_storage_class.create_service, IFileStorage, name="archive" + ) simple_storage_class = config.maybe_dotted( config.registry.settings["simple.backend"] @@ -165,6 +175,10 @@ def includeme(config): ], ) + config.add_periodic_task( + crontab(minute="*/1"), check_file_archive_tasks_outstanding + ) + config.add_periodic_task(crontab(minute="*/5"), update_description_html) config.add_periodic_task(crontab(minute="*/5"), update_role_invitation_status) diff --git a/warehouse/packaging/interfaces.py b/warehouse/packaging/interfaces.py index b8d87d092618..9b611ed64e8c 100644 --- a/warehouse/packaging/interfaces.py +++ b/warehouse/packaging/interfaces.py @@ -32,6 +32,13 @@ def get(path): at the given path. """ + def get_metadata(path): + """ + Return a dictionary containing any user-created metadata associated + with the file at a given path. Implementations may or may not store + or provide such metadata. + """ + def store(path, file_path, *, meta=None): """ Save the file located at file_path to the file storage at the location diff --git a/warehouse/packaging/models.py b/warehouse/packaging/models.py index 455a53b27d53..8c0ab6036075 100644 --- a/warehouse/packaging/models.py +++ b/warehouse/packaging/models.py @@ -647,6 +647,7 @@ def __table_args__(cls): # noqa ), ), Index("release_files_release_id_idx", "release_id"), + Index("release_files_archived_idx", "archived"), ) release_id = Column( @@ -683,6 +684,13 @@ def __table_args__(cls): # noqa # of all of them and then remove this column. 
diff --git a/warehouse/packaging/models.py b/warehouse/packaging/models.py
index 455a53b27d53..8c0ab6036075 100644
--- a/warehouse/packaging/models.py
+++ b/warehouse/packaging/models.py
@@ -647,6 +647,7 @@ def __table_args__(cls):  # noqa
                 ),
             ),
             Index("release_files_release_id_idx", "release_id"),
+            Index("release_files_archived_idx", "archived"),
         )
 
     release_id = Column(
@@ -683,6 +684,13 @@ def __table_args__(cls):  # noqa
     # of all of them and then remove this column.
     allow_multiple_sdist = Column(Boolean, nullable=False, server_default=sql.false())
 
+    archived = Column(
+        Boolean,
+        comment="If True, the object has been archived to our archival bucket.",
+        nullable=False,
+        server_default=sql.false(),
+    )
+
     @hybrid_property
     def pgp_path(self):
         return self.path + ".asc"
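The new `archived` column defaults to false and gets its own index because the hot query is a count over un-archived rows (see `check_file_archive_tasks_outstanding` at the end of this diff). A minimal, self-contained sketch of that query against a simplified stand-in model (names here are hypothetical):

```python
from sqlalchemy import Boolean, Column, Integer, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class FileSketch(Base):  # simplified stand-in for the release_files model
    __tablename__ = "release_files"
    id = Column(Integer, primary_key=True)
    # the real migration uses a server-side default; a Python-side default
    # keeps this sketch portable
    archived = Column(Boolean, nullable=False, default=False)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add_all([FileSketch(archived=True), FileSketch(archived=False)])
    session.commit()
    outstanding = (
        session.query(FileSketch).filter(FileSketch.archived == False).count()  # noqa: E712
    )
    assert outstanding == 1
```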
diff --git a/warehouse/packaging/services.py b/warehouse/packaging/services.py
index 8ccb0e695bee..a19e60c126c5 100644
--- a/warehouse/packaging/services.py
+++ b/warehouse/packaging/services.py
@@ -11,11 +11,14 @@
 # limitations under the License.
 
 import collections
+import io
+import json
 import logging
 import os.path
 import shutil
 import warnings
 
+import b2sdk
 import botocore.exceptions
 import google.api_core.exceptions
 import google.api_core.retry
@@ -64,12 +67,18 @@ def create_service(cls, context, request):
     def get(self, path):
         return open(os.path.join(self.base, path), "rb")
 
+    def get_metadata(self, path):
+        return json.loads(open(os.path.join(self.base, path + ".meta")).read())
+
     def store(self, path, file_path, *, meta=None):
         destination = os.path.join(self.base, path)
         os.makedirs(os.path.dirname(destination), exist_ok=True)
         with open(destination, "wb") as dest_fp:
             with open(file_path, "rb") as src_fp:
                 dest_fp.write(src_fp.read())
+        if meta is not None:
+            with open(destination + ".meta", "w") as dest_fp:
+                dest_fp.write(json.dumps(meta))
 
 
 @implementer(IFileStorage)
@@ -79,6 +88,13 @@ def create_service(cls, context, request):
         return cls(request.registry.settings["files.path"])
 
 
+@implementer(IFileStorage)
+class LocalArchiveFileStorage(GenericLocalBlobStorage):
+    @classmethod
+    def create_service(cls, context, request):
+        return cls(request.registry.settings["archive_files.path"])
+
+
 @implementer(ISimpleStorage)
 class LocalSimpleStorage(GenericLocalBlobStorage):
     @classmethod
@@ -129,9 +145,47 @@ def _get_path(self, path):
         return path
 
 
+class GenericB2BlobStorage(GenericBlobStorage):
+    def get(self, path):
+        path = self._get_path(path)
+        try:
+            file_obj = io.BytesIO()
+            downloaded_file = self.bucket.download_file_by_name(path)
+            downloaded_file.save(file_obj)
+            file_obj.seek(0)
+            return file_obj
+        except b2sdk.v2.exception.FileNotPresent:
+            raise FileNotFoundError(f"No such key: {path!r}") from None
+
+    def get_metadata(self, path):
+        path = self._get_path(path)
+        try:
+            return self.bucket.get_file_info_by_name(path).file_info
+        except b2sdk.v2.exception.FileNotPresent:
+            raise FileNotFoundError(f"No such key: {path!r}") from None
+
+    def store(self, path, file_path, *, meta=None):
+        path = self._get_path(path)
+        self.bucket.upload_local_file(
+            local_file=file_path,
+            file_name=path,
+            file_infos=meta,
+        )
+
+
+@implementer(IFileStorage)
+class B2FileStorage(GenericB2BlobStorage):
+    @classmethod
+    def create_service(cls, context, request):
+        b2_api = request.find_service(name="b2.api")
+        bucket = b2_api.get_bucket_by_name(request.registry.settings["files.bucket"])
+        prefix = request.registry.settings.get("files.prefix")
+        return cls(bucket, prefix=prefix)
+
+
 class GenericS3BlobStorage(GenericBlobStorage):
     def get(self, path):
-        # Note: this is not actually used in production, instead our CDN is
+        # Note: this is not actually used to serve files, instead our CDN is
         # configured to connect directly to our storage bucket. See:
         # https://github.com/python/pypi-infra/blob/master/terraform/file-hosting/vcl/main.vcl
         try:
@@ -141,6 +195,14 @@ def get(self, path):
                 raise
             raise FileNotFoundError(f"No such key: {path!r}") from None
 
+    def get_metadata(self, path):
+        try:
+            return self.bucket.Object(self._get_path(path)).metadata
+        except botocore.exceptions.ClientError as exc:
+            if exc.response["Error"]["Code"] != "NoSuchKey":
+                raise
+            raise FileNotFoundError(f"No such key: {path!r}") from None
+
     def store(self, path, file_path, *, meta=None):
         extra_args = {}
         if meta is not None:
@@ -162,6 +224,17 @@ def create_service(cls, context, request):
         return cls(bucket, prefix=prefix)
 
 
+@implementer(IFileStorage)
+class S3ArchiveFileStorage(GenericS3BlobStorage):
+    @classmethod
+    def create_service(cls, context, request):
+        session = request.find_service(name="aws.session")
+        s3 = session.resource("s3")
+        bucket = s3.Bucket(request.registry.settings["archive_files.bucket"])
+        prefix = request.registry.settings.get("archive_files.prefix")
+        return cls(bucket, prefix=prefix)
+
+
 @implementer(IDocsStorage)
 class S3DocsStorage:
     def __init__(self, s3_client, bucket_name, *, prefix=None):
@@ -197,11 +270,14 @@ def remove_by_prefix(self, prefix):
 
 class GenericGCSBlobStorage(GenericBlobStorage):
     def get(self, path):
-        # Note: this is not actually used in production, instead our CDN is
+        # Note: this is not actually used to serve files, instead our CDN is
         # configured to connect directly to our storage bucket. See:
         # https://github.com/python/pypi-infra/blob/master/terraform/file-hosting/vcl/main.vcl
         raise NotImplementedError
 
+    def get_metadata(self, path):
+        raise NotImplementedError
+
     @google.api_core.retry.Retry(
         predicate=google.api_core.retry.if_exception_type(
             google.api_core.exceptions.ServiceUnavailable
diff --git a/warehouse/packaging/tasks.py b/warehouse/packaging/tasks.py
index 680674b31f90..c66d63203303 100644
--- a/warehouse/packaging/tasks.py
+++ b/warehouse/packaging/tasks.py
@@ -11,6 +11,7 @@
 # limitations under the License.
 
 import datetime
+import tempfile
 
 from itertools import product
 
@@ -23,10 +24,40 @@
 from warehouse.accounts.models import User, WebAuthn
 from warehouse.email import send_two_factor_mandate_email
 from warehouse.metrics import IMetricsService
+from warehouse.packaging.interfaces import IFileStorage
 from warehouse.packaging.models import Description, File, Project, Release, Role
 from warehouse.utils import readme
 
 
+@tasks.task(ignore_result=True, acks_late=True)
+def sync_file_to_archive(request, file_id):
+    file = request.db.query(File).get(file_id)
+    if not file.archived:
+        primary_storage = request.find_service(IFileStorage, name="primary")
+        archive_storage = request.find_service(IFileStorage, name="archive")
+        metadata = primary_storage.get_metadata(file.path)
+        file_obj = primary_storage.get(file.path)
+        with tempfile.NamedTemporaryFile() as file_for_archive:
+            file_for_archive.write(file_obj.read())
+            file_for_archive.flush()
+            archive_storage.store(file.path, file_for_archive.name, meta=metadata)
+        file.archived = True
+
+
+@tasks.task(ignore_result=True, acks_late=True)
+def check_file_archive_tasks_outstanding(request):
+    metrics = request.find_service(IMetricsService, context=None)
+
+    files_not_archived = (
+        request.db.query(File).filter(File.archived == False).count()  # noqa: E712
+    )
+
+    metrics.gauge(
+        "warehouse.packaging.files.not_archived",
+        files_not_archived,
+    )
+
+
 @tasks.task(ignore_result=True, acks_late=True)
 def compute_2fa_mandate(request):
     # Get our own production dependencies
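Taken together: the upload path dispatches `sync_file_to_archive`, which copies blob plus metadata from the primary store to the archive store and flips the row's flag, while the per-minute gauge reports the remaining backlog. A condensed sketch of that copy step, with stand-in storage objects and a hypothetical function name:

```python
import tempfile

def archive_once(primary, archive, file):
    # primary/archive are IFileStorage-like stand-ins; file is a release_files row
    if file.archived:
        return
    metadata = primary.get_metadata(file.path)
    blob = primary.get(file.path)
    with tempfile.NamedTemporaryFile() as spool:
        # the storage API stores from a filesystem path, so spool to disk first
        spool.write(blob.read())
        spool.flush()
        archive.store(file.path, spool.name, meta=metadata)
    file.archived = True
```

Because the Celery task is declared with `acks_late=True` and re-checks `file.archived` before doing any work, a redelivered or duplicated message results in a cheap no-op rather than a second upload.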