From 17f3703bfb95aaa31bedf876ad2a98a204d53a05 Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Mon, 23 Jun 2025 19:35:20 -0400 Subject: [PATCH 1/4] Cache final file from resume retry process --- src/pip/_internal/network/download.py | 66 ++++++++++++++++++++++++++- tests/unit/test_network_download.py | 61 ++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 1a00a5f246e..00017bfeda0 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -11,7 +11,10 @@ from http import HTTPStatus from typing import BinaryIO +from pip._vendor.requests import PreparedRequest from pip._vendor.requests.models import Response +from pip._vendor.urllib3 import HTTPResponse as URLlib3Response +from pip._vendor.urllib3._collections import HTTPHeaderDict from pip._vendor.urllib3.exceptions import ReadTimeoutError from pip._internal.cli.progress_bars import get_download_progress_renderer @@ -19,7 +22,7 @@ from pip._internal.models.index import PyPI from pip._internal.models.link import Link from pip._internal.network.cache import is_from_cache -from pip._internal.network.session import PipSession +from pip._internal.network.session import CacheControlAdapter, PipSession from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext @@ -250,6 +253,67 @@ def _attempt_resumes_or_redownloads( os.remove(download.output_file.name) raise IncompleteDownloadError(download) + # If we successfully completed the download via resume, manually cache it + # as a complete response to enable future caching + if download.reattempts > 0: + self._cache_resumed_download(download, first_resp) + + def _cache_resumed_download( + self, download: _FileDownload, original_response: Response + ) -> None: + """ + Manually cache a file that was successfully downloaded via resume retries. + + cachecontrol doesn't cache 206 (Partial Content) responses, since they + are not complete files. This method manually adds the final file to the + cache as though it was downloaded in a single request, so that future + requests can use the cache. + """ + url = download.link.url_without_fragment + if url.startswith("https://"): + adapter = self._session.adapters["https://"] + elif url.startswith("http://"): + adapter = self._session.adapters["http://"] + else: + return + + # Check if the adapter is the CacheControlAdapter (i.e. caching is enabled) + if not isinstance(adapter, CacheControlAdapter): + logger.debug( + "Skipping resume download caching: no cache controller for %s", url + ) + return + + synthetic_request = PreparedRequest() + synthetic_request.prepare(method="GET", url=url, headers={}) + + synthetic_response_headers = HTTPHeaderDict() + for key, value in original_response.headers.items(): + if key.lower() not in ["content-range", "content-length"]: + synthetic_response_headers[key] = value + synthetic_response_headers["content-length"] = str(download.size) + + synthetic_response = URLlib3Response( + body="", + headers=synthetic_response_headers, + status=200, + preload_content=False, + ) + + # Use the cache controller to store this as a complete response + download.output_file.flush() + with open(download.output_file.name, "rb") as f: + adapter.controller.cache_response( + synthetic_request, + synthetic_response, + body=f.read(), + status_codes=(200, 203, 300, 301, 308), + ) + + logger.debug( + "Cached resumed download as complete response for future use: %s", url + ) + def _http_get_resume( self, download: _FileDownload, should_match: Response ) -> Response: diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py index 1aa8c9267fa..542cb46c3b5 100644 --- a/tests/unit/test_network_download.py +++ b/tests/unit/test_network_download.py @@ -16,7 +16,7 @@ parse_content_disposition, sanitize_content_filename, ) -from pip._internal.network.session import PipSession +from pip._internal.network.session import CacheControlAdapter, PipSession from pip._internal.network.utils import HEADERS from tests.lib.requests_mocks import MockResponse @@ -350,3 +350,62 @@ def test_downloader( # Make sure that the downloader makes additional requests for resumption _http_get_mock.assert_has_calls(calls) + + +def test_resumed_download_caching(tmpdir: Path) -> None: + """Test that resumed downloads are cached properly for future use.""" + session = PipSession() + link = Link("http://example.com/foo.tgz") + downloader = Downloader(session, "on", resume_retries=5) + + # Mock an incomplete download followed by a successful resume + incomplete_resp = MockResponse(b"0cfa7e9d-1868-4dd7-9fb3-") + incomplete_resp.headers = {"content-length": "36"} + incomplete_resp.status_code = 200 + + resume_resp = MockResponse(b"f2561d5dfd89") + resume_resp.headers = {"content-length": "12"} + resume_resp.status_code = 206 + + responses = [incomplete_resp, resume_resp] + _http_get_mock = MagicMock(side_effect=responses) + + # Mock the session's adapters to have a cache controller + mock_adapter = MagicMock(spec=CacheControlAdapter) + mock_controller = MagicMock() + mock_adapter.controller = mock_controller + + # Create a mock for the session adapters + adapters_mock = MagicMock() + adapters_mock.__getitem__ = MagicMock(return_value=mock_adapter) + + with ( + patch.object(Downloader, "_http_get", _http_get_mock), + patch.object(session, "adapters", adapters_mock), + ): + + filepath, _ = downloader(link, str(tmpdir)) + + # Verify the file was downloaded correctly + with open(filepath, "rb") as downloaded_file: + downloaded_bytes = downloaded_file.read() + expected_bytes = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89" + assert downloaded_bytes == expected_bytes + + # Verify that cache_response was called for the resumed download + mock_controller.cache_response.assert_called_once() + + # Get the call arguments to verify the cached content + call_args = mock_controller.cache_response.call_args + assert call_args is not None + + # Extract positional and keyword arguments + args, kwargs = call_args + request, response = args + body = kwargs.get("body") + status_codes = kwargs.get("status_codes") + + assert body == expected_bytes, "Cached body should match complete file content" + assert response.status == 200, "Cached response should have status 200" + assert request.url == link.url_without_fragment + assert 200 in status_codes From ba1ee527d75d0582cf3b2155440531b35a318809 Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Mon, 23 Jun 2025 21:47:49 -0400 Subject: [PATCH 2/4] Stream the final file to the cache --- src/pip/_internal/network/cache.py | 19 ++++++++++++--- src/pip/_internal/network/download.py | 26 +++++++++++++------- tests/unit/test_network_download.py | 34 ++++++++++++++++----------- 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/pip/_internal/network/cache.py b/src/pip/_internal/network/cache.py index 479a1bb98b8..0c5961c45b4 100644 --- a/src/pip/_internal/network/cache.py +++ b/src/pip/_internal/network/cache.py @@ -3,10 +3,11 @@ from __future__ import annotations import os +import shutil from collections.abc import Generator from contextlib import contextmanager from datetime import datetime -from typing import BinaryIO +from typing import Any, BinaryIO, Callable from pip._vendor.cachecontrol.cache import SeparateBodyBaseCache from pip._vendor.cachecontrol.caches import SeparateBodyFileCache @@ -72,12 +73,13 @@ def get(self, key: str) -> bytes | None: with open(metadata_path, "rb") as f: return f.read() - def _write(self, path: str, data: bytes) -> None: + def _write_to_file(self, path: str, writer_func: Callable[[BinaryIO], Any]) -> None: + """Common file writing logic with proper permissions and atomic replacement.""" with suppressed_cache_errors(): ensure_dir(os.path.dirname(path)) with adjacent_tmp_file(path) as f: - f.write(data) + writer_func(f) # Inherit the read/write permissions of the cache directory # to enable multi-user cache use-cases. mode = ( @@ -93,6 +95,12 @@ def _write(self, path: str, data: bytes) -> None: replace(f.name, path) + def _write(self, path: str, data: bytes) -> None: + self._write_to_file(path, lambda f: f.write(data)) + + def _write_from_io(self, path: str, source_file: BinaryIO) -> None: + self._write_to_file(path, lambda f: shutil.copyfileobj(source_file, f)) + def set( self, key: str, value: bytes, expires: int | datetime | None = None ) -> None: @@ -118,3 +126,8 @@ def get_body(self, key: str) -> BinaryIO | None: def set_body(self, key: str, body: bytes) -> None: path = self._get_cache_path(key) + ".body" self._write(path, body) + + def set_body_from_io(self, key: str, body_file: BinaryIO) -> None: + """Set the body of the cache entry from a file object.""" + path = self._get_cache_path(key) + ".body" + self._write_from_io(path, body_file) diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 00017bfeda0..54e5070d3c4 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -21,7 +21,7 @@ from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError from pip._internal.models.index import PyPI from pip._internal.models.link import Link -from pip._internal.network.cache import is_from_cache +from pip._internal.network.cache import SafeFileCache, is_from_cache from pip._internal.network.session import CacheControlAdapter, PipSession from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext @@ -284,6 +284,14 @@ def _cache_resumed_download( ) return + # Check SafeFileCache is being used + if not isinstance(adapter.cache, SafeFileCache): + logger.debug( + "Skipping resume download caching: " + "cache doesn't support separate body storage" + ) + return + synthetic_request = PreparedRequest() synthetic_request.prepare(method="GET", url=url, headers={}) @@ -300,15 +308,17 @@ def _cache_resumed_download( preload_content=False, ) - # Use the cache controller to store this as a complete response + # Stream the file to cache + cache_url = adapter.controller.cache_url(url) + adapter.cache.set( + cache_url, + adapter.controller.serializer.dumps( + synthetic_request, synthetic_response, b"" + ), + ) download.output_file.flush() with open(download.output_file.name, "rb") as f: - adapter.controller.cache_response( - synthetic_request, - synthetic_response, - body=f.read(), - status_codes=(200, 203, 300, 301, 308), - ) + adapter.cache.set_body_from_io(cache_url, f) logger.debug( "Cached resumed download as complete response for future use: %s", url diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py index 542cb46c3b5..a27f45ab7cd 100644 --- a/tests/unit/test_network_download.py +++ b/tests/unit/test_network_download.py @@ -9,6 +9,7 @@ from pip._internal.exceptions import IncompleteDownloadError from pip._internal.models.link import Link +from pip._internal.network.cache import SafeFileCache from pip._internal.network.download import ( Downloader, _get_http_response_size, @@ -374,6 +375,13 @@ def test_resumed_download_caching(tmpdir: Path) -> None: mock_adapter = MagicMock(spec=CacheControlAdapter) mock_controller = MagicMock() mock_adapter.controller = mock_controller + mock_controller.cache_url = MagicMock(return_value="cache_key") + mock_controller.serializer = MagicMock() + mock_controller.serializer.dumps = MagicMock(return_value=b"serialized_data") + + # Mock the cache to be a SafeFileCache + mock_cache = MagicMock(spec=SafeFileCache) + mock_adapter.cache = mock_cache # Create a mock for the session adapters adapters_mock = MagicMock() @@ -392,20 +400,18 @@ def test_resumed_download_caching(tmpdir: Path) -> None: expected_bytes = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89" assert downloaded_bytes == expected_bytes - # Verify that cache_response was called for the resumed download - mock_controller.cache_response.assert_called_once() + # Verify that cache.set was called for metadata + mock_cache.set.assert_called_once() + + # Verify that set_body_from_io was called for streaming the body + mock_cache.set_body_from_io.assert_called_once() - # Get the call arguments to verify the cached content - call_args = mock_controller.cache_response.call_args - assert call_args is not None + # Verify the call arguments + set_call_args = mock_cache.set.call_args + assert set_call_args[0][0] == "cache_key" # First argument should be cache_key - # Extract positional and keyword arguments - args, kwargs = call_args - request, response = args - body = kwargs.get("body") - status_codes = kwargs.get("status_codes") + set_body_call_args = mock_cache.set_body_from_io.call_args - assert body == expected_bytes, "Cached body should match complete file content" - assert response.status == 200, "Cached response should have status 200" - assert request.url == link.url_without_fragment - assert 200 in status_codes + assert set_body_call_args[0][0] == "cache_key" + assert hasattr(set_body_call_args[0][1], "read") + assert set_body_call_args[0][1].name == filepath From 1d4249a6273018a48548756c0fc7ba52e80b3725 Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Tue, 24 Jun 2025 12:11:45 -0400 Subject: [PATCH 3/4] NEWS ENTRY --- news/13441.bugfix.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 news/13441.bugfix.rst diff --git a/news/13441.bugfix.rst b/news/13441.bugfix.rst new file mode 100644 index 00000000000..a59c27ace22 --- /dev/null +++ b/news/13441.bugfix.rst @@ -0,0 +1 @@ +Add the full file from a resume retry download to the cache. From 67d66395531a1cc3d87a5ffbf81b2b46750395be Mon Sep 17 00:00:00 2001 From: Damian Shaw Date: Tue, 24 Jun 2025 17:25:35 -0400 Subject: [PATCH 4/4] Improve `_attempt_resumes_or_redownloads` and update tests --- news/13441.bugfix.rst | 2 +- src/pip/_internal/network/download.py | 26 +++++--------- tests/unit/test_network_download.py | 52 +++++++-------------------- 3 files changed, 21 insertions(+), 59 deletions(-) diff --git a/news/13441.bugfix.rst b/news/13441.bugfix.rst index a59c27ace22..519f38ea906 100644 --- a/news/13441.bugfix.rst +++ b/news/13441.bugfix.rst @@ -1 +1 @@ -Add the full file from a resume retry download to the cache. +Resumed downloads are saved to the HTTP cache like any other normal download. diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 54e5070d3c4..a655c19c366 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -270,12 +270,7 @@ def _cache_resumed_download( requests can use the cache. """ url = download.link.url_without_fragment - if url.startswith("https://"): - adapter = self._session.adapters["https://"] - elif url.startswith("http://"): - adapter = self._session.adapters["http://"] - else: - return + adapter = self._session.get_adapter(url) # Check if the adapter is the CacheControlAdapter (i.e. caching is enabled) if not isinstance(adapter, CacheControlAdapter): @@ -285,12 +280,9 @@ def _cache_resumed_download( return # Check SafeFileCache is being used - if not isinstance(adapter.cache, SafeFileCache): - logger.debug( - "Skipping resume download caching: " - "cache doesn't support separate body storage" - ) - return + assert isinstance( + adapter.cache, SafeFileCache + ), "separate body cache not in use!" synthetic_request = PreparedRequest() synthetic_request.prepare(method="GET", url=url, headers={}) @@ -308,14 +300,12 @@ def _cache_resumed_download( preload_content=False, ) - # Stream the file to cache + # Save metadata and then stream the file contents to cache. cache_url = adapter.controller.cache_url(url) - adapter.cache.set( - cache_url, - adapter.controller.serializer.dumps( - synthetic_request, synthetic_response, b"" - ), + metadata_blob = adapter.controller.serializer.dumps( + synthetic_request, synthetic_response, b"" ) + adapter.cache.set(cache_url, metadata_blob) download.output_file.flush() with open(download.output_file.name, "rb") as f: adapter.cache.set_body_from_io(cache_url, f) diff --git a/tests/unit/test_network_download.py b/tests/unit/test_network_download.py index a27f45ab7cd..7fb6b7fe64b 100644 --- a/tests/unit/test_network_download.py +++ b/tests/unit/test_network_download.py @@ -9,7 +9,6 @@ from pip._internal.exceptions import IncompleteDownloadError from pip._internal.models.link import Link -from pip._internal.network.cache import SafeFileCache from pip._internal.network.download import ( Downloader, _get_http_response_size, @@ -17,7 +16,7 @@ parse_content_disposition, sanitize_content_filename, ) -from pip._internal.network.session import CacheControlAdapter, PipSession +from pip._internal.network.session import PipSession from pip._internal.network.utils import HEADERS from tests.lib.requests_mocks import MockResponse @@ -355,8 +354,9 @@ def test_downloader( def test_resumed_download_caching(tmpdir: Path) -> None: """Test that resumed downloads are cached properly for future use.""" - session = PipSession() - link = Link("http://example.com/foo.tgz") + cache_dir = tmpdir / "cache" + session = PipSession(cache=str(cache_dir)) + link = Link("https://example.com/foo.tgz") downloader = Downloader(session, "on", resume_retries=5) # Mock an incomplete download followed by a successful resume @@ -371,27 +371,8 @@ def test_resumed_download_caching(tmpdir: Path) -> None: responses = [incomplete_resp, resume_resp] _http_get_mock = MagicMock(side_effect=responses) - # Mock the session's adapters to have a cache controller - mock_adapter = MagicMock(spec=CacheControlAdapter) - mock_controller = MagicMock() - mock_adapter.controller = mock_controller - mock_controller.cache_url = MagicMock(return_value="cache_key") - mock_controller.serializer = MagicMock() - mock_controller.serializer.dumps = MagicMock(return_value=b"serialized_data") - - # Mock the cache to be a SafeFileCache - mock_cache = MagicMock(spec=SafeFileCache) - mock_adapter.cache = mock_cache - - # Create a mock for the session adapters - adapters_mock = MagicMock() - adapters_mock.__getitem__ = MagicMock(return_value=mock_adapter) - - with ( - patch.object(Downloader, "_http_get", _http_get_mock), - patch.object(session, "adapters", adapters_mock), - ): - + with patch.object(Downloader, "_http_get", _http_get_mock): + # Perform the download (incomplete then resumed) filepath, _ = downloader(link, str(tmpdir)) # Verify the file was downloaded correctly @@ -400,18 +381,9 @@ def test_resumed_download_caching(tmpdir: Path) -> None: expected_bytes = b"0cfa7e9d-1868-4dd7-9fb3-f2561d5dfd89" assert downloaded_bytes == expected_bytes - # Verify that cache.set was called for metadata - mock_cache.set.assert_called_once() - - # Verify that set_body_from_io was called for streaming the body - mock_cache.set_body_from_io.assert_called_once() - - # Verify the call arguments - set_call_args = mock_cache.set.call_args - assert set_call_args[0][0] == "cache_key" # First argument should be cache_key - - set_body_call_args = mock_cache.set_body_from_io.call_args - - assert set_body_call_args[0][0] == "cache_key" - assert hasattr(set_body_call_args[0][1], "read") - assert set_body_call_args[0][1].name == filepath + # Verify that the cache directory was created and contains cache files + # The resumed download should have been cached for future use + assert cache_dir.exists() + cache_files = list(cache_dir.rglob("*")) + # Should have cache files (both metadata and body files) + assert len([f for f in cache_files if f.is_file()]) == 2