From 838a94b4202cb3561d820f909927c4415050ed1f Mon Sep 17 00:00:00 2001
From: atalman
Date: Tue, 18 Nov 2025 15:01:49 -0800
Subject: [PATCH 1/8] Change dependency update function to operate on
 index.html only rather than copying whls over

---
 s3_management/update_dependencies.py | 188 ++++++++++++++++-----------
 1 file changed, 110 insertions(+), 78 deletions(-)

diff --git a/s3_management/update_dependencies.py b/s3_management/update_dependencies.py
index 904b2d428d..3e922efc01 100644
--- a/s3_management/update_dependencies.py
+++ b/s3_management/update_dependencies.py
@@ -544,6 +544,19 @@
 }
 
 
+def is_nvidia_package(pkg_name: str) -> bool:
+    """Check if a package is from NVIDIA and should use pypi.nvidia.com"""
+    return pkg_name.startswith("nvidia-")
+
+
+def get_package_source_url(pkg_name: str) -> str:
+    """Get the source URL for a package based on its type"""
+    if is_nvidia_package(pkg_name):
+        return f"https://pypi.nvidia.com/{pkg_name}/"
+    else:
+        return f"https://pypi.org/simple/{pkg_name}/"
+
+
 def download(url: str) -> bytes:
     from urllib.request import urlopen
 
@@ -551,101 +564,123 @@ def download(url: str) -> bytes:
     return conn.read()
 
 
-def is_stable(package_version: str) -> bool:
-    return bool(re.match(r"^([0-9]+\.)+[0-9]+$", package_version))
+def replace_relative_links_with_absolute(html: str, base_url: str) -> str:
+    """
+    Replace all relative links in HTML with absolute links.
+
+    Args:
+        html: HTML content as string
+        base_url: Base URL to prepend to relative links
+
+    Returns:
+        Modified HTML with absolute links
+    """
+    # Ensure base_url ends with /
+    if not base_url.endswith('/'):
+        base_url += '/'
+
+    # Pattern to match href attributes with relative URLs (not starting with http:// or https://)
+    def replace_href(match):
+        full_match = match.group(0)
+        url = match.group(1)
+
+        # If URL is already absolute, don't modify it
+        if url.startswith('http://') or url.startswith('https://') or url.startswith('//'):
+            return full_match
+
+        # Remove leading ./ or /
+        url = url.lstrip('./')
+        url = url.lstrip('/')
+
+        # Replace with absolute URL
+        return f'href="{base_url}{url}"'
+
+    # Replace href="..." patterns
+    html = re.sub(r'href="([^"]+)"', replace_href, html)
+    return html
 
 
-def parse_simple_idx(url: str) -> Dict[str, str]:
-    html = download(url).decode("ascii")
-    return {
+
+def parse_simple_idx(url: str) -> Tuple[Dict[str, str], str]:
+    """
+    Parse a simple package index and return package dict and raw HTML.
+
+    Returns:
+        Tuple of (package_dict, raw_html)
+    """
+    html = download(url).decode("utf-8", errors="ignore")
+    packages = {
         name: url
         for (url, name) in re.findall('<a href="([^"]+)"[^>]*>([^>]+)</a>', html)
     }
+    return packages, html
 
 
-def get_whl_versions(idx: Dict[str, str]) -> List[str]:
-    return [
-        k.split("-")[1]
-        for k in idx.keys()
-        if k.endswith(".whl") and is_stable(k.split("-")[1])
-    ]
+def upload_index_html(
+    pkg_name: str,
+    prefix: str,
+    html: str,
+    base_url: str,
+    *,
+    dry_run: bool = False,
+) -> None:
+    """Upload modified index.html to S3 with absolute links"""
+    # Replace relative links with absolute links
+    modified_html = replace_relative_links_with_absolute(html, base_url)
 
+    index_key = f"{prefix}/{pkg_name}/index.html"
 
-def get_wheels_of_version(idx: Dict[str, str], version: str) -> Dict[str, str]:
-    return {
-        k: v
-        for (k, v) in idx.items()
-        if k.endswith(".whl") and k.split("-")[1] == version
-    }
+    if dry_run:
+        print(f"Dry Run - not uploading index.html to s3://pytorch/{index_key}")
+        return
+
+    print(f"Uploading index.html to s3://pytorch/{index_key}")
+    BUCKET.Object(key=index_key).put(
+        ACL="public-read",
+        ContentType="text/html",
+        Body=modified_html.encode("utf-8")
+    )
 
 
-def upload_missing_whls(
-    pkg_name: str = "numpy",
-    prefix: str = "whl/test",
+def upload_package_using_simple_index(
+    pkg_name: str,
+    prefix: str,
     *,
     dry_run: bool = False,
-    only_pypi: bool = False,
-    target_version: str = "latest",
 ) -> None:
-    pypi_idx = parse_simple_idx(f"https://pypi.org/simple/{pkg_name}")
-    pypi_versions = get_whl_versions(pypi_idx)
-
-    # Determine which version to use
-    if target_version == "latest" or not target_version:
-        selected_version = pypi_versions[-1] if pypi_versions else None
-    elif target_version in pypi_versions:
-        selected_version = target_version
-    else:
-        print(
-            f"Warning: Version {target_version} not found for {pkg_name}, using latest"
-        )
-        selected_version = pypi_versions[-1] if pypi_versions else None
+    """
+    Upload package index.html from PyPI Simple Index.
+    Simply copies the index.html with absolute links - no wheel uploads or version filtering.
+    Works for both NVIDIA and non-NVIDIA packages.
+    """
+    source_url = get_package_source_url(pkg_name)
+    is_nvidia = is_nvidia_package(pkg_name)
+
+    print(f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}")
 
-    if not selected_version:
-        print(f"No stable versions found for {pkg_name}")
+    # Parse the index and get raw HTML
+    try:
+        _, raw_html = parse_simple_idx(source_url)
+    except Exception as e:
+        print(f"Error fetching package {pkg_name}: {e}")
         return
 
-    pypi_latest_packages = get_wheels_of_version(pypi_idx, selected_version)
-
-    download_latest_packages: Dict[str, str] = {}
-    if not only_pypi:
-        download_idx = parse_simple_idx(
-            f"https://download.pytorch.org/{prefix}/{pkg_name}"
-        )
-        download_latest_packages = get_wheels_of_version(download_idx, selected_version)
-
-    has_updates = False
-    for pkg in pypi_latest_packages:
-        if pkg in download_latest_packages:
-            continue
-        # Skip pp packages
-        if "-pp3" in pkg:
-            continue
-        # Skip win32 packages
-        if "-win32" in pkg:
-            continue
-        # Skip muslinux packages
-        if "-musllinux" in pkg:
-            continue
-        print(f"Downloading {pkg}")
-        if dry_run:
-            has_updates = True
-            print(f"Dry Run - not Uploading {pkg} to s3://pytorch/{prefix}/")
-            continue
-        data = download(pypi_idx[pkg])
-        print(f"Uploading {pkg} to s3://pytorch/{prefix}/")
-        BUCKET.Object(key=f"{prefix}/{pkg}").put(
-            ACL="public-read", ContentType="binary/octet-stream", Body=data
-        )
-        has_updates = True
-    if not has_updates:
-        print(f"{pkg_name} is already at version {selected_version} for {prefix}")
+    # Upload modified index.html with absolute links
+    upload_index_html(
+        pkg_name,
+        prefix,
+        raw_html,
+        source_url,
+        dry_run=dry_run
+    )
+
+    print(f"Successfully processed index.html for {pkg_name}")
 
 
 def main() -> None:
     from argparse import ArgumentParser
 
-    parser = ArgumentParser("Upload dependent packages to s3://pytorch")
+    parser = ArgumentParser("Upload dependent package indexes to s3://pytorch")
     # Get unique paths from the packages list
     project_paths = list(
         {
@@ -657,7 +692,6 @@ def main() -> None:
     project_paths += ["all"]
     parser.add_argument("--package", choices=project_paths, default="torch")
     parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--only-pypi", action="store_true")
     parser.add_argument("--include-stable", action="store_true")
 
     args = parser.parse_args()
@@ -682,12 +716,10 @@ def main() -> None:
             else:
                 full_path = f"{prefix}"
 
-            upload_missing_whls(
+            upload_package_using_simple_index(
                 pkg_name,
                 full_path,
-                dry_run=args.dry_run,
-                only_pypi=args.only_pypi,
-                target_version=pkg_config["version"],
+                dry_run=args.dry_run
             )
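Note for reviewers: the href-rewriting step above can be exercised without S3 access. Below is a minimal, self-contained sketch; absolutize_hrefs is an invented name that condenses replace_relative_links_with_absolute(), and the wheel filenames are made up for illustration.

    import re

    def absolutize_hrefs(html: str, base_url: str) -> str:
        # Condensed restatement of replace_relative_links_with_absolute()
        if not base_url.endswith("/"):
            base_url += "/"

        def repl(m):
            url = m.group(1)
            # Already-absolute links pass through untouched
            if url.startswith(("http://", "https://", "//")):
                return m.group(0)
            # Strip a leading "./" or "/", then pin to the source index
            return f'href="{base_url}{url.lstrip("./")}"'

        return re.sub(r'href="([^"]+)"', repl, html)

    sample = (
        '<a href="nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl">w</a>\n'
        '<a href="https://example.com/other.whl">o</a>'
    )
    print(absolutize_hrefs(sample, "https://pypi.nvidia.com/nvidia-nccl-cu12"))
    # The first href gains the pypi.nvidia.com base URL; the second is unchanged.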
+ """ + source_url = get_package_source_url(pkg_name) + is_nvidia = is_nvidia_package(pkg_name) + + print(f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}") - if not selected_version: - print(f"No stable versions found for {pkg_name}") + # Parse the index and get raw HTML + try: + _, raw_html = parse_simple_idx(source_url) + except Exception as e: + print(f"Error fetching package {pkg_name}: {e}") return - pypi_latest_packages = get_wheels_of_version(pypi_idx, selected_version) - - download_latest_packages: Dict[str, str] = {} - if not only_pypi: - download_idx = parse_simple_idx( - f"https://download.pytorch.org/{prefix}/{pkg_name}" - ) - download_latest_packages = get_wheels_of_version(download_idx, selected_version) - - has_updates = False - for pkg in pypi_latest_packages: - if pkg in download_latest_packages: - continue - # Skip pp packages - if "-pp3" in pkg: - continue - # Skip win32 packages - if "-win32" in pkg: - continue - # Skip muslinux packages - if "-musllinux" in pkg: - continue - print(f"Downloading {pkg}") - if dry_run: - has_updates = True - print(f"Dry Run - not Uploading {pkg} to s3://pytorch/{prefix}/") - continue - data = download(pypi_idx[pkg]) - print(f"Uploading {pkg} to s3://pytorch/{prefix}/") - BUCKET.Object(key=f"{prefix}/{pkg}").put( - ACL="public-read", ContentType="binary/octet-stream", Body=data - ) - has_updates = True - if not has_updates: - print(f"{pkg_name} is already at version {selected_version} for {prefix}") + # Upload modified index.html with absolute links + upload_index_html( + pkg_name, + prefix, + raw_html, + source_url, + dry_run=dry_run + ) + + print(f"Successfully processed index.html for {pkg_name}") def main() -> None: from argparse import ArgumentParser - parser = ArgumentParser("Upload dependent packages to s3://pytorch") + parser = ArgumentParser("Upload dependent package indexes to s3://pytorch") # Get unique paths from the packages list project_paths = list( { @@ -657,7 +692,6 @@ def main() -> None: project_paths += ["all"] parser.add_argument("--package", choices=project_paths, default="torch") parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--only-pypi", action="store_true") parser.add_argument("--include-stable", action="store_true") args = parser.parse_args() @@ -682,12 +716,10 @@ def main() -> None: else: full_path = f"{prefix}" - upload_missing_whls( + upload_package_using_simple_index( pkg_name, full_path, - dry_run=args.dry_run, - only_pypi=args.only_pypi, - target_version=pkg_config["version"], + dry_run=args.dry_run ) From 6040f5613ce5f83c7fc6cfb7c63728c27f81dc53 Mon Sep 17 00:00:00 2001 From: atalman Date: Tue, 18 Nov 2025 15:13:36 -0800 Subject: [PATCH 2/8] fix --- s3_management/update_dependencies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/s3_management/update_dependencies.py b/s3_management/update_dependencies.py index 3e922efc01..cf7ca9b76d 100644 --- a/s3_management/update_dependencies.py +++ b/s3_management/update_dependencies.py @@ -655,7 +655,7 @@ def upload_package_using_simple_index( """ source_url = get_package_source_url(pkg_name) is_nvidia = is_nvidia_package(pkg_name) - + print(f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}") # Parse the index and get raw HTML @@ -673,7 +673,7 @@ def upload_package_using_simple_index( source_url, dry_run=dry_run ) - + print(f"Successfully processed index.html for {pkg_name}") From 201967fef6f8dd4100b080218fa5d6409177cb78 Mon Sep 
From 201967fef6f8dd4100b080218fa5d6409177cb78 Mon Sep 17 00:00:00 2001
From: atalman
Date: Tue, 18 Nov 2025 15:43:43 -0800
Subject: [PATCH 3/8] lint

---
 s3_management/update_dependencies.py | 34 ++++++++++++----------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/s3_management/update_dependencies.py b/s3_management/update_dependencies.py
index cf7ca9b76d..4268731e7c 100644
--- a/s3_management/update_dependencies.py
+++ b/s3_management/update_dependencies.py
@@ -576,8 +576,8 @@ def replace_relative_links_with_absolute(html: str, base_url: str) -> str:
         Modified HTML with absolute links
     """
     # Ensure base_url ends with /
-    if not base_url.endswith('/'):
-        base_url += '/'
+    if not base_url.endswith("/"):
+        base_url += "/"
 
     # Pattern to match href attributes with relative URLs (not starting with http:// or https://)
     def replace_href(match):
@@ -585,12 +585,16 @@ def replace_href(match):
         url = match.group(1)
 
         # If URL is already absolute, don't modify it
-        if url.startswith('http://') or url.startswith('https://') or url.startswith('//'):
+        if (
+            url.startswith("http://")
+            or url.startswith("https://")
+            or url.startswith("//")
+        ):
             return full_match
 
         # Remove leading ./ or /
-        url = url.lstrip('./')
-        url = url.lstrip('/')
+        url = url.lstrip("./")
+        url = url.lstrip("/")
 
         # Replace with absolute URL
         return f'href="{base_url}{url}"'
@@ -636,9 +640,7 @@ def upload_index_html(
 
     print(f"Uploading index.html to s3://pytorch/{index_key}")
     BUCKET.Object(key=index_key).put(
-        ACL="public-read",
-        ContentType="text/html",
-        Body=modified_html.encode("utf-8")
+        ACL="public-read", ContentType="text/html", Body=modified_html.encode("utf-8")
     )
 
 
@@ -656,7 +658,9 @@ def upload_package_using_simple_index(
     source_url = get_package_source_url(pkg_name)
     is_nvidia = is_nvidia_package(pkg_name)
 
-    print(f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}")
+    print(
+        f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}"
+    )
 
     # Parse the index and get raw HTML
     try:
@@ -666,13 +670,7 @@ def upload_package_using_simple_index(
         return
 
     # Upload modified index.html with absolute links
-    upload_index_html(
-        pkg_name,
-        prefix,
-        raw_html,
-        source_url,
-        dry_run=dry_run
-    )
+    upload_index_html(pkg_name, prefix, raw_html, source_url, dry_run=dry_run)
 
     print(f"Successfully processed index.html for {pkg_name}")
 
@@ -717,9 +715,7 @@ def main() -> None:
                 full_path = f"{prefix}"
 
             upload_package_using_simple_index(
-                pkg_name,
-                full_path,
-                dry_run=args.dry_run
+                pkg_name, full_path, dry_run=args.dry_run
             )
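The net effect of the first three patches: s3://pytorch/<prefix>/<pkg>/index.html becomes a copy of the upstream simple index in which every relative link is pinned to the source host, so download.pytorch.org serves only HTML and never mirrors the wheels themselves. Sketching with an invented filename, an upstream entry such as

    <a href="nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl">nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl</a>

is stored as

    <a href="https://pypi.nvidia.com/nvidia-nccl-cu12/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl">nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl</a>

so a pip client that resolves a dependency through download.pytorch.org fetches the wheel directly from pypi.nvidia.com (or pypi.org for non-NVIDIA packages).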
"charset_normalizer", "cmake", "colorama", + "cuda_bindings", "fbgemm_gpu", "fbgemm_gpu_genai", "filelock", @@ -218,7 +219,6 @@ "torchvision_extra_decoders", "triton", "tqdm", - "typing_extensions", "typing_inspect", "urllib3", "xformers", @@ -544,7 +544,7 @@ def to_simple_package_html(self, subdir: Optional[str], package_name: str) -> st attributes += ' data-requires-python=">=3.10"' out.append( - f' {path.basename(obj.key).replace("%2B","+")}
' + f' {path.basename(obj.key).replace("%2B", "+")}
' ) # Adding html footer out.append(" ") @@ -562,9 +562,34 @@ def to_simple_packages_html( out.append("") out.append("") out.append(" ") - for pkg_name in sorted(self.get_package_names(subdir)): + + # Get packages from wheel files + packages_from_wheels = set(self.get_package_names(subdir)) + + # Also find packages that have index.html but no wheels + packages_with_index_only = set() + resolved_subdir = self._resolve_subdir(subdir) + + # List all objects in the subdir to find packagename/index.html patterns + prefix_to_search = f"{resolved_subdir}/" + for obj in BUCKET.objects.filter(Prefix=prefix_to_search): + # Check if this is a packagename/index.html file + relative_key = obj.key[len(prefix_to_search):] + parts = relative_key.split("/") + if len(parts) == 2 and parts[1] == "index.html": + package_name = parts[0].replace("-", "_") + # Convert back to the format used in wheel names (use _ not -) + # But we need to check if this package already has wheels + if package_name.lower() not in {p.lower() for p in packages_from_wheels}: + packages_with_index_only.add(package_name) + print(f"INFO: Including package '{package_name}' (has index.html but no wheels)") + + # Combine both sets of packages + all_packages = packages_from_wheels | packages_with_index_only + + for pkg_name in sorted(all_packages): out.append( - f' {pkg_name.replace("_","-")}
' + f' {pkg_name.replace("_", "-")}
' ) # Adding html footer out.append(" ") @@ -691,16 +716,19 @@ def fetch_metadata(self) -> None: # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible. regex_multipart_upload = r"^[A-Za-z0-9+/=]+=-[0-9]+$" with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor: - for idx, future in { - idx: executor.submit( - lambda key: CLIENT.head_object( - Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled" - ), - obj.orig_key, - ) - for (idx, obj) in enumerate(self.objects) - if obj.size is None - }.items(): + futures = {} + for idx, obj in enumerate(self.objects): + if obj.size is None: + print(f"Fetching metadata for: {obj.orig_key}") + future = executor.submit( + lambda key: CLIENT.head_object( + Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled" + ), + obj.orig_key, + ) + futures[idx] = future + + for idx, future in futures.items(): response = future.result() raw = response.get("ChecksumSHA256") if raw and match(regex_multipart_upload, raw): @@ -813,7 +841,7 @@ def main() -> None: ) etime = time.time() print( - f"DEBUG: Fetched {len(idx.objects)} objects for '{prefix}' in {etime-stime:.2f} seconds" + f"DEBUG: Fetched {len(idx.objects)} objects for '{prefix}' in {etime - stime:.2f} seconds" ) if args.compute_sha256: idx.compute_sha256() diff --git a/s3_management/update_dependencies.py b/s3_management/update_dependencies.py index 4268731e7c..9edc58ab45 100644 --- a/s3_management/update_dependencies.py +++ b/s3_management/update_dependencies.py @@ -605,7 +605,7 @@ def replace_href(match): return html -def parse_simple_idx(url: str) -> Tuple[Dict[str, str], str]: +def parse_simple_idx(url: str) -> tuple[Dict[str, str], str]: """ Parse a simple package index and return package dict and raw HTML. From 93e96f957bebfa01365fbed40d15fb748f3faafd Mon Sep 17 00:00:00 2001 From: atalman Date: Wed, 19 Nov 2025 17:31:32 -0800 Subject: [PATCH 5/8] more_fixes --- s3_management/manage.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/s3_management/manage.py b/s3_management/manage.py index 77aabbc228..aeb8e0f5a6 100755 --- a/s3_management/manage.py +++ b/s3_management/manage.py @@ -574,15 +574,19 @@ def to_simple_packages_html( prefix_to_search = f"{resolved_subdir}/" for obj in BUCKET.objects.filter(Prefix=prefix_to_search): # Check if this is a packagename/index.html file - relative_key = obj.key[len(prefix_to_search):] + relative_key = obj.key[len(prefix_to_search) :] parts = relative_key.split("/") if len(parts) == 2 and parts[1] == "index.html": package_name = parts[0].replace("-", "_") # Convert back to the format used in wheel names (use _ not -) # But we need to check if this package already has wheels - if package_name.lower() not in {p.lower() for p in packages_from_wheels}: + if package_name.lower() not in { + p.lower() for p in packages_from_wheels + }: packages_with_index_only.add(package_name) - print(f"INFO: Including package '{package_name}' (has index.html but no wheels)") + print( + f"INFO: Including package '{package_name}' in {prefix_to_search} (has index.html but no wheels)" + ) # Combine both sets of packages all_packages = packages_from_wheels | packages_with_index_only @@ -612,8 +616,11 @@ def upload_libtorch_html(self) -> None: def upload_pep503_htmls(self) -> None: for subdir in self.subdirs: index_html = self.to_simple_packages_html(subdir=subdir) + for bucket in INDEX_BUCKETS: print(f"INFO Uploading {subdir}/index.html to {bucket.name}") + print(f"{index_html}") + 
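The manage.py change above means the top-level index now lists a package if either wheels exist for it or a <package>/index.html object sits directly under the prefix. A minimal sketch of the key-matching rule, runnable without boto3; the helper name and S3 keys are invented for illustration:

    def index_only_packages(keys, prefix):
        found = set()
        for key in keys:
            if not key.startswith(prefix + "/"):
                continue
            parts = key[len(prefix) + 1 :].split("/")
            # Exactly "<package>/index.html" one level below the prefix qualifies
            if len(parts) == 2 and parts[1] == "index.html":
                found.add(parts[0].replace("-", "_"))  # normalize to wheel-style names
        return found

    keys = [
        "whl/test/nvidia-nccl-cu12/index.html",                      # index-only
        "whl/test/torch/torch-2.6.0-cp312-cp312-linux_x86_64.whl",   # a wheel, ignored here
        "whl/test/a/b/index.html",                                   # too deep, skipped
    ]
    print(index_only_packages(keys, "whl/test"))
    # -> {'nvidia_nccl_cu12'}; manage.py additionally drops names that already
    #    have wheels (packages_from_wheels) before emitting the final list.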
From 93e96f957bebfa01365fbed40d15fb748f3faafd Mon Sep 17 00:00:00 2001
From: atalman
Date: Wed, 19 Nov 2025 17:31:32 -0800
Subject: [PATCH 5/8] more_fixes

---
 s3_management/manage.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/s3_management/manage.py b/s3_management/manage.py
index 77aabbc228..aeb8e0f5a6 100755
--- a/s3_management/manage.py
+++ b/s3_management/manage.py
@@ -574,15 +574,19 @@ def to_simple_packages_html(
         prefix_to_search = f"{resolved_subdir}/"
         for obj in BUCKET.objects.filter(Prefix=prefix_to_search):
             # Check if this is a packagename/index.html file
-            relative_key = obj.key[len(prefix_to_search):]
+            relative_key = obj.key[len(prefix_to_search) :]
             parts = relative_key.split("/")
             if len(parts) == 2 and parts[1] == "index.html":
                 package_name = parts[0].replace("-", "_")
                 # Convert back to the format used in wheel names (use _ not -)
                 # But we need to check if this package already has wheels
-                if package_name.lower() not in {p.lower() for p in packages_from_wheels}:
+                if package_name.lower() not in {
+                    p.lower() for p in packages_from_wheels
+                }:
                     packages_with_index_only.add(package_name)
-                    print(f"INFO: Including package '{package_name}' (has index.html but no wheels)")
+                    print(
+                        f"INFO: Including package '{package_name}' in {prefix_to_search} (has index.html but no wheels)"
+                    )
 
         # Combine both sets of packages
         all_packages = packages_from_wheels | packages_with_index_only
@@ -612,8 +616,11 @@ def upload_libtorch_html(self) -> None:
     def upload_pep503_htmls(self) -> None:
         for subdir in self.subdirs:
             index_html = self.to_simple_packages_html(subdir=subdir)
+
             for bucket in INDEX_BUCKETS:
                 print(f"INFO Uploading {subdir}/index.html to {bucket.name}")
+                print(f"{index_html}")
+
                 bucket.Object(key=f"{subdir}/index.html").put(
                     ACL="public-read",
                     CacheControl="no-cache,no-store,must-revalidate",
@@ -727,7 +734,7 @@ def fetch_metadata(self) -> None:
                     obj.orig_key,
                 )
                 futures[idx] = future
-            
+
             for idx, future in futures.items():
                 response = future.result()
                 raw = response.get("ChecksumSHA256")

From a9ca0fa4ae6e7f64210c05da48df035351dcc064 Mon Sep 17 00:00:00 2001
From: atalman
Date: Wed, 19 Nov 2025 17:34:01 -0800
Subject: [PATCH 6/8] more

---
 s3_management/manage.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/s3_management/manage.py b/s3_management/manage.py
index aeb8e0f5a6..e4679aeb24 100755
--- a/s3_management/manage.py
+++ b/s3_management/manage.py
@@ -219,6 +219,7 @@
     "torchvision_extra_decoders",
     "triton",
     "tqdm",
+    "typing_extensions",
     "typing_inspect",
     "urllib3",
     "xformers",

From 323ce775c2f2c37895c3e2bfdcf2952c2372ed8a Mon Sep 17 00:00:00 2001
From: atalman
Date: Wed, 19 Nov 2025 17:37:25 -0800
Subject: [PATCH 7/8] fix

---
 s3_management/manage.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/s3_management/manage.py b/s3_management/manage.py
index e4679aeb24..f63caa97df 100755
--- a/s3_management/manage.py
+++ b/s3_management/manage.py
@@ -620,8 +620,6 @@ def upload_pep503_htmls(self) -> None:
 
             for bucket in INDEX_BUCKETS:
                 print(f"INFO Uploading {subdir}/index.html to {bucket.name}")
-                print(f"{index_html}")
-
                 bucket.Object(key=f"{subdir}/index.html").put(
                     ACL="public-read",
                     CacheControl="no-cache,no-store,must-revalidate",

From 7829ad8ed7b10bbc9d336256faa66bba995d89d1 Mon Sep 17 00:00:00 2001
From: atalman
Date: Wed, 19 Nov 2025 17:42:31 -0800
Subject: [PATCH 8/8] test

---
 s3_management/manage.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/s3_management/manage.py b/s3_management/manage.py
index f63caa97df..9ded3453f3 100755
--- a/s3_management/manage.py
+++ b/s3_management/manage.py
@@ -725,7 +725,6 @@ def fetch_metadata(self) -> None:
             futures = {}
             for idx, obj in enumerate(self.objects):
                 if obj.size is None:
-                    print(f"Fetching metadata for: {obj.orig_key}")
                     future = executor.submit(
                         lambda key: CLIENT.head_object(
                             Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled"