5 changes: 0 additions & 5 deletions .github/workflows/update-s3-dependencies.yml
@@ -1,11 +1,6 @@
 name: Update S3 HTML dependencies for download.pytorch.org nightly and test
 
 on:
-  push:
-    branches:
-      - main
-    paths:
-      - s3_management/update_dependencies.py
   workflow_dispatch:
     inputs:
       dryrun:
61 changes: 47 additions & 14 deletions s3_management/manage.py
@@ -132,6 +132,7 @@
     "charset_normalizer",
     "cmake",
     "colorama",
+    "cuda_bindings",
     "fbgemm_gpu",
     "fbgemm_gpu_genai",
     "filelock",
@@ -544,7 +545,7 @@ def to_simple_package_html(self, subdir: Optional[str], package_name: str) -> str:
             attributes += ' data-requires-python=">=3.10"'
 
             out.append(
-                f' <a href="/{obj.key}{maybe_fragment}"{attributes}>{path.basename(obj.key).replace("%2B","+")}</a><br/>'
+                f' <a href="/{obj.key}{maybe_fragment}"{attributes}>{path.basename(obj.key).replace("%2B", "+")}</a><br/>'
             )
         # Adding html footer
         out.append(" </body>")
@@ -562,9 +563,38 @@ def to_simple_packages_html(
         out.append("<!DOCTYPE html>")
         out.append("<html>")
         out.append(" <body>")
-        for pkg_name in sorted(self.get_package_names(subdir)):
+
+        # Get packages from wheel files
+        packages_from_wheels = set(self.get_package_names(subdir))
+
+        # Also find packages that have index.html but no wheels
+        packages_with_index_only = set()
+        resolved_subdir = self._resolve_subdir(subdir)
+
+        # List all objects in the subdir to find packagename/index.html patterns
+        prefix_to_search = f"{resolved_subdir}/"
+        for obj in BUCKET.objects.filter(Prefix=prefix_to_search):
+            # Check if this is a packagename/index.html file
+            relative_key = obj.key[len(prefix_to_search) :]
+            parts = relative_key.split("/")
+            if len(parts) == 2 and parts[1] == "index.html":
+                package_name = parts[0].replace("-", "_")
+                # Convert back to the format used in wheel names (use _ not -)
+                # But we need to check if this package already has wheels
+                if package_name.lower() not in {
+                    p.lower() for p in packages_from_wheels
+                }:
+                    packages_with_index_only.add(package_name)
+                    print(
+                        f"INFO: Including package '{package_name}' in {prefix_to_search} (has index.html but no wheels)"
+                    )
+
+        # Combine both sets of packages
+        all_packages = packages_from_wheels | packages_with_index_only
+
+        for pkg_name in sorted(all_packages):
             out.append(
-                f' <a href="{pkg_name.lower().replace("_","-")}/">{pkg_name.replace("_","-")}</a><br/>'
+                f' <a href="{pkg_name.lower().replace("_", "-")}/">{pkg_name.replace("_", "-")}</a><br/>'
             )
         # Adding html footer
         out.append(" </body>")
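As an aside, a small sketch of how the discovery loop above classifies keys; the S3 keys here are made up for illustration and are not from this PR:

prefix_to_search = "whl/nightly/"
keys = [
    "whl/nightly/cuda-bindings/index.html",  # two parts, second is index.html -> included
    "whl/nightly/torch/torch-2.9.0-cp310-cp310-linux_x86_64.whl",  # a wheel, not an index
    "whl/nightly/index.html",  # only one part after the prefix -> skipped
]
for key in keys:
    parts = key[len(prefix_to_search):].split("/")
    print(key, "->", len(parts) == 2 and parts[1] == "index.html")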
@@ -587,6 +617,7 @@ def upload_libtorch_html(self) -> None:
     def upload_pep503_htmls(self) -> None:
         for subdir in self.subdirs:
             index_html = self.to_simple_packages_html(subdir=subdir)
+
             for bucket in INDEX_BUCKETS:
                 print(f"INFO Uploading {subdir}/index.html to {bucket.name}")
                 bucket.Object(key=f"{subdir}/index.html").put(
@@ -691,16 +722,18 @@ def fetch_metadata(self) -> None:
         # Add PEP 503-compatible hashes to URLs to allow clients to avoid spurious downloads, if possible.
         regex_multipart_upload = r"^[A-Za-z0-9+/=]+=-[0-9]+$"
         with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
-            for idx, future in {
-                idx: executor.submit(
-                    lambda key: CLIENT.head_object(
-                        Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled"
-                    ),
-                    obj.orig_key,
-                )
-                for (idx, obj) in enumerate(self.objects)
-                if obj.size is None
-            }.items():
+            futures = {}
+            for idx, obj in enumerate(self.objects):
+                if obj.size is None:
+                    future = executor.submit(
+                        lambda key: CLIENT.head_object(
+                            Bucket=BUCKET.name, Key=key, ChecksumMode="Enabled"
+                        ),
+                        obj.orig_key,
+                    )
+                    futures[idx] = future
+
+            for idx, future in futures.items():
                 response = future.result()
                 raw = response.get("ChecksumSHA256")
                 if raw and match(regex_multipart_upload, raw):
@@ -813,7 +846,7 @@ def main() -> None:
         )
         etime = time.time()
         print(
-            f"DEBUG: Fetched {len(idx.objects)} objects for '{prefix}' in {etime-stime:.2f} seconds"
+            f"DEBUG: Fetched {len(idx.objects)} objects for '{prefix}' in {etime - stime:.2f} seconds"
         )
         if args.compute_sha256:
             idx.compute_sha256()
188 changes: 108 additions & 80 deletions s3_management/update_dependencies.py
@@ -544,108 +544,141 @@
 }
 
 
+def is_nvidia_package(pkg_name: str) -> bool:
+    """Check if a package is from NVIDIA and should use pypi.nvidia.com"""
+    return pkg_name.startswith("nvidia-")
Review comment (Contributor): We might need to add more logic to this, as not all packages start with nvidia-, e.g. https://pypi.org/project/cuda-bindings/ or https://pypi.org/project/cuda-cccl/, in case these are added in the future. Not blocking for now, just leaving this as a comment for my future self.
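Along those lines, a minimal sketch of a broader check (hypothetical; the _NVIDIA_PREFIXES allowlist and its contents are assumptions, not code from this PR):

# Hypothetical extension: treat known CUDA distribution prefixes as
# NVIDIA-hosted in addition to the "nvidia-" prefix.
_NVIDIA_PREFIXES = ("nvidia-", "cuda-")  # assumed allowlist; extend as needed

def is_nvidia_package(pkg_name: str) -> bool:
    """Check if a package should be fetched from pypi.nvidia.com."""
    # str.startswith accepts a tuple of prefixes
    return pkg_name.startswith(_NVIDIA_PREFIXES)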



+def get_package_source_url(pkg_name: str) -> str:
+    """Get the source URL for a package based on its type"""
+    if is_nvidia_package(pkg_name):
+        return f"https://pypi.nvidia.com/{pkg_name}/"
+    else:
+        return f"https://pypi.org/simple/{pkg_name}/"
+
+
 def download(url: str) -> bytes:
     from urllib.request import urlopen
 
     with urlopen(url) as conn:
         return conn.read()


-def is_stable(package_version: str) -> bool:
-    return bool(re.match(r"^([0-9]+\.)+[0-9]+$", package_version))
+def replace_relative_links_with_absolute(html: str, base_url: str) -> str:
+    """
+    Replace all relative links in HTML with absolute links.
+
+    Args:
+        html: HTML content as string
+        base_url: Base URL to prepend to relative links
+
+    Returns:
+        Modified HTML with absolute links
+    """
+    # Ensure base_url ends with /
+    if not base_url.endswith("/"):
+        base_url += "/"
+
+    # Pattern to match href attributes with relative URLs (not starting with http:// or https://)
+    def replace_href(match):
+        full_match = match.group(0)
+        url = match.group(1)
+
+        # If URL is already absolute, don't modify it
+        if (
+            url.startswith("http://")
+            or url.startswith("https://")
+            or url.startswith("//")
+        ):
+            return full_match
+
+        # Remove leading ./ or /
+        url = url.lstrip("./")
+        url = url.lstrip("/")
+
+        # Replace with absolute URL
+        return f'href="{base_url}{url}"'
+
+    # Replace href="..." patterns
+    html = re.sub(r'href="([^"]+)"', replace_href, html)
+
+    return html
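For reference, a minimal illustration of what the helper above does (the package name and wheel filename are made up):

# Relative hrefs are rewritten against the base URL; absolute ones pass through.
html_in = '<a href="./pkg-1.0-py3-none-any.whl">pkg</a>'
print(replace_relative_links_with_absolute(html_in, "https://pypi.org/simple/pkg"))
# -> <a href="https://pypi.org/simple/pkg/pkg-1.0-py3-none-any.whl">pkg</a>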


-def parse_simple_idx(url: str) -> Dict[str, str]:
-    html = download(url).decode("ascii")
-    return {
+def parse_simple_idx(url: str) -> tuple[Dict[str, str], str]:
+    """
+    Parse a simple package index and return package dict and raw HTML.
+
+    Returns:
+        Tuple of (package_dict, raw_html)
+    """
+    html = download(url).decode("utf-8", errors="ignore")
+    packages = {
         name: url
         for (url, name) in re.findall('<a href="([^"]+)"[^>]*>([^>]+)</a>', html)
     }
+    return packages, html


-def get_whl_versions(idx: Dict[str, str]) -> List[str]:
-    return [
-        k.split("-")[1]
-        for k in idx.keys()
-        if k.endswith(".whl") and is_stable(k.split("-")[1])
-    ]
+def upload_index_html(
+    pkg_name: str,
+    prefix: str,
+    html: str,
+    base_url: str,
+    *,
+    dry_run: bool = False,
+) -> None:
+    """Upload modified index.html to S3 with absolute links"""
+    # Replace relative links with absolute links
+    modified_html = replace_relative_links_with_absolute(html, base_url)
+
+    index_key = f"{prefix}/{pkg_name}/index.html"
+
-def get_wheels_of_version(idx: Dict[str, str], version: str) -> Dict[str, str]:
-    return {
-        k: v
-        for (k, v) in idx.items()
-        if k.endswith(".whl") and k.split("-")[1] == version
-    }
+    if dry_run:
+        print(f"Dry Run - not uploading index.html to s3://pytorch/{index_key}")
+        return
+
+    print(f"Uploading index.html to s3://pytorch/{index_key}")
+    BUCKET.Object(key=index_key).put(
+        ACL="public-read", ContentType="text/html", Body=modified_html.encode("utf-8")
+    )


-def upload_missing_whls(
-    pkg_name: str = "numpy",
-    prefix: str = "whl/test",
+def upload_package_using_simple_index(
+    pkg_name: str,
+    prefix: str,
     *,
     dry_run: bool = False,
-    only_pypi: bool = False,
-    target_version: str = "latest",
 ) -> None:
-    pypi_idx = parse_simple_idx(f"https://pypi.org/simple/{pkg_name}")
-    pypi_versions = get_whl_versions(pypi_idx)
-
-    # Determine which version to use
-    if target_version == "latest" or not target_version:
-        selected_version = pypi_versions[-1] if pypi_versions else None
-    elif target_version in pypi_versions:
-        selected_version = target_version
-    else:
-        print(
-            f"Warning: Version {target_version} not found for {pkg_name}, using latest"
-        )
-        selected_version = pypi_versions[-1] if pypi_versions else None
+    """
+    Upload package index.html from PyPI Simple Index.
+    Simply copies the index.html with absolute links - no wheel uploads or version filtering.
+    Works for both NVIDIA and non-NVIDIA packages.
+    """
+    source_url = get_package_source_url(pkg_name)
+    is_nvidia = is_nvidia_package(pkg_name)
+
-    if not selected_version:
-        print(f"No stable versions found for {pkg_name}")
+    print(
+        f"Processing {pkg_name} using {'NVIDIA' if is_nvidia else 'PyPI'} Simple Index: {source_url}"
+    )
+
+    # Parse the index and get raw HTML
+    try:
+        _, raw_html = parse_simple_idx(source_url)
+    except Exception as e:
+        print(f"Error fetching package {pkg_name}: {e}")
         return

-    pypi_latest_packages = get_wheels_of_version(pypi_idx, selected_version)
-
-    download_latest_packages: Dict[str, str] = {}
-    if not only_pypi:
-        download_idx = parse_simple_idx(
-            f"https://download.pytorch.org/{prefix}/{pkg_name}"
-        )
-        download_latest_packages = get_wheels_of_version(download_idx, selected_version)
-
-    has_updates = False
-    for pkg in pypi_latest_packages:
-        if pkg in download_latest_packages:
-            continue
-        # Skip pp packages
-        if "-pp3" in pkg:
-            continue
-        # Skip win32 packages
-        if "-win32" in pkg:
-            continue
-        # Skip muslinux packages
-        if "-musllinux" in pkg:
-            continue
-        print(f"Downloading {pkg}")
-        if dry_run:
-            has_updates = True
-            print(f"Dry Run - not Uploading {pkg} to s3://pytorch/{prefix}/")
-            continue
-        data = download(pypi_idx[pkg])
-        print(f"Uploading {pkg} to s3://pytorch/{prefix}/")
-        BUCKET.Object(key=f"{prefix}/{pkg}").put(
-            ACL="public-read", ContentType="binary/octet-stream", Body=data
-        )
-        has_updates = True
-    if not has_updates:
-        print(f"{pkg_name} is already at version {selected_version} for {prefix}")
+    # Upload modified index.html with absolute links
+    upload_index_html(pkg_name, prefix, raw_html, source_url, dry_run=dry_run)
+
+    print(f"Successfully processed index.html for {pkg_name}")


 def main() -> None:
     from argparse import ArgumentParser
 
-    parser = ArgumentParser("Upload dependent packages to s3://pytorch")
+    parser = ArgumentParser("Upload dependent package indexes to s3://pytorch")
     # Get unique paths from the packages list
     project_paths = list(
         {
@@ -657,7 +690,6 @@ def main() -> None:
     project_paths += ["all"]
     parser.add_argument("--package", choices=project_paths, default="torch")
     parser.add_argument("--dry-run", action="store_true")
-    parser.add_argument("--only-pypi", action="store_true")
     parser.add_argument("--include-stable", action="store_true")
     args = parser.parse_args()
 
@@ -682,12 +714,8 @@ def main() -> None:
         else:
             full_path = f"{prefix}"
 
-        upload_missing_whls(
-            pkg_name,
-            full_path,
-            dry_run=args.dry_run,
-            only_pypi=args.only_pypi,
-            target_version=pkg_config["version"],
+        upload_package_using_simple_index(
+            pkg_name, full_path, dry_run=args.dry_run
         )


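For completeness, an invocation sketch using only the flags visible in this diff ("all" is one of the --package choices visible above; the path is assumed relative to the repo root):

python s3_management/update_dependencies.py --package all --dry-run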