Draft
Changes from all commits
194 commits
a2f1736
add sparse_marlin kernel to the build
Oct 17, 2024
f817edf
drop .h from conversion
Oct 17, 2024
c9bc1bc
cp_asyc4_pred_zfill() AMD implementation
Oct 17, 2024
16feff4
implement matching mem utility with amd GCN isa
Oct 18, 2024
0b21555
implement mma util with amd gcn isa
Oct 18, 2024
f23b194
enable rocm path
Oct 18, 2024
ecc3927
update copy from global to lds
lcskrishna Oct 22, 2024
a80730b
implement cvta_to_shared()
Oct 23, 2024
d2c7ce4
consolidate code with cvta_to_shared()
Oct 23, 2024
15974c7
Merge branch 'main' into rocm_sparse_marlin
petrex Jan 8, 2025
a4e8c30
lint
Jan 8, 2025
c678cb0
add GPU arch check for MI300x
Jan 9, 2025
08d1cfb
revert change in tensor_core_tile_layout.cu
Jan 9, 2025
b5b739b
Skip tests on fbcode
jainapurva Jan 9, 2025
982141b
Make it easer to isolate test cases (#1537)
drisspg Jan 10, 2025
cedadc7
Fix failing docs build in CI (#1542)
jainapurva Jan 10, 2025
9c2635b
torchao setup.py with cmake
metascroy Jan 10, 2025
79979ec
SAM2: Rerun batch size 1 experiments on latest nightly (#1543)
cpuhrsch Jan 10, 2025
24a78fe
Add run_tutorials github action and fix existing errors (#1546)
jerryzh168 Jan 10, 2025
6d6aa01
Add support for eager mode performance (#1539)
jerryzh168 Jan 11, 2025
1651ffa
Update run_tutorials.yml (#1550)
jerryzh168 Jan 11, 2025
ad61822
Remove temp build files from torchao (#1551)
metascroy Jan 11, 2025
f15ec15
Add convert path for quantize_ QAT API (#1540)
andrewor14 Jan 13, 2025
d57704c
Update QAT READMEs using new APIs (#1541)
andrewor14 Jan 13, 2025
12a58cf
Fix run_tutorials code (#1552)
jerryzh168 Jan 13, 2025
7b3caa6
Verify that submodules are checked out (#1536)
alexsamardzic Jan 13, 2025
9ea7d30
[cleanup][1/x] make hp_tensor_to_float8_dynamic only work with hp inp…
vkuzo Jan 13, 2025
2ec9bc1
[cleanup][2/x] split float8 mm by delayed vs dynamic (#1461)
vkuzo Jan 13, 2025
12396c6
[cleanup][3/x] unify dynamic input and grad_output casting (#1480)
vkuzo Jan 13, 2025
de5c6e1
Make sure tests are ran with pytest (#1538)
drisspg Jan 13, 2025
b3deb16
Fix torch.intx support in FakeQuantizeConfig (#1544)
andrewor14 Jan 14, 2025
0bc5b00
Clean up linear_int8_dynamic_activation_intx_weight_subclass
metascroy Jan 14, 2025
71c6231
SAM2 Modal script extensions (#1500)
cpuhrsch Jan 14, 2025
1c0ea5b
Fix float related autoquant options (#1562)
jerryzh168 Jan 15, 2025
11333ba
Update __init__.py to load experimental ops even if other C++ ops are…
metascroy Jan 15, 2025
e1cb44a
Bug Fix (#1559): sparsity instead of sparstiy (#1560)
jaewoosong Jan 15, 2025
b96196b
Merge branch 'main' into rocm_sparse_marlin
petrex Jan 15, 2025
aea9d81
lint
Jan 15, 2025
f90b29e
[float8nocompile] support option to not precompute fp8 tensor for bac…
danielvegamyhre Jan 16, 2025
5e59b51
[float8nocompile] add e2e fsdp test (#1523)
danielvegamyhre Jan 16, 2025
522f5b8
[float8nocompile] add triton kernel which does fp8 conversion to col …
danielvegamyhre Jan 16, 2025
74a15f1
Add a register_replacement to fix float8 delayed scaling kernel fusio…
y-sq Jan 16, 2025
eea4d25
Update version to 0.9.0 (#1568)
jainapurva Jan 16, 2025
f520c91
Update supported dtypes for fp8 (#1573)
jainapurva Jan 17, 2025
cf45336
Relax dtype requirements for int4 and float8 quants in autoquant (#1571)
jerryzh168 Jan 17, 2025
d96c6a7
Enable ROCM in CI (#999)
msaroufim Jan 17, 2025
a1c67b9
Skip Unit Tests for ROCm CI (#1563)
petrex Jan 17, 2025
69f3795
Delete unused QAT utils code (#1579)
andrewor14 Jan 17, 2025
9afaabb
Revert "Skip Unit Tests for ROCm CI" (#1580)
andrewor14 Jan 17, 2025
1240b19
Revert "Enable ROCM in CI" (#1583)
andrewor14 Jan 17, 2025
32d9b0b
Fix CI linux_job permissions (#1576)
jainapurva Jan 18, 2025
ea7910e
Refactor s8s4_linear_cutlass() (#1545)
alexsamardzic Jan 21, 2025
5d1444b
Sparsity docs update (#1590)
jainapurva Jan 21, 2025
166a357
Sparsity getting started docs (#1592)
jainapurva Jan 22, 2025
602ba86
gate sparsity tests by presence of cusparselt (#1602)
vkuzo Jan 23, 2025
d0e434c
Fix broken link on doc page (#1582)
andrewor14 Jan 23, 2025
e53edaa
pin nightlies to 20250122 (#1608)
vkuzo Jan 23, 2025
52280bb
[BE] Only run docs build in CI if docs have changed (#1589)
danielvegamyhre Jan 23, 2025
2d4c848
[float8nocompile] Add float8nocompile CI tests which only trigger on …
danielvegamyhre Jan 24, 2025
4ed93b9
[CPU] Fix registration of int4wo linear implementation on CPU (#1578)
Xia-Weiwen Jan 24, 2025
0fae693
Add H100 to Float8 CI for testing (#1575)
jainapurva Jan 24, 2025
4e4f4df
Add quick start guide for first time users (#1611)
andrewor14 Jan 24, 2025
70be245
Move fpx to tensor subclass (#1603)
jainapurva Jan 24, 2025
fb335e0
Revert "Move fpx to tensor subclass" (#1616)
jainapurva Jan 24, 2025
6c3bc53
Update api_ref_dtypes docs (#1610)
jainapurva Jan 24, 2025
860da26
Add module swap -> tensor subclass migration tutorial (#1596)
andrewor14 Jan 24, 2025
11440c2
mx cleanup [1/x]: unbreak mx_formats tests (#1569)
vkuzo Jan 24, 2025
6b472e5
mx cleanup [2/x]: refactor mx gemm (#1593)
vkuzo Jan 24, 2025
47f96f1
add separate quantization primitives for float8 (#1597)
danielvegamyhre Jan 25, 2025
09dd636
Prepare for -DPy_LIMITED_API flag in pytorch #145764 (#1627)
janeyx99 Jan 27, 2025
13bd59e
Update docs to refer to version.html (#1631)
jainapurva Jan 27, 2025
e151d6a
notify when CI job fails (#1547)
HDCharles Jan 28, 2025
abd41e5
Add torchao/experimental CI test (#1586)
metascroy Jan 28, 2025
7b0d2ce
Consolidate `ZeroPointDomain.NONE` & `None` zero point domains (#1556)
sanchitintel Jan 29, 2025
2aed684
Pass all args to pytest.main to propagate user options like -k (#1640)
janeyx99 Jan 29, 2025
2d8c8eb
only run docs CI jobs on PRs when docs have changed (#1612)
danielvegamyhre Jan 29, 2025
0c42823
Fix `.item()` issue in running parallel evaluation for BO mixed preci…
haodongucsb Jan 29, 2025
aa0b7ca
Split contributor guide into quantization overview (#1618)
andrewor14 Jan 29, 2025
c1f5872
Update api_ref_quantization docs (#1619)
andrewor14 Jan 29, 2025
b559c6d
[Experimental][Kleidi] Add GEMM operator tests (#1638)
digantdesai Jan 30, 2025
463a872
skip failing MX tests on cuda capability 10.0 (#1624)
vkuzo Jan 30, 2025
7815262
[Feat]: Add support for kleidiai quantization schemes (#1447)
nikhil-arm Jan 30, 2025
48fdd31
Ruff lint (#1646)
metascroy Jan 30, 2025
3eb18e7
float8 rowwise training: add FSDP workaround (#1629)
vkuzo Jan 31, 2025
122eb73
more stringent test for CPUOffloadOptimizer (#1650)
ngc92 Feb 1, 2025
6ffe236
Fix LR scheduler issue with CPU offload optimizer (#1649)
gau-nernst Feb 2, 2025
7e54629
Fix ruff and make sure pre-commit is at same version (#1658)
drisspg Feb 4, 2025
b2fb664
Add int8 dynamic activation + int8 weight only test to TensorParallel…
jainapurva Feb 4, 2025
1a4c8f9
Add CUTLASS-based W4A4 (#1515)
gau-nernst Feb 5, 2025
8afd10e
Fix compile issue for Marin qqq on sm<8.0 (#1651)
gau-nernst Feb 5, 2025
8d14f0e
SAM2: more export, small perf improvements (#1673)
cpuhrsch Feb 5, 2025
4df4d03
Moved CUTLASS pin to v3.7.0 (#1672)
alexsamardzic Feb 5, 2025
bc1530b
Q dq layout (#1642)
metascroy Feb 5, 2025
c6611be
Remove duplicate definitions of fill_defaults (#1674)
jainapurva Feb 6, 2025
867a91f
update notify in build_wheels_linux.yml (#1676)
HDCharles Feb 6, 2025
1d75c8f
Support mixed MX element dtype in `mx_mm` function and `MXLinear`. (#…
balancap Feb 6, 2025
753ba98
Test fix (#1678)
jainapurva Feb 6, 2025
d1e6c03
CI fix for linux wheels (#1679)
jainapurva Feb 6, 2025
cc6244c
Add boiler plate code to Tensor subclass (#1663)
jainapurva Feb 7, 2025
e7aa4ca
add a deprecation warning for float8 delayed and static scaling (#1681)
vkuzo Feb 7, 2025
c8eb8d3
Lint fixes for fbcode (#1682)
jainapurva Feb 7, 2025
4d1c774
SAM2: Modal experiments QoL improvements (#1683)
cpuhrsch Feb 9, 2025
bae41d1
mx: add ceil and RNE rounding modes to the cast from fp32 to e8m0 (#1…
vkuzo Feb 10, 2025
32a51ec
Support power of 2 scaling factors in float8 training and use e4m3 ev…
danielvegamyhre Feb 10, 2025
999b16d
Add third_party to exclude (#1692)
drisspg Feb 11, 2025
d99785c
Update float8nocompile readme (#1693)
danielvegamyhre Feb 11, 2025
39dd340
Change TORCH_LIBRARY to TORCH_LIBRARY_FRAGMENT (#1645)
metascroy Feb 12, 2025
682ffd5
Update to cutlass 3.8 (#1634)
drisspg Feb 12, 2025
aa51486
SAM2: Collect p90 latency statistics (#1703)
cpuhrsch Feb 12, 2025
d3306b2
Add mx_fp8_bf16 kernel (#1637)
drisspg Feb 12, 2025
dff29c0
Fix use_hqq for int4_weight_only quantize (#1707)
jainapurva Feb 13, 2025
52f4737
[bc-breaking] enable direct configuration in quantize_ (#1595)
vkuzo Feb 14, 2025
2e51872
config migration: float8* (#1694)
vkuzo Feb 14, 2025
6fe41c2
config migration: int* (#1696)
vkuzo Feb 14, 2025
413689d
config migration: fpx, gemlite, uintx (#1697)
vkuzo Feb 14, 2025
17b9ce3
unbreak float8 static quant tutorial (#1709)
vkuzo Feb 14, 2025
3fa8e44
migrate static quant tutorials to direct configuration (#1710)
vkuzo Feb 14, 2025
12e830b
update torchao READMEs with new configuration APIs (#1711)
vkuzo Feb 14, 2025
3227472
make quantize_.set_inductor_config None by default (#1716)
vkuzo Feb 14, 2025
c3bb80e
mx formats: create MXLinearConfig (#1688)
vkuzo Feb 14, 2025
40d01cd
MX: move block_size and elem_dtype into MXLinearConfig (#1689)
vkuzo Feb 14, 2025
8fc49fe
MX: hook up mxfp8 and mxfp4 CUTLASS kernels to MXLinear (#1713)
vkuzo Feb 14, 2025
22d7d51
Reformat (#1723)
metascroy Feb 18, 2025
aa9b9c9
Fix `DDP` with `nf4` (#1684)
jeromeku Feb 18, 2025
f2e8f56
notify on wheel failure for aarch, m1, windows (#1725)
HDCharles Feb 18, 2025
7b37eb0
Make TorchAO cpp/Python extension
drisspg Feb 18, 2025
988c5c9
fix tensor parallelism for float8 training with rowwise scaling (#1718)
vkuzo Feb 18, 2025
79ac44e
Promote Supermask out of prototype (#1729)
jcaip Feb 18, 2025
c59561a
SAM2: Update README.md (#1735)
cpuhrsch Feb 19, 2025
7fc8ad4
float8 training: clean up recipe names (#1730)
vkuzo Feb 19, 2025
c6c388b
float8 training: make the "config from recipe" API polished (#1731)
vkuzo Feb 19, 2025
ed16fe7
float8 training: add README.md entry for rowwise scaling (#1733)
vkuzo Feb 19, 2025
ceceea5
promote blocksparse from prototype, make it faster (#1734)
jcaip Feb 19, 2025
217d968
Make FakeQuantizer expose useful config details (#1717)
andrewor14 Feb 19, 2025
4780e10
Update version.txt to 0.10.0 (#1714)
HDCharles Feb 20, 2025
f6f3322
Add ukernel selection logic + clean up KleidiAI integration (#1652)
metascroy Feb 20, 2025
0293bcd
Remove duplicate, confusing conditional in setup.py (#1748)
janeyx99 Feb 20, 2025
6bab4db
SAM2: Use torch.export for VOS (#1708)
cpuhrsch Feb 20, 2025
1c76736
Fix ruff for torchao/float8/config.py (#1750)
cpuhrsch Feb 21, 2025
dc0134e
Add ciflow/rocm to bot-created tags (#1749)
jithunnair-amd Feb 21, 2025
e0f7148
Update to cutlass 3.8 tag (#1754)
drisspg Feb 21, 2025
878ec7a
Add linear bias support for QAT (#1755)
andrewor14 Feb 21, 2025
ed361ff
[Reland] ROCm CI (Infra + Skips) (#1581)
petrex Feb 21, 2025
c72ebc6
move decorators to testing/utils.py (#1761)
jcaip Feb 22, 2025
25ddb77
Allow for scales to be in new e8m0 dtype (#1742)
drisspg Feb 22, 2025
d370196
delete delayed scaling from torchao.float8 (#1753)
vkuzo Feb 22, 2025
2a3fbff
MX Updated to_blocked to not call nn.pad (#1762)
drisspg Feb 22, 2025
8d38814
add MX support to lowp training profiling script (#1765)
vkuzo Feb 24, 2025
bac039f
Update README.md (#1758)
jerryzh168 Feb 24, 2025
09ebb12
mx bench: add cast with to_blocked (#1771)
vkuzo Feb 24, 2025
089cd7e
update mixed mm weight only quant test to work w mixed mm deletion (#…
eellison Feb 24, 2025
38e36de
Auto-fix lint violations from Fixit] fbcode//pytorch/ao (#1752)
facebook-github-bot Feb 24, 2025
98c4e2e
Fix potential out-of-bound access in int8_mm.py (#1751)
mark14wu Feb 25, 2025
f18043d
Merge branch 'main' into rocm_sparse_marlin
petrex Feb 25, 2025
8706d3f
Fix internal test_linear_8bit_act_xbit_weightAppleMac
metascroy Feb 26, 2025
7d87946
[1/x] float8 cleanup: remove float8_python_api (#1779)
vkuzo Feb 26, 2025
d00ee41
[2/x] float8 cleanup: move roofline utils to testing (#1780)
vkuzo Feb 26, 2025
8d110bf
modify cast from hp to mx to help inductor fuse (#1786)
vkuzo Feb 26, 2025
1ab1b77
add a benchmark for casting a tensor to MX across dim0 and dim1 (#1787)
vkuzo Feb 26, 2025
c788ee7
[1/x] mx roofline: make the script work on NVIDIA B200 (#1778)
vkuzo Feb 27, 2025
e6706ca
roofline estimation: delete scaling type (#1781)
vkuzo Feb 27, 2025
cd69415
roofline estimation: delete axiswise scaling, for now (#1782)
vkuzo Feb 27, 2025
f478692
roofline estimator: simplify (#1783)
vkuzo Feb 27, 2025
79e3366
Add support for copy_ for plain layout and tensor core tiled layout (…
jerryzh168 Feb 28, 2025
b9c51b7
Updating Cuda 12.1/12.4 to 12.4/12.6 to reflect current state (#1794)
HDCharles Feb 28, 2025
ac832b0
Fixing DORA imports (#1795)
HDCharles Feb 28, 2025
890e0ac
Use exp2 for mx scaling (#1530)
drisspg Feb 28, 2025
3219318
bugfix clean_release_notes.py (#1801)
HDCharles Feb 28, 2025
4a4925f
Revert "Add support for copy_ for plain layout and tensor core tiled …
jainapurva Feb 28, 2025
8f93751
metal lowbit kernels: pip install (#1785)
manuelcandales Mar 1, 2025
7963f9c
[float8] add float8 training benchmarking scripts (#1802)
danielvegamyhre Mar 1, 2025
3bc1dd4
Silence loud error on torchao cpu builds (#1808)
msaroufim Mar 3, 2025
55600a1
Delete DORA (#1815)
msaroufim Mar 3, 2025
914de78
Revert "Use exp2 for mx scaling" (#1813)
jainapurva Mar 3, 2025
bc54ae5
Fix experimental CI (#1820)
metascroy Mar 3, 2025
7b496c9
Remove split_k kernel (#1816)
msaroufim Mar 3, 2025
e2f4ab4
CPUOffload: only offload parameters above a certain size (#1720)
ngc92 Mar 4, 2025
2c2a590
update typehint (#1740)
crcrpar Mar 4, 2025
81a2813
Move torchao/_models to benchmarks/_models (#1784)
jainapurva Mar 4, 2025
d8af7d7
roofline estimator: add float8 rowwise and mxfp8 recipe support (#1789)
vkuzo Mar 4, 2025
173d9bf
metal lowbit ops: ci (#1825)
manuelcandales Mar 4, 2025
e767713
Fix experimental CI (#1827)
metascroy Mar 4, 2025
9bcd73b
Optionally enable KleidiAI + clean up setup.py flags (#1826)
metascroy Mar 4, 2025
8b34390
Merge branch 'main' into rocm_sparse_marlin
petrex Mar 4, 2025
1ff8592
Fix float8nocompile CI workflow (#1695)
danielvegamyhre Mar 4, 2025
d4be9e4
ROCm Support : Tile_Layout kernel (#1201)
petrex Mar 4, 2025
883dc65
ruff fix for setup.py (#1833)
jcaip Mar 4, 2025
75b6816
Merge branch 'main' into rocm_sparse_marlin
petrex Mar 4, 2025
8124a58
lint
Mar 4, 2025
29d1be6
fix gpu_arch
Mar 6, 2025
617e792
Improve ROCm GPU architecture detection in setup.py
Mar 6, 2025
3db4c4d
Refactor CUDA/ROCm source file handling in setup.py
Mar 6, 2025
92fedc8
Improve CUDA/ROCm extension build configuration
Mar 10, 2025
67a538a
Add detailed logging for CUDA/ROCm source file discovery
Mar 10, 2025
2 changes: 2 additions & 0 deletions .github/pytorch-probot.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
mergebot: True
ciflow_push_tags:
- ciflow/benchmark
- ciflow/tutorials
- ciflow/rocm
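The new `ciflow/rocm` entry registers the tag prefix with pytorch-probot; the ROCm regression workflow (regression_test_rocm.yml, added later in this diff) triggers on tags matching `ciflow/rocm/*`. A minimal sketch of that convention follows — the PR number is illustrative, and in practice pytorchbot pushes these tags when a maintainer applies the corresponding label, so the manual `git tag`/`git push` step is shown only as a hypothetical equivalent.

```shell
# Illustrative only: show how a ciflow tag name lines up with the
# workflow's `tags: - ciflow/rocm/*` trigger pattern.
PR_NUMBER=1581               # hypothetical PR number
TAG="ciflow/rocm/${PR_NUMBER}"

# Confirm the tag matches the trigger glob before pushing.
case "$TAG" in
  ciflow/rocm/*) echo "tag $TAG matches trigger ciflow/rocm/*" ;;
  *) echo "tag $TAG would NOT trigger the workflow" >&2; exit 1 ;;
esac

# git tag "$TAG" && git push origin "$TAG"   # requires write access; normally done by the bot
```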
31 changes: 31 additions & 0 deletions .github/workflows/build-wheels_m1.yml
@@ -41,3 +41,34 @@ jobs:
runner-type: macos-m1-stable
smoke-test-script: test/smoke_test.py
trigger-event: ${{ github.event_name }}
notify:
runs-on: ubuntu-latest
name: Email notification
needs: [generate-matrix, build]
if: failure() && github.event_name == 'schedule'
steps:
- uses: dawidd6/action-send-mail@v4
with:
server_address: smtp.gmail.com
server_port: 465
username: torchao.notify
password: ${{ secrets.TORCHAO_NOTIFY_PASSWORD }}
from: [email protected]
to: ${{ secrets.TORCHAO_NOTIFY_RECIPIENT }}
subject: Scheduled Build Failure for TorchAO
body: |
Build Failure Notification for TorchAO
A failure occurred in the Build Linux Wheels workflow.
Run Details:
- Workflow: ${{ github.workflow }}
- Run Type: ${{ github.event_name }}
- Repository: ${{ github.repository }}
- Branch/PR: ${{ github.ref }}
- Commit: ${{ github.sha }}
You can view the full run details here:
${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
Error Information:
${{ needs.generate-matrix.result == 'failure' && 'Matrix generation failed' || '' }}
${{ needs.build.result == 'failure' && 'Build job failed' || '' }}

This is an automated notification. Please check the GitHub Actions page for more details about the failure.
34 changes: 33 additions & 1 deletion .github/workflows/build_wheels_aarch64_linux.yml
@@ -29,7 +29,8 @@ jobs:
test-infra-repository: pytorch/test-infra
test-infra-ref: main
with-cuda: disable

# please note: excluding 3.13t for aarch64 builds for now
python-versions: '["3.9", "3.10", "3.11", "3.12", "3.13"]'
build:
needs: generate-matrix
permissions:
@@ -53,3 +54,34 @@
setup-miniconda: false
secrets:
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
notify:
runs-on: ubuntu-latest
name: Email notification
needs: [generate-matrix, build]
if: failure() && github.event_name == 'schedule'
steps:
- uses: dawidd6/action-send-mail@v4
with:
server_address: smtp.gmail.com
server_port: 465
username: torchao.notify
password: ${{ secrets.TORCHAO_NOTIFY_PASSWORD }}
from: [email protected]
to: ${{ secrets.TORCHAO_NOTIFY_RECIPIENT }}
subject: Scheduled Build Failure for TorchAO
body: |
Build Failure Notification for TorchAO
A failure occurred in the Build AARCH64 Wheels workflow.
Run Details:
- Workflow: ${{ github.workflow }}
- Run Type: ${{ github.event_name }}
- Repository: ${{ github.repository }}
- Branch/PR: ${{ github.ref }}
- Commit: ${{ github.sha }}
You can view the full run details here:
${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
Error Information:
${{ needs.generate-matrix.result == 'failure' && 'Matrix generation failed' || '' }}
${{ needs.build.result == 'failure' && 'Build job failed' || '' }}

This is an automated notification. Please check the GitHub Actions page for more details about the failure.
37 changes: 37 additions & 0 deletions .github/workflows/build_wheels_linux.yml
@@ -30,6 +30,8 @@ jobs:
with-cuda: enable
with-rocm: enable
with-xpu: enable
# please note: excluding 3.13t for aarch64 builds for now
python-versions: '["3.9", "3.10", "3.11", "3.12", "3.13"]'

build:
needs: generate-matrix
@@ -56,3 +58,38 @@ jobs:
upload-to-pypi: cu121
secrets:
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
notify:
runs-on: ubuntu-latest
name: Email notification
needs: [generate-matrix, build]
if: failure() && github.event_name == 'schedule'
steps:
- uses: dawidd6/action-send-mail@v4
with:
server_address: smtp.gmail.com
server_port: 465
username: torchao.notify
password: ${{ secrets.TORCHAO_NOTIFY_PASSWORD }}
from: [email protected]
to: ${{ secrets.TORCHAO_NOTIFY_RECIPIENT }}
subject: Scheduled Build Failure for TorchAO
body: |
Build Failure Notification for TorchAO

A failure occurred in the Build Linux Wheels workflow.

Run Details:
- Workflow: ${{ github.workflow }}
- Run Type: ${{ github.event_name }}
- Repository: ${{ github.repository }}
- Branch/PR: ${{ github.ref }}
- Commit: ${{ github.sha }}

You can view the full run details here:
${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

Error Information:
${{ needs.generate-matrix.result == 'failure' && 'Matrix generation failed' || '' }}
${{ needs.build.result == 'failure' && 'Build job failed' || '' }}

This is an automated notification. Please check the GitHub Actions page for more details about the failure.
35 changes: 35 additions & 0 deletions .github/workflows/build_wheels_windows.yml
@@ -60,3 +60,38 @@ jobs:
package-name: ${{ matrix.package-name }}
smoke-test-script: ${{ matrix.smoke-test-script }}
trigger-event: ${{ github.event_name }}
notify:
runs-on: ubuntu-latest
name: Email notification
needs: [generate-matrix, build]
if: failure() && github.event_name == 'schedule'
steps:
- uses: dawidd6/action-send-mail@v4
with:
server_address: smtp.gmail.com
server_port: 465
username: torchao.notify
password: ${{ secrets.TORCHAO_NOTIFY_PASSWORD }}
from: [email protected]
to: ${{ secrets.TORCHAO_NOTIFY_RECIPIENT }}
subject: Scheduled Build Failure for TorchAO
body: |
Build Failure Notification for TorchAO

A failure occurred in the Build Windows Wheels workflow.

Run Details:
- Workflow: ${{ github.workflow }}
- Run Type: ${{ github.event_name }}
- Repository: ${{ github.repository }}
- Branch/PR: ${{ github.ref }}
- Commit: ${{ github.sha }}

You can view the full run details here:
${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

Error Information:
${{ needs.generate-matrix.result == 'failure' && 'Matrix generation failed' || '' }}
${{ needs.build.result == 'failure' && 'Build job failed' || '' }}

This is an automated notification. Please check the GitHub Actions page for more details about the failure.
10 changes: 5 additions & 5 deletions .github/workflows/dashboard_perf_test.yml
@@ -42,19 +42,19 @@ jobs:

mkdir -p ${{ runner.temp }}/benchmark-results
# llama3 - compile baseline
${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json

# llama3 - autoquant
${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json

# skipping SAM because of https://hud.pytorch.org/pr/pytorch/ao/1407
# # SAM
# ${CONDA_RUN} pip install git+https://github.com/pytorch-labs/segment-anything-fast.git@main
# # SAM compile baselilne
# ${CONDA_RUN} sh torchao/_models/sam/setup.sh
# ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
# ${CONDA_RUN} sh benchmarks/_models/sam/setup.sh
# ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json

# ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
# ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json

# SAM 2.1
# ${CONDA_RUN} sh scripts/download_sam2_ckpts.sh ${CHECKPOINT_PATH}/sam2
5 changes: 4 additions & 1 deletion .github/workflows/doc_build.yml
@@ -10,6 +10,9 @@ on:
- v[0-9]+.[0-9]+.[0-9]
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
pull_request:
paths:
- 'docs/**'
- '!docs/**'
workflow_dispatch:

concurrency:
@@ -91,7 +94,7 @@ jobs:
ref: gh-pages
persist-credentials: true
- name: Download artifact
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: Doc-Build
path: docs
12 changes: 10 additions & 2 deletions .github/workflows/float8_test.yml
@@ -25,10 +25,18 @@ jobs:
include:
- name: SM-89
runs-on: linux.g6.4xlarge.experimental.nvidia.gpu
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
torch-spec: '--pre torch==2.7.0.dev20250122 --index-url https://download.pytorch.org/whl/nightly/cu124'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"
gpu-arch-version: "12.4"
- name: H100
runs-on: linux.aws.h100
torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
gpu-arch-type: "cuda"
gpu-arch-version: "12.4"

permissions:
id-token: write
contents: read
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
timeout: 60
53 changes: 53 additions & 0 deletions .github/workflows/float8nocompile_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
name: Run Float8nocompile Tests

on:
push:
branches:
- main
- 'gh/**'
paths:
- 'torchao/prototype/float8nocompile/**'
pull_request:
branches:
- main
- 'gh/**'
paths:
- 'torchao/prototype/float8nocompile/**'

concurrency:
group: floatnocompile_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
test:
strategy:
fail-fast: false
matrix:
include:
- name: SM-89
runs-on: linux.g6.4xlarge.experimental.nvidia.gpu
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"

uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
timeout: 300
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
submodules: recursive
script: |
conda create -n venv python=3.9 -y
conda activate venv
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
python -m pip install --upgrade pip
pip install ${{ matrix.torch-spec }}
pip install -r dev-requirements.txt
pip install .
cd torchao/prototype/float8nocompile
pytest kernels/ --verbose -s
pytest test/train_test.py --verbose -s
10 changes: 6 additions & 4 deletions .github/workflows/nightly_smoke_test.yml
@@ -11,7 +11,7 @@ concurrency:
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
test:
@@ -21,11 +21,13 @@
include:
- name: CUDA Nightly
runs-on: linux.g5.12xlarge.nvidia.gpu
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
torch-spec: '--pre torch==2.7.0.dev20250122 --index-url https://download.pytorch.org/whl/nightly/cu124'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"

gpu-arch-version: "12.4"

permissions:
id-token: write
contents: read
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: ${{ matrix.runs-on }}
7 changes: 5 additions & 2 deletions .github/workflows/regression_test.yml
@@ -25,15 +25,18 @@ jobs:
include:
- name: CUDA Nightly
runs-on: linux.g5.12xlarge.nvidia.gpu
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124'
torch-spec: '--pre torch==2.7.0.dev20250122 --index-url https://download.pytorch.org/whl/nightly/cu124'
gpu-arch-type: "cuda"
gpu-arch-version: "12.4"
- name: CPU Nightly
runs-on: linux.4xlarge
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
torch-spec: '--pre torch==2.7.0.dev20250122 --index-url https://download.pytorch.org/whl/nightly/cpu'
gpu-arch-type: "cpu"
gpu-arch-version: ""

permissions:
id-token: write
contents: read
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
timeout: 120
49 changes: 49 additions & 0 deletions .github/workflows/regression_test_rocm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
name: Run Regression Tests on ROCm

on:
push:
branches:
- main
tags:
- ciflow/rocm/*

concurrency:
group: regression_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
test-nightly:
strategy:
fail-fast: false
matrix:
include:
- name: ROCM Nightly
runs-on: linux.rocm.gpu.torchao
torch-spec: '--pre torch==2.7.0.dev20250122 --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
gpu-arch-type: "rocm"
gpu-arch-version: "6.3"

permissions:
id-token: write
contents: read
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
timeout: 120
no-sudo: ${{ matrix.gpu-arch-type == 'rocm' }}
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
submodules: recursive
script: |
conda create -n venv python=3.9 -y
conda activate venv
python -m pip install --upgrade pip
pip install ${{ matrix.torch-spec }}
pip install -r dev-requirements.txt
pip install .
export CONDA=$(dirname $(dirname $(which conda)))
export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
pytest test --verbose -s