pytorch
diff --git a/‎.github/pytorch-probot.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/pytorch-probot.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/dashboard_perf_test.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/dashboard_perf_test.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/doc_build.yml‎
Lines changed: 6 additions & 6 deletions b/‎.github/workflows/doc_build.yml‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎.github/workflows/ruff_linter.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/ruff_linter.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/run_tutorials.yml‎
Lines changed: 33 additions & 0 deletions b/‎.github/workflows/run_tutorials.yml‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎.github/workflows/trymerge.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/trymerge.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 23 additions & 12 deletions b/‎README.md‎
Lines changed: 23 additions & 12 deletions
diff --git a/‎benchmarks/float8/profile_linear_float8.py‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/float8/profile_linear_float8.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev-requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎dev-requirements.txt‎
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 mergebot: True
 ciflow_push_tags:
 - ciflow/benchmark
+- ciflow/tutorials
@@ -16,7 +16,7 @@ jobs:
         torch-spec:
           - '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Setup miniconda
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
@@ -55,7 +55,7 @@ jobs:
           # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
 
           # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
-          
+
           # SAM 2.1
           # ${CONDA_RUN} sh scripts/download_sam2_ckpts.sh ${CHECKPOINT_PATH}/sam2
           # cd examples/sam2_amg_server
 
@@ -28,7 +28,7 @@ jobs:
         python-version: ['3.11']
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Setup conda env
         uses: conda-incubator/setup-miniconda@v2
         with:
@@ -50,7 +50,7 @@ jobs:
         run: |
           cd docs
           make html
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: Doc-Build
           path: docs/build/html/
@@ -61,9 +61,9 @@ jobs:
     if: ${{ github.event_name == 'pull_request' }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Download artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: Doc-Build
           path: docs
@@ -86,12 +86,12 @@ jobs:
     if: github.repository == 'pytorch/ao' && github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/v') || github.event_name == 'workflow_dispatch')
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: gh-pages
           persist-credentials: true
       - name: Download artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: Doc-Build
           path: docs
 
@@ -34,7 +34,7 @@ jobs:
         PR_NUMBER=$(echo $PR_URL | grep -oE '[0-9]+$')
         echo "PR_NUMBER=$PR_NUMBER" >> $GITHUB_ENV
 
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       if: github.event_name == 'workflow_dispatch'
       with:
         fetch-depth: 0
@@ -47,7 +47,7 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       if: github.event_name != 'workflow_dispatch'
       with:
         fetch-depth: 0
 
@@ -0,0 +1,33 @@
+name: Run tutorials  # CI workflow that installs the package and runs tutorials/run_all.sh
+
+on:
+  push:
+    tags:
+      - ciflow/tutorials/*  # runs when a tag matching this pattern is pushed
+  workflow_dispatch:  # also allows the workflow to be started manually
+
+jobs:
+  run_tutorials:
+    runs-on: linux.aws.a100  # NOTE(review): presumably an A100 GPU runner label; confirm with infra
+    strategy:
+      matrix:
+        torch-spec:  # pip arguments substituted into the torch install command in the last step
+          - '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: "3.9"  # quoted so YAML keeps the version as a string
+
+      - name: Run tutorials
+        shell: bash
+        run: |
+          set -eux
+          ${CONDA_RUN} python -m pip install --upgrade pip
+          ${CONDA_RUN} pip install ${{ matrix.torch-spec }}
+          ${CONDA_RUN} pip install -r dev-requirements.txt
+          ${CONDA_RUN} pip install .
+          cd tutorials
+          ${CONDA_RUN} bash run_all.sh
@@ -16,7 +16,7 @@ jobs:
     steps:
       - name: Checkout repo
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
           token: ${{ secrets.PYTORCH_MERGEBOT_TOKEN }}
 
@@ -54,27 +54,38 @@ We've added kv cache quantization and other features in order to enable long con
 
 In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can run Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](torchao/_models/llama/README.md).
 
+## Training
+
 ### Quantization Aware Training
 
-Post-training quantization can result in a fast and compact model, but may also lead to accuracy degradation. We recommend exploring Quantization Aware Training (QAT) to overcome this limitation. In collaboration with Torchtune, we've developed a QAT recipe that demonstrates significant accuracy improvements over traditional PTQ, recovering **96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext** for Llama3 compared to post-training quantization (PTQ). And we've provided a full recipe [here](https://pytorch.org/blog/quantization-aware-training/)
+Post-training quantization can result in a fast and compact model, but may also lead to accuracy degradation. We recommend exploring Quantization Aware Training (QAT) to overcome this limitation. In collaboration with Torchtune, we've developed a QAT recipe that demonstrates significant accuracy improvements over traditional PTQ, recovering **96% of the accuracy degradation on hellaswag and 68% of the perplexity degradation on wikitext** for Llama3 compared to post-training quantization (PTQ). And we've provided a full recipe [here](https://pytorch.org/blog/quantization-aware-training/). For more details, please see the [QAT README](./torchao/quantization/qat/README.md).
 
 ```python
-from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
-
-qat_quantizer = Int8DynActInt4WeightQATQuantizer()
+from torchao.quantization import (
+    quantize_,
+    int8_dynamic_activation_int4_weight,
+)
+from torchao.quantization.qat import (
+    FakeQuantizeConfig,
+    from_intx_quantization_aware_training,
+    intx_quantization_aware_training,
+)
 
-# Insert "fake quantize" operations into linear layers.
-# These operations simulate quantization numerics
-model = qat_quantizer.prepare(model)
+# Insert fake quantization
+activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
+weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
+quantize_(
+    my_model,
+    intx_quantization_aware_training(activation_config, weight_config),
+)
 
-# Run Training...
+# Run training... (not shown)
 
-# Convert fake quantize to actual quantize operations
-model = qat_quantizer.convert(model)
+# Convert fake quantization to actual quantized operations
+quantize_(my_model, from_intx_quantization_aware_training())
+quantize_(my_model, int8_dynamic_activation_int4_weight(group_size=32))
 ```
 
-## Training
-
 ### Float8
 
 [torchao.float8](torchao/float8) implements training recipes with the scaled float8 dtypes, as laid out in https://arxiv.org/abs/2209.05433.
 
@@ -355,7 +355,7 @@ def main(
             1, 2048, 4096, device=device, dtype=ref_dtype
         ).requires_grad_()
     else:
-        M, K, N = 4096, 4096, 4096
+        M, K, N = 2048, 4096, 8192
         m_ref = torch.nn.Sequential(
             torch.nn.Linear(K, N, bias=False),
         )
 
@@ -21,6 +21,7 @@ lm_eval
 diskcache
 pycocotools
 tqdm
+importlib_metadata
 
 # Custom CUDA Extensions
 ninja
Original file line number	Diff line number	Diff line change
`@@ -355,7 +355,7 @@ def main(`
`355`	`355`	`1, 2048, 4096, device=device, dtype=ref_dtype`
`356`	`356`	`).requires_grad_()`
`357`	`357`	`else:`
`358`		`- M, K, N = 4096, 4096, 4096`
	`358`	`+ M, K, N = 2048, 4096, 8192`
`359`	`359`	`m_ref = torch.nn.Sequential(`
`360`	`360`	`torch.nn.Linear(K, N, bias=False),`
`361`	`361`	`)`