
Commit 2272a3f

Merge branch 'main' into rocm_enablement_staging
2 parents: 4148828 + 9bcd73b

File tree: 141 files changed, +1584 / -3788 lines


.github/workflows/dashboard_perf_test.yml

Lines changed: 5 additions & 5 deletions
@@ -42,19 +42,19 @@ jobs:
           mkdir -p ${{ runner.temp }}/benchmark-results

           # llama3 - compile baseline
-          ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
+          ${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json

           # llama3 - autoquant
-          ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json
+          ${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json

           # skipping SAM because of https://hud.pytorch.org/pr/pytorch/ao/1407
           # # SAM
           # ${CONDA_RUN} pip install git+https://github.com/pytorch-labs/segment-anything-fast.git@main
           # # SAM compile baselilne
-          # ${CONDA_RUN} sh torchao/_models/sam/setup.sh
-          # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
+          # ${CONDA_RUN} sh benchmarks/_models/sam/setup.sh
+          # ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json

-          # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json
+          # ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json

           # SAM 2.1
           # ${CONDA_RUN} sh scripts/download_sam2_ckpts.sh ${CHECKPOINT_PATH}/sam2

.github/workflows/torchao_experimental_test.yml

Lines changed: 55 additions & 3 deletions
@@ -11,7 +11,7 @@ on:
     - 'gh/**'

 jobs:
-  test:
+  test-cpu-ops:
     strategy:
       matrix:
         runner: [macos-14]
@@ -53,6 +53,58 @@ jobs:
         run: |
           conda activate venv
           pushd torchao/experimental/ops/tests
-          sh build_and_run_tests.sh
-          rm -rf /tmp/cmake-out
+          # sh build_and_run_tests.sh
+          # rm -rf /tmp/cmake-out
+          popd
+
+  test-mps-ops:
+    strategy:
+      matrix:
+        runner: [macos-m1-stable]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Checkout repo
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Create conda env
+        run: |
+          conda create -yn test-mps-ops-env python=3.11
+      - name: Activate conda env
+        run: |
+          source activate base
+          conda activate test-mps-ops-env
+      - name: Install torch
+        run: |
+          pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu"
+      - name: Print torch version
+        run: |
+          python -c "import torch; print(torch.__version__)"
+      - name: Install requirements
+        run: |
+          pip install cmake
+          pip install parameterized
+          pip install pyyaml
+          pip install numpy
+      - name: Print pip freeze
+        run: |
+          pip freeze
+      - name: Print current directory
+        run: |
+          python -c "import os; print(os.getcwd())"
+      - name: Build ao with experimental mps ops
+        run: |
+          USE_CPP=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 pip install .
+      - name: Run mps tests
+        run: |
+          pushd torchao/experimental/ops/mps/test
+          python test_lowbit.py
+          python test_quantizer.py
           popd

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -375,3 +375,4 @@ checkpoints/

 # Experimental
 torchao/experimental/cmake-out
+torchao/experimental/deps

README.md

Lines changed: 5 additions & 5 deletions
@@ -19,7 +19,7 @@ torchao just works with `torch.compile()` and `FSDP2` over most PyTorch models o

 ### Post Training Quantization

-Quantizing and Sparsifying your models is a 1 liner that should work on any model with an `nn.Linear` including your favorite HuggingFace model. You can find a more comprehensive usage instructions [here](torchao/quantization/), sparsity [here](/torchao/_models/sam/README.md) and a HuggingFace inference example [here](scripts/hf_eval.py)
+Quantizing and Sparsifying your models is a 1 liner that should work on any model with an `nn.Linear` including your favorite HuggingFace model. You can find a more comprehensive usage instructions [here](torchao/quantization/), sparsity [here](/benchmarks/_models/sam/README.md) and a HuggingFace inference example [here](scripts/hf_eval.py)

 For inference, we have the option of
 1. Quantize only the weights: works best for memory bound models
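
For context, the "1 liner" mentioned in the hunk above refers to torchao's `quantize_` API. Below is a minimal sketch of that usage, assuming the `quantize_` and `int4_weight_only` entry points exported by `torchao.quantization`; the toy model is illustrative and none of this code is part of the commit's diff.

```python
# Minimal sketch of one-liner weight-only quantization with torchao.
# Assumes quantize_ and int4_weight_only are importable from torchao.quantization.
import torch
import torch.nn as nn
from torchao.quantization import quantize_, int4_weight_only

# Any module containing nn.Linear layers works, including HuggingFace models.
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.to(device="cuda", dtype=torch.bfloat16)

# Replace the Linear weights with int4 weight-only quantized tensors in place.
quantize_(model, int4_weight_only())

# The quantized model is used exactly like the original one.
out = model(torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16))
```

`int4_weight_only` is just one of the available configs; the quantization docs linked in the hunk above describe the full set of options.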
@@ -52,7 +52,7 @@ We also provide a developer facing API so you can implement your own quantizatio

 We've added kv cache quantization and other features in order to enable long context length (and necessarily memory efficient) inference.

-In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](torchao/_models/llama/README.md)
+In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](benchmarks/_models/llama/README.md)

 ## Training

@@ -159,20 +159,20 @@ Things we're excited about but need more time to cook in the oven

 `torchao` makes liberal use of several new features in Pytorch, it's recommended to use it with the current nightly or latest stable version of PyTorch.

-Stable release from Pypi which will default to CUDA 12.1
+Stable release from Pypi which will default to CUDA 12.4

 ```Shell
 pip install torchao
 ```

 Stable Release from the PyTorch index
 ```Shell
-pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
+pip install torchao --extra-index-url https://download.pytorch.org/whl/cu124 # full options are cpu/cu118/cu124/cu126
 ```

 Nightly Release
 ```Shell
-pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
+pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 # full options are cpu/cu118/cu126/cu128
 ```

 For *most* developers you probably want to skip building custom C++/CUDA extensions for faster iteration
5 files renamed without changes.

torchao/_models/llama/README.md renamed to benchmarks/_models/llama/README.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ and follow the steps to gain access.
 Then from the torchao root directory use `huggingface-cli login` and follow the steps to login, then `sh ./scripts/prepare.sh` to
 download and convert the model weights

-once done you can execute benchmarks from the torchao/_models/llama dir with `sh benchmarks.sh`. You can perform and benchmarking or evaluation
+once done you can execute benchmarks from the benchmarks/_models/llama dir with `sh benchmarks.sh`. You can perform and benchmarking or evaluation
 directly using `generate.py` or `eval.py`.

 ## KV Cache Quantization - Memory Efficient Inference
