From a1d678747e9f4378218397868dbc2868ddbd8224 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Tue, 25 Feb 2025 21:47:29 -0800
Subject: [PATCH 1/7] Move _models to benchmarks

---
 {torchao => benchmarks}/_models/README.md | 0
 {torchao => benchmarks}/_models/__init__.py | 0
 {torchao => benchmarks}/_models/llama/.gitignore | 0
 {torchao => benchmarks}/_models/llama/README.md | 0
 {torchao => benchmarks}/_models/llama/__init__.py | 0
 .../_models/llama/benchmark_results.txt | 0
 {torchao => benchmarks}/_models/llama/benchmarks.sh | 0
 .../_models/llama/demo_summarize.sh | 0
 {torchao => benchmarks}/_models/llama/eval.py | 4 ++--
 {torchao => benchmarks}/_models/llama/evals.sh | 0
 {torchao => benchmarks}/_models/llama/generate.py | 6 +++---
 {torchao => benchmarks}/_models/llama/model.py | 0
 .../_models/llama/perf_profile.py | 0
 {torchao => benchmarks}/_models/llama/tokenizer.py | 0
 {torchao => benchmarks}/_models/sam/.gitignore | 0
 {torchao => benchmarks}/_models/sam/README.md | 0
 {torchao => benchmarks}/_models/sam/benchmark.sh | 0
 {torchao => benchmarks}/_models/sam/data.py | 0
 {torchao => benchmarks}/_models/sam/eval_combo.py | 0
 .../_models/sam/flash_4_configs.p | Bin
 {torchao => benchmarks}/_models/sam/metrics.py | 0
 {torchao => benchmarks}/_models/sam/results.csv | 0
 {torchao => benchmarks}/_models/sam/setup.sh | 0
 {torchao => benchmarks}/_models/sam2/__init__.py | 0
 .../_models/sam2/automatic_mask_generator.py | 0
 {torchao => benchmarks}/_models/sam2/build_sam.py | 0
 .../sam2/configs/sam2.1/sam2.1_hiera_b+.yaml | 0
 .../_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml | 0
 .../_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml | 0
 .../_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml | 0
 .../sam2.1_hiera_b+_MOSE_finetune.yaml | 0
 .../_models/sam2/configs/sam2/sam2_hiera_b+.yaml | 0
 .../_models/sam2/configs/sam2/sam2_hiera_l.yaml | 0
 .../_models/sam2/configs/sam2/sam2_hiera_s.yaml | 0
 .../_models/sam2/configs/sam2/sam2_hiera_t.yaml | 0
 .../_models/sam2/csrc/connected_components.cu | 0
 {torchao => benchmarks}/_models/sam2/map_tensor.py | 0
 .../_models/sam2/modeling/__init__.py | 0
 .../_models/sam2/modeling/backbones/__init__.py | 0
 .../_models/sam2/modeling/backbones/hieradet.py | 0
 .../sam2/modeling/backbones/image_encoder.py | 0
 .../_models/sam2/modeling/backbones/utils.py | 0
 .../_models/sam2/modeling/memory_attention.py | 0
 .../_models/sam2/modeling/memory_encoder.py | 0
 .../_models/sam2/modeling/position_encoding.py | 0
 .../_models/sam2/modeling/sam/__init__.py | 0
 .../_models/sam2/modeling/sam/mask_decoder.py | 0
 .../_models/sam2/modeling/sam/prompt_encoder.py | 0
 .../_models/sam2/modeling/sam/transformer.py | 0
 .../_models/sam2/modeling/sam2_base.py | 0
 .../_models/sam2/modeling/sam2_utils.py | 0
 .../_models/sam2/sam2_hiera_b+.yaml | 0
 .../_models/sam2/sam2_hiera_l.yaml | 0
 .../_models/sam2/sam2_hiera_s.yaml | 0
 .../_models/sam2/sam2_hiera_t.yaml | 0
 .../_models/sam2/sam2_image_predictor.py | 0
 .../_models/sam2/sam2_video_predictor.py | 0
 .../_models/sam2/utils/__init__.py | 0
 {torchao => benchmarks}/_models/sam2/utils/amg.py | 0
 {torchao => benchmarks}/_models/sam2/utils/misc.py | 0
 .../_models/sam2/utils/transforms.py | 0
 {torchao => benchmarks}/_models/utils.py | 0
 test/quantization/test_quant_api.py | 12 ++++++------
 torchao/{_models => }/_eval.py | 0
 torchao/quantization/README.md | 2 +-
 65 files changed, 12 insertions(+), 12 deletions(-)
 rename {torchao => benchmarks}/_models/README.md (100%)
 rename {torchao => benchmarks}/_models/__init__.py (100%)
 rename {torchao => benchmarks}/_models/llama/.gitignore (100%)
 rename {torchao => benchmarks}/_models/llama/README.md (100%)
 rename {torchao => benchmarks}/_models/llama/__init__.py (100%)
 rename {torchao => benchmarks}/_models/llama/benchmark_results.txt (100%)
 rename {torchao => benchmarks}/_models/llama/benchmarks.sh (100%)
 rename {torchao => benchmarks}/_models/llama/demo_summarize.sh (100%)
 rename {torchao => benchmarks}/_models/llama/eval.py (98%)
 rename {torchao => benchmarks}/_models/llama/evals.sh (100%)
 rename {torchao => benchmarks}/_models/llama/generate.py (99%)
 rename {torchao => benchmarks}/_models/llama/model.py (100%)
 rename {torchao => benchmarks}/_models/llama/perf_profile.py (100%)
 rename {torchao => benchmarks}/_models/llama/tokenizer.py (100%)
 rename {torchao => benchmarks}/_models/sam/.gitignore (100%)
 rename {torchao => benchmarks}/_models/sam/README.md (100%)
 rename {torchao => benchmarks}/_models/sam/benchmark.sh (100%)
 rename {torchao => benchmarks}/_models/sam/data.py (100%)
 rename {torchao => benchmarks}/_models/sam/eval_combo.py (100%)
 rename {torchao => benchmarks}/_models/sam/flash_4_configs.p (100%)
 rename {torchao => benchmarks}/_models/sam/metrics.py (100%)
 rename {torchao => benchmarks}/_models/sam/results.csv (100%)
 rename {torchao => benchmarks}/_models/sam/setup.sh (100%)
 rename {torchao => benchmarks}/_models/sam2/__init__.py (100%)
 rename {torchao => benchmarks}/_models/sam2/automatic_mask_generator.py (100%)
 rename {torchao => benchmarks}/_models/sam2/build_sam.py (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_b+.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_l.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_s.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_t.yaml (100%)
 rename {torchao => benchmarks}/_models/sam2/csrc/connected_components.cu (100%)
 rename {torchao => benchmarks}/_models/sam2/map_tensor.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/__init__.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/backbones/__init__.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/backbones/hieradet.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/backbones/image_encoder.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/backbones/utils.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/memory_attention.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/memory_encoder.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/position_encoding.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/sam/__init__.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/sam/mask_decoder.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/sam/prompt_encoder.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/sam/transformer.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/sam2_base.py (100%)
 rename {torchao => benchmarks}/_models/sam2/modeling/sam2_utils.py (100%)
rename {torchao => benchmarks}/_models/sam2/sam2_hiera_b+.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_l.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_s.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_t.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_image_predictor.py (100%) rename {torchao => benchmarks}/_models/sam2/sam2_video_predictor.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/__init__.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/amg.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/misc.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/transforms.py (100%) rename {torchao => benchmarks}/_models/utils.py (100%) rename torchao/{_models => }/_eval.py (100%) diff --git a/torchao/_models/README.md b/benchmarks/_models/README.md similarity index 100% rename from torchao/_models/README.md rename to benchmarks/_models/README.md diff --git a/torchao/_models/__init__.py b/benchmarks/_models/__init__.py similarity index 100% rename from torchao/_models/__init__.py rename to benchmarks/_models/__init__.py diff --git a/torchao/_models/llama/.gitignore b/benchmarks/_models/llama/.gitignore similarity index 100% rename from torchao/_models/llama/.gitignore rename to benchmarks/_models/llama/.gitignore diff --git a/torchao/_models/llama/README.md b/benchmarks/_models/llama/README.md similarity index 100% rename from torchao/_models/llama/README.md rename to benchmarks/_models/llama/README.md diff --git a/torchao/_models/llama/__init__.py b/benchmarks/_models/llama/__init__.py similarity index 100% rename from torchao/_models/llama/__init__.py rename to benchmarks/_models/llama/__init__.py diff --git a/torchao/_models/llama/benchmark_results.txt b/benchmarks/_models/llama/benchmark_results.txt similarity index 100% rename from torchao/_models/llama/benchmark_results.txt rename to benchmarks/_models/llama/benchmark_results.txt diff --git a/torchao/_models/llama/benchmarks.sh b/benchmarks/_models/llama/benchmarks.sh similarity index 100% rename from torchao/_models/llama/benchmarks.sh rename to benchmarks/_models/llama/benchmarks.sh diff --git a/torchao/_models/llama/demo_summarize.sh b/benchmarks/_models/llama/demo_summarize.sh similarity index 100% rename from torchao/_models/llama/demo_summarize.sh rename to benchmarks/_models/llama/demo_summarize.sh diff --git a/torchao/_models/llama/eval.py b/benchmarks/_models/llama/eval.py similarity index 98% rename from torchao/_models/llama/eval.py rename to benchmarks/_models/llama/eval.py index 4a67124a08..8507733b5d 100644 --- a/torchao/_models/llama/eval.py +++ b/benchmarks/_models/llama/eval.py @@ -120,7 +120,7 @@ def run_evaluation( quantize_(model, int4_weight_only(layout=MarlinSparseLayout())) if "int4wo" in quantization and "gptq" in quantization: # avoid circular imports - from torchao._models._eval import MultiTensorInputRecorder + from torchao._eval import MultiTensorInputRecorder from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer groupsize = int(quantization.split("-")[-2]) @@ -242,7 +242,7 @@ def run_evaluation( with torch.no_grad(): print("Running evaluation ...") # avoid circular imports - from torchao._models._eval import TransformerEvalWrapper + from torchao._eval import TransformerEvalWrapper TransformerEvalWrapper( model=model.to(device), diff --git a/torchao/_models/llama/evals.sh b/benchmarks/_models/llama/evals.sh similarity index 100% rename from torchao/_models/llama/evals.sh rename to 
benchmarks/_models/llama/evals.sh diff --git a/torchao/_models/llama/generate.py b/benchmarks/_models/llama/generate.py similarity index 99% rename from torchao/_models/llama/generate.py rename to benchmarks/_models/llama/generate.py index 0958a5207c..b6c21792fb 100644 --- a/torchao/_models/llama/generate.py +++ b/benchmarks/_models/llama/generate.py @@ -476,7 +476,7 @@ def ffn_or_attn_only(mod, fqn): filter_fn=lambda x, *args: isinstance(x, torch.nn.Embedding), ) elif quantization.startswith("awq"): - from torchao._models._eval import TransformerEvalWrapper + from torchao._eval import TransformerEvalWrapper from torchao.utils import TORCH_VERSION_AT_LEAST_2_3 if not TORCH_VERSION_AT_LEAST_2_3: @@ -575,7 +575,7 @@ def ffn_or_attn_only(mod, fqn): model, float8_dynamic_activation_float8_weight(granularity=granularity) ) elif "autoquant_v2" in quantization: - from torchao._models._eval import InputRecorder + from torchao._eval import InputRecorder from torchao._models.llama.model import prepare_inputs_for_model from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 @@ -665,7 +665,7 @@ def ffn_or_attn_only(mod, fqn): # do autoquantization model.finalize_autoquant() elif "autoquant" in quantization: - from torchao._models._eval import InputRecorder + from torchao._eval import InputRecorder from torchao._models.llama.model import prepare_inputs_for_model calibration_seq_length = 256 diff --git a/torchao/_models/llama/model.py b/benchmarks/_models/llama/model.py similarity index 100% rename from torchao/_models/llama/model.py rename to benchmarks/_models/llama/model.py diff --git a/torchao/_models/llama/perf_profile.py b/benchmarks/_models/llama/perf_profile.py similarity index 100% rename from torchao/_models/llama/perf_profile.py rename to benchmarks/_models/llama/perf_profile.py diff --git a/torchao/_models/llama/tokenizer.py b/benchmarks/_models/llama/tokenizer.py similarity index 100% rename from torchao/_models/llama/tokenizer.py rename to benchmarks/_models/llama/tokenizer.py diff --git a/torchao/_models/sam/.gitignore b/benchmarks/_models/sam/.gitignore similarity index 100% rename from torchao/_models/sam/.gitignore rename to benchmarks/_models/sam/.gitignore diff --git a/torchao/_models/sam/README.md b/benchmarks/_models/sam/README.md similarity index 100% rename from torchao/_models/sam/README.md rename to benchmarks/_models/sam/README.md diff --git a/torchao/_models/sam/benchmark.sh b/benchmarks/_models/sam/benchmark.sh similarity index 100% rename from torchao/_models/sam/benchmark.sh rename to benchmarks/_models/sam/benchmark.sh diff --git a/torchao/_models/sam/data.py b/benchmarks/_models/sam/data.py similarity index 100% rename from torchao/_models/sam/data.py rename to benchmarks/_models/sam/data.py diff --git a/torchao/_models/sam/eval_combo.py b/benchmarks/_models/sam/eval_combo.py similarity index 100% rename from torchao/_models/sam/eval_combo.py rename to benchmarks/_models/sam/eval_combo.py diff --git a/torchao/_models/sam/flash_4_configs.p b/benchmarks/_models/sam/flash_4_configs.p similarity index 100% rename from torchao/_models/sam/flash_4_configs.p rename to benchmarks/_models/sam/flash_4_configs.p diff --git a/torchao/_models/sam/metrics.py b/benchmarks/_models/sam/metrics.py similarity index 100% rename from torchao/_models/sam/metrics.py rename to benchmarks/_models/sam/metrics.py diff --git a/torchao/_models/sam/results.csv b/benchmarks/_models/sam/results.csv similarity index 100% rename from torchao/_models/sam/results.csv rename to 
benchmarks/_models/sam/results.csv diff --git a/torchao/_models/sam/setup.sh b/benchmarks/_models/sam/setup.sh similarity index 100% rename from torchao/_models/sam/setup.sh rename to benchmarks/_models/sam/setup.sh diff --git a/torchao/_models/sam2/__init__.py b/benchmarks/_models/sam2/__init__.py similarity index 100% rename from torchao/_models/sam2/__init__.py rename to benchmarks/_models/sam2/__init__.py diff --git a/torchao/_models/sam2/automatic_mask_generator.py b/benchmarks/_models/sam2/automatic_mask_generator.py similarity index 100% rename from torchao/_models/sam2/automatic_mask_generator.py rename to benchmarks/_models/sam2/automatic_mask_generator.py diff --git a/torchao/_models/sam2/build_sam.py b/benchmarks/_models/sam2/build_sam.py similarity index 100% rename from torchao/_models/sam2/build_sam.py rename to benchmarks/_models/sam2/build_sam.py diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml diff --git a/torchao/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml b/benchmarks/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml rename to benchmarks/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml diff --git 
a/torchao/_models/sam2/csrc/connected_components.cu b/benchmarks/_models/sam2/csrc/connected_components.cu similarity index 100% rename from torchao/_models/sam2/csrc/connected_components.cu rename to benchmarks/_models/sam2/csrc/connected_components.cu diff --git a/torchao/_models/sam2/map_tensor.py b/benchmarks/_models/sam2/map_tensor.py similarity index 100% rename from torchao/_models/sam2/map_tensor.py rename to benchmarks/_models/sam2/map_tensor.py diff --git a/torchao/_models/sam2/modeling/__init__.py b/benchmarks/_models/sam2/modeling/__init__.py similarity index 100% rename from torchao/_models/sam2/modeling/__init__.py rename to benchmarks/_models/sam2/modeling/__init__.py diff --git a/torchao/_models/sam2/modeling/backbones/__init__.py b/benchmarks/_models/sam2/modeling/backbones/__init__.py similarity index 100% rename from torchao/_models/sam2/modeling/backbones/__init__.py rename to benchmarks/_models/sam2/modeling/backbones/__init__.py diff --git a/torchao/_models/sam2/modeling/backbones/hieradet.py b/benchmarks/_models/sam2/modeling/backbones/hieradet.py similarity index 100% rename from torchao/_models/sam2/modeling/backbones/hieradet.py rename to benchmarks/_models/sam2/modeling/backbones/hieradet.py diff --git a/torchao/_models/sam2/modeling/backbones/image_encoder.py b/benchmarks/_models/sam2/modeling/backbones/image_encoder.py similarity index 100% rename from torchao/_models/sam2/modeling/backbones/image_encoder.py rename to benchmarks/_models/sam2/modeling/backbones/image_encoder.py diff --git a/torchao/_models/sam2/modeling/backbones/utils.py b/benchmarks/_models/sam2/modeling/backbones/utils.py similarity index 100% rename from torchao/_models/sam2/modeling/backbones/utils.py rename to benchmarks/_models/sam2/modeling/backbones/utils.py diff --git a/torchao/_models/sam2/modeling/memory_attention.py b/benchmarks/_models/sam2/modeling/memory_attention.py similarity index 100% rename from torchao/_models/sam2/modeling/memory_attention.py rename to benchmarks/_models/sam2/modeling/memory_attention.py diff --git a/torchao/_models/sam2/modeling/memory_encoder.py b/benchmarks/_models/sam2/modeling/memory_encoder.py similarity index 100% rename from torchao/_models/sam2/modeling/memory_encoder.py rename to benchmarks/_models/sam2/modeling/memory_encoder.py diff --git a/torchao/_models/sam2/modeling/position_encoding.py b/benchmarks/_models/sam2/modeling/position_encoding.py similarity index 100% rename from torchao/_models/sam2/modeling/position_encoding.py rename to benchmarks/_models/sam2/modeling/position_encoding.py diff --git a/torchao/_models/sam2/modeling/sam/__init__.py b/benchmarks/_models/sam2/modeling/sam/__init__.py similarity index 100% rename from torchao/_models/sam2/modeling/sam/__init__.py rename to benchmarks/_models/sam2/modeling/sam/__init__.py diff --git a/torchao/_models/sam2/modeling/sam/mask_decoder.py b/benchmarks/_models/sam2/modeling/sam/mask_decoder.py similarity index 100% rename from torchao/_models/sam2/modeling/sam/mask_decoder.py rename to benchmarks/_models/sam2/modeling/sam/mask_decoder.py diff --git a/torchao/_models/sam2/modeling/sam/prompt_encoder.py b/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py similarity index 100% rename from torchao/_models/sam2/modeling/sam/prompt_encoder.py rename to benchmarks/_models/sam2/modeling/sam/prompt_encoder.py diff --git a/torchao/_models/sam2/modeling/sam/transformer.py b/benchmarks/_models/sam2/modeling/sam/transformer.py similarity index 100% rename from 
torchao/_models/sam2/modeling/sam/transformer.py rename to benchmarks/_models/sam2/modeling/sam/transformer.py diff --git a/torchao/_models/sam2/modeling/sam2_base.py b/benchmarks/_models/sam2/modeling/sam2_base.py similarity index 100% rename from torchao/_models/sam2/modeling/sam2_base.py rename to benchmarks/_models/sam2/modeling/sam2_base.py diff --git a/torchao/_models/sam2/modeling/sam2_utils.py b/benchmarks/_models/sam2/modeling/sam2_utils.py similarity index 100% rename from torchao/_models/sam2/modeling/sam2_utils.py rename to benchmarks/_models/sam2/modeling/sam2_utils.py diff --git a/torchao/_models/sam2/sam2_hiera_b+.yaml b/benchmarks/_models/sam2/sam2_hiera_b+.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_b+.yaml rename to benchmarks/_models/sam2/sam2_hiera_b+.yaml diff --git a/torchao/_models/sam2/sam2_hiera_l.yaml b/benchmarks/_models/sam2/sam2_hiera_l.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_l.yaml rename to benchmarks/_models/sam2/sam2_hiera_l.yaml diff --git a/torchao/_models/sam2/sam2_hiera_s.yaml b/benchmarks/_models/sam2/sam2_hiera_s.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_s.yaml rename to benchmarks/_models/sam2/sam2_hiera_s.yaml diff --git a/torchao/_models/sam2/sam2_hiera_t.yaml b/benchmarks/_models/sam2/sam2_hiera_t.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_t.yaml rename to benchmarks/_models/sam2/sam2_hiera_t.yaml diff --git a/torchao/_models/sam2/sam2_image_predictor.py b/benchmarks/_models/sam2/sam2_image_predictor.py similarity index 100% rename from torchao/_models/sam2/sam2_image_predictor.py rename to benchmarks/_models/sam2/sam2_image_predictor.py diff --git a/torchao/_models/sam2/sam2_video_predictor.py b/benchmarks/_models/sam2/sam2_video_predictor.py similarity index 100% rename from torchao/_models/sam2/sam2_video_predictor.py rename to benchmarks/_models/sam2/sam2_video_predictor.py diff --git a/torchao/_models/sam2/utils/__init__.py b/benchmarks/_models/sam2/utils/__init__.py similarity index 100% rename from torchao/_models/sam2/utils/__init__.py rename to benchmarks/_models/sam2/utils/__init__.py diff --git a/torchao/_models/sam2/utils/amg.py b/benchmarks/_models/sam2/utils/amg.py similarity index 100% rename from torchao/_models/sam2/utils/amg.py rename to benchmarks/_models/sam2/utils/amg.py diff --git a/torchao/_models/sam2/utils/misc.py b/benchmarks/_models/sam2/utils/misc.py similarity index 100% rename from torchao/_models/sam2/utils/misc.py rename to benchmarks/_models/sam2/utils/misc.py diff --git a/torchao/_models/sam2/utils/transforms.py b/benchmarks/_models/sam2/utils/transforms.py similarity index 100% rename from torchao/_models/sam2/utils/transforms.py rename to benchmarks/_models/sam2/utils/transforms.py diff --git a/torchao/_models/utils.py b/benchmarks/_models/utils.py similarity index 100% rename from torchao/_models/utils.py rename to benchmarks/_models/utils.py diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 4af429940f..9079fbc907 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -278,7 +278,7 @@ def test_8da4w_quantizer(self): # https://github.com/pytorch-labs/gpt-fast/blob/6253c6bb054e658d67566150f87329b87815ae63/scripts/convert_hf_checkpoint.py @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_8da4w_gptq_quantizer(self): - from torchao._models._eval import InputRecorder, TransformerEvalWrapper + from 
torchao._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer # should be similar to TorchCompileDynamicQuantizer @@ -348,7 +348,7 @@ def test_8da4w_gptq_quantizer(self): not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch verion is 2.4 or lower" ) def test_8da4w_quantizer_eval(self): - from torchao._models._eval import TransformerEvalWrapper + from torchao._eval import TransformerEvalWrapper from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer precision = torch.bfloat16 @@ -384,7 +384,7 @@ def test_8da4w_quantizer_eval(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_gptq_quantizer_int4_weight_only(self): - from torchao._models._eval import ( + from torchao._eval import ( MultiTensorInputRecorder, TransformerEvalWrapper, ) @@ -454,7 +454,7 @@ def test_gptq_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_quantizer_int4_weight_only(self): - from torchao._models._eval import TransformerEvalWrapper + from torchao._eval import TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer precision = torch.bfloat16 @@ -492,7 +492,7 @@ def test_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper(self): - from torchao._models._eval import TransformerEvalWrapper + from torchao._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda" @@ -525,7 +525,7 @@ def test_eval_wrapper(self): # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper_llama3(self): - from torchao._models._eval import TransformerEvalWrapper + from torchao._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda" diff --git a/torchao/_models/_eval.py b/torchao/_eval.py similarity index 100% rename from torchao/_models/_eval.py rename to torchao/_eval.py diff --git a/torchao/quantization/README.md b/torchao/quantization/README.md index d2b6e0c016..ed98cb1c03 100644 --- a/torchao/quantization/README.md +++ b/torchao/quantization/README.md @@ -396,7 +396,7 @@ The `quantize_` and `autoquant` apis now automatically use our recommended induc ## (To be moved to prototype) A16W4 WeightOnly Quantization with GPTQ ```python -from torchao._models._eval import InputRecorder, TransformerEvalWrapper +from torchao._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 device = "cuda" From 57d9da984bfa5ae2a28c40c5985584f6fbf84dba Mon Sep 17 00:00:00 2001 From: jainapurva Date: Tue, 25 Feb 2025 23:10:10 -0800 Subject: [PATCH 2/7] Update references --- benchmarks/_models/llama/eval.py | 4 +-- benchmarks/_models/llama/generate.py | 10 +++---- benchmarks/_models/llama/perf_profile.py | 4 +-- benchmarks/_models/sam/eval_combo.py | 2 +- benchmarks/_models/sam2/__init__.py | 2 +- .../_models/sam2/automatic_mask_generator.py | 8 +++--- benchmarks/_models/sam2/build_sam.py | 4 +-- .../sam2/configs/sam2.1/sam2.1_hiera_b+.yaml | 28 +++++++++---------- .../sam2/configs/sam2.1/sam2.1_hiera_l.yaml | 28 +++++++++---------- .../sam2/configs/sam2.1/sam2.1_hiera_s.yaml | 28 +++++++++---------- .../sam2/configs/sam2.1/sam2.1_hiera_t.yaml | 28 +++++++++---------- .../sam2/configs/sam2/sam2_hiera_b+.yaml | 28 +++++++++---------- .../sam2/configs/sam2/sam2_hiera_l.yaml | 28 
+++++++++---------- .../sam2/configs/sam2/sam2_hiera_s.yaml | 28 +++++++++---------- .../sam2/configs/sam2/sam2_hiera_t.yaml | 28 +++++++++---------- .../sam2/modeling/backbones/hieradet.py | 4 +-- .../sam2/modeling/backbones/image_encoder.py | 2 +- .../_models/sam2/modeling/memory_attention.py | 4 +-- .../_models/sam2/modeling/memory_encoder.py | 6 +++- .../_models/sam2/modeling/sam/mask_decoder.py | 2 +- .../sam2/modeling/sam/prompt_encoder.py | 4 +-- .../_models/sam2/modeling/sam/transformer.py | 6 ++-- benchmarks/_models/sam2/modeling/sam2_base.py | 8 +++--- .../_models/sam2/modeling/sam2_utils.py | 2 +- .../_models/sam2/sam2_image_predictor.py | 6 ++-- .../_models/sam2/sam2_video_predictor.py | 6 ++-- benchmarks/_models/sam2/utils/transforms.py | 4 +-- .../quantized_training/pretrain_llama2.py | 4 +-- examples/sam2_amg_server/annotate_with_rle.py | 2 +- examples/sam2_amg_server/cli.py | 6 ++-- examples/sam2_amg_server/cli_on_modal.py | 8 +++--- examples/sam2_amg_server/compare_rle_lists.py | 2 +- .../sam2_amg_server/compile_export_utils.py | 10 +++---- examples/sam2_amg_server/generate_data.py | 10 +++---- examples/sam2_amg_server/server.py | 8 +++--- .../sam2_vos_example/compile_export_utils.py | 2 +- examples/sam2_vos_example/video_profile.py | 4 +-- scripts/convert_hf_checkpoint.py | 2 +- test/prototype/test_spinquant.py | 2 +- test/quantization/test_gptq_mt.py | 4 +-- test/quantization/test_quant_api.py | 4 +-- test/test_ao_models.py | 2 +- .../scripts/BO_acc_throughput.py | 8 +++--- torchao/prototype/spinquant/spinquant.py | 2 +- torchao/quantization/GPTQ.py | 2 +- 45 files changed, 199 insertions(+), 195 deletions(-) diff --git a/benchmarks/_models/llama/eval.py b/benchmarks/_models/llama/eval.py index 8507733b5d..5c26329114 100644 --- a/benchmarks/_models/llama/eval.py +++ b/benchmarks/_models/llama/eval.py @@ -15,7 +15,7 @@ from tokenizer import get_tokenizer import torchao -from torchao._models.llama.model import prepare_inputs_for_model +from benchmarks._models.llama.model import prepare_inputs_for_model from torchao.quantization import ( PerRow, PerTensor, @@ -172,7 +172,7 @@ def run_evaluation( if "autoround" in quantization: from transformers import AutoTokenizer - from torchao._models.llama.model import TransformerBlock + from benchmarks._models.llama.model import TransformerBlock from torchao.prototype.autoround.autoround_llm import ( quantize_model_with_autoround_, ) diff --git a/benchmarks/_models/llama/generate.py b/benchmarks/_models/llama/generate.py index b6c21792fb..985bc1235b 100644 --- a/benchmarks/_models/llama/generate.py +++ b/benchmarks/_models/llama/generate.py @@ -14,7 +14,7 @@ import torch._inductor.config import torchao -from torchao._models.utils import ( +from benchmarks._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, @@ -72,8 +72,8 @@ def device_sync(device): wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -from torchao._models.llama.model import Transformer, prepare_inputs_for_model -from torchao._models.llama.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer def multinomial_sample_one_no_sync( @@ -575,8 +575,8 @@ def ffn_or_attn_only(mod, fqn): model, float8_dynamic_activation_float8_weight(granularity=granularity) ) elif "autoquant_v2" in quantization: + from benchmarks._models.llama.model import prepare_inputs_for_model from torchao._eval import InputRecorder - 
from torchao._models.llama.model import prepare_inputs_for_model from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 calibration_seq_length = 256 @@ -665,8 +665,8 @@ def ffn_or_attn_only(mod, fqn): # do autoquantization model.finalize_autoquant() elif "autoquant" in quantization: + from benchmarks._models.llama.model import prepare_inputs_for_model from torchao._eval import InputRecorder - from torchao._models.llama.model import prepare_inputs_for_model calibration_seq_length = 256 inputs = ( diff --git a/benchmarks/_models/llama/perf_profile.py b/benchmarks/_models/llama/perf_profile.py index f613982221..d1e9cab83c 100644 --- a/benchmarks/_models/llama/perf_profile.py +++ b/benchmarks/_models/llama/perf_profile.py @@ -116,8 +116,8 @@ import torch from torch.nn.attention import SDPBackend -from torchao._models.llama.model import Transformer -from torchao._models.llama.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer +from benchmarks._models.llama.tokenizer import get_tokenizer from torchao.prototype.profiler import ( CUDADeviceSpec, TransformerPerformanceCounter, diff --git a/benchmarks/_models/sam/eval_combo.py b/benchmarks/_models/sam/eval_combo.py index 781c10c935..7f17df4f4f 100644 --- a/benchmarks/_models/sam/eval_combo.py +++ b/benchmarks/_models/sam/eval_combo.py @@ -9,7 +9,7 @@ from metrics import calculate_miou, create_result_entry import torchao -from torchao._models.utils import ( +from benchmarks._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, diff --git a/benchmarks/_models/sam2/__init__.py b/benchmarks/_models/sam2/__init__.py index 0dc11c2fde..f49e12ba4e 100644 --- a/benchmarks/_models/sam2/__init__.py +++ b/benchmarks/_models/sam2/__init__.py @@ -8,4 +8,4 @@ from hydra.core.global_hydra import GlobalHydra if not GlobalHydra.instance().is_initialized(): - initialize_config_module("torchao._models.sam2", version_base="1.2") + initialize_config_module("benchmarks._models.sam2", version_base="1.2") diff --git a/benchmarks/_models/sam2/automatic_mask_generator.py b/benchmarks/_models/sam2/automatic_mask_generator.py index 6f4f1d3e7b..4e82f3ef04 100644 --- a/benchmarks/_models/sam2/automatic_mask_generator.py +++ b/benchmarks/_models/sam2/automatic_mask_generator.py @@ -11,9 +11,9 @@ import torch from torchvision.ops.boxes import batched_nms, box_area # type: ignore -from torchao._models.sam2.modeling.sam2_base import SAM2Base -from torchao._models.sam2.sam2_image_predictor import SAM2ImagePredictor -from torchao._models.sam2.utils.amg import ( +from benchmarks._models.sam2.modeling.sam2_base import SAM2Base +from benchmarks._models.sam2.sam2_image_predictor import SAM2ImagePredictor +from benchmarks._models.sam2.utils.amg import ( MaskData, _mask_to_rle_pytorch_2_0, _mask_to_rle_pytorch_2_1, @@ -33,7 +33,7 @@ uncrop_masks, uncrop_points, ) -from torchao._models.sam2.utils.misc import ( +from benchmarks._models.sam2.utils.misc import ( crop_image, get_image_size, ) diff --git a/benchmarks/_models/sam2/build_sam.py b/benchmarks/_models/sam2/build_sam.py index ad0d1fe41c..eea26ccee4 100644 --- a/benchmarks/_models/sam2/build_sam.py +++ b/benchmarks/_models/sam2/build_sam.py @@ -12,7 +12,7 @@ from hydra.utils import instantiate from omegaconf import OmegaConf -from torchao._models import sam2 +from benchmarks._models import sam2 # Check if the user is running Python from the parent directory of the sam2 repo # (i.e. 
the directory where this repo is cloned into) -- this is not supported since @@ -106,7 +106,7 @@ def build_sam2_video_predictor( **kwargs, ): hydra_overrides = [ - "++model._target_=torchao._models.sam2.sam2_video_predictor.SAM2VideoPredictor", + "++model._target_=benchmarks._models.sam2.sam2_video_predictor.SAM2VideoPredictor", ] if apply_postprocessing: hydra_overrides_extra = hydra_overrides_extra.copy() diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml index 42cd897c67..1742a20e95 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml @@ -2,18 +2,18 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 112 num_heads: 2 neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -24,17 +24,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -45,7 +45,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -57,23 +57,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: 
benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml index ba9dafd489..17bf334745 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml @@ -2,12 +2,12 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 144 num_heads: 2 stages: [2, 6, 36, 4] @@ -15,9 +15,9 @@ model: window_pos_embed_bkg_spatial_size: [7, 7] window_spec: [8, 4, 16, 8] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -28,17 +28,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -49,7 +49,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -61,23 +61,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml 
b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml index 898898b158..7b5f000254 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 11, 2] global_att_blocks: [7, 10, 13] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml index c6318f843b..84c6e92e9c 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml @@ 
-2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 7, 2] global_att_blocks: [5, 7, 9] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml index b3ba469471..0f6c1c56cc 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml @@ -2,18 +2,18 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: 
torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 112 num_heads: 2 neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -24,17 +24,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -45,7 +45,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -57,23 +57,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml index 59a8a1e36b..4baf4e38eb 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml @@ -2,12 +2,12 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 144 num_heads: 2 stages: 
[2, 6, 36, 4] @@ -15,9 +15,9 @@ model: window_pos_embed_bkg_spatial_size: [7, 7] window_spec: [8, 4, 16, 8] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -28,17 +28,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -49,7 +49,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -61,23 +61,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml index b051d3be63..84b4b52a8e 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 11, 2] global_att_blocks: [7, 10, 13] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: 
benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml index 6b108e708f..b572a7e4ee 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 7, 2] global_att_blocks: [5, 7, 9] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: 
benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/modeling/backbones/hieradet.py b/benchmarks/_models/sam2/modeling/backbones/hieradet.py index 91e98f795e..b56c983c8f 100644 --- a/benchmarks/_models/sam2/modeling/backbones/hieradet.py +++ b/benchmarks/_models/sam2/modeling/backbones/hieradet.py @@ -13,12 +13,12 @@ import torch.nn.functional as F from iopath.common.file_io import g_pathmgr -from torchao._models.sam2.modeling.backbones.utils import ( +from benchmarks._models.sam2.modeling.backbones.utils import ( PatchEmbed, window_partition, window_unpartition, ) -from torchao._models.sam2.modeling.sam2_utils import MLP, DropPath +from benchmarks._models.sam2.modeling.sam2_utils import MLP, DropPath def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor: diff --git a/benchmarks/_models/sam2/modeling/backbones/image_encoder.py b/benchmarks/_models/sam2/modeling/backbones/image_encoder.py index 0f0a256867..efa1d963e4 100644 --- a/benchmarks/_models/sam2/modeling/backbones/image_encoder.py +++ b/benchmarks/_models/sam2/modeling/backbones/image_encoder.py @@ -29,7 +29,7 @@ def __init__( def forward(self, sample: torch.Tensor): # Forward through backbone with torch.autograd.profiler.record_function("self.neck(self.trunk(sample))"): - from torchao._models.sam2.map_tensor import MapTensor, to_map_tensor + from benchmarks._models.sam2.map_tensor import MapTensor, 
to_map_tensor if isinstance(sample, MapTensor): features, pos = self.neck(self.trunk(sample.elems.flatten(0, 1))) diff --git a/benchmarks/_models/sam2/modeling/memory_attention.py b/benchmarks/_models/sam2/modeling/memory_attention.py index 5ac6288af0..c32707cf31 100644 --- a/benchmarks/_models/sam2/modeling/memory_attention.py +++ b/benchmarks/_models/sam2/modeling/memory_attention.py @@ -9,8 +9,8 @@ import torch from torch import Tensor, nn -from torchao._models.sam2.modeling.sam.transformer import RoPEAttention -from torchao._models.sam2.modeling.sam2_utils import get_activation_fn, get_clones +from benchmarks._models.sam2.modeling.sam.transformer import RoPEAttention +from benchmarks._models.sam2.modeling.sam2_utils import get_activation_fn, get_clones class MemoryAttentionLayer(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/memory_encoder.py b/benchmarks/_models/sam2/modeling/memory_encoder.py index 3796cefd00..84116aa225 100644 --- a/benchmarks/_models/sam2/modeling/memory_encoder.py +++ b/benchmarks/_models/sam2/modeling/memory_encoder.py @@ -11,7 +11,11 @@ import torch.nn as nn import torch.nn.functional as F -from torchao._models.sam2.modeling.sam2_utils import DropPath, LayerNorm2d, get_clones +from benchmarks._models.sam2.modeling.sam2_utils import ( + DropPath, + LayerNorm2d, + get_clones, +) class MaskDownSampler(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/sam/mask_decoder.py b/benchmarks/_models/sam2/modeling/sam/mask_decoder.py index 7d25697018..1c29113197 100644 --- a/benchmarks/_models/sam2/modeling/sam/mask_decoder.py +++ b/benchmarks/_models/sam2/modeling/sam/mask_decoder.py @@ -9,7 +9,7 @@ import torch from torch import nn -from torchao._models.sam2.modeling.sam2_utils import MLP, LayerNorm2d +from benchmarks._models.sam2.modeling.sam2_utils import MLP, LayerNorm2d class MaskDecoder(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py b/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py index 94b7fda8b2..2c3abbfa34 100644 --- a/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py +++ b/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py @@ -9,8 +9,8 @@ import torch from torch import nn -from torchao._models.sam2.modeling.position_encoding import PositionEmbeddingRandom -from torchao._models.sam2.modeling.sam2_utils import LayerNorm2d +from benchmarks._models.sam2.modeling.position_encoding import PositionEmbeddingRandom +from benchmarks._models.sam2.modeling.sam2_utils import LayerNorm2d class PromptEncoder(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/sam/transformer.py b/benchmarks/_models/sam2/modeling/sam/transformer.py index bf0b58d6fd..3c6d3b83cd 100644 --- a/benchmarks/_models/sam2/modeling/sam/transformer.py +++ b/benchmarks/_models/sam2/modeling/sam/transformer.py @@ -14,12 +14,12 @@ import torch.nn.functional as F from torch import Tensor, nn -from torchao._models.sam2.modeling.position_encoding import ( +from benchmarks._models.sam2.modeling.position_encoding import ( apply_rotary_enc, compute_axial_cis, ) -from torchao._models.sam2.modeling.sam2_utils import MLP -from torchao._models.sam2.utils.misc import get_sdpa_settings +from benchmarks._models.sam2.modeling.sam2_utils import MLP +from benchmarks._models.sam2.utils.misc import get_sdpa_settings warnings.simplefilter(action="ignore", category=FutureWarning) # Check whether Flash Attention is available (and use it by default) diff --git a/benchmarks/_models/sam2/modeling/sam2_base.py b/benchmarks/_models/sam2/modeling/sam2_base.py 
index 4c2a24a0ef..c5d1f54829 100644 --- a/benchmarks/_models/sam2/modeling/sam2_base.py +++ b/benchmarks/_models/sam2/modeling/sam2_base.py @@ -9,10 +9,10 @@ import torch.nn.functional as F from torch.nn.init import trunc_normal_ -from torchao._models.sam2.modeling.sam.mask_decoder import MaskDecoder -from torchao._models.sam2.modeling.sam.prompt_encoder import PromptEncoder -from torchao._models.sam2.modeling.sam.transformer import TwoWayTransformer -from torchao._models.sam2.modeling.sam2_utils import ( +from benchmarks._models.sam2.modeling.sam.mask_decoder import MaskDecoder +from benchmarks._models.sam2.modeling.sam.prompt_encoder import PromptEncoder +from benchmarks._models.sam2.modeling.sam.transformer import TwoWayTransformer +from benchmarks._models.sam2.modeling.sam2_utils import ( MLP, get_1d_sine_pe, select_closest_cond_frames, diff --git a/benchmarks/_models/sam2/modeling/sam2_utils.py b/benchmarks/_models/sam2/modeling/sam2_utils.py index 579bfc671a..1c00f534e3 100644 --- a/benchmarks/_models/sam2/modeling/sam2_utils.py +++ b/benchmarks/_models/sam2/modeling/sam2_utils.py @@ -13,7 +13,7 @@ import torch.nn as nn import torch.nn.functional as F -from torchao._models.sam2.utils.misc import mask_to_box +from benchmarks._models.sam2.utils.misc import mask_to_box def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num): diff --git a/benchmarks/_models/sam2/sam2_image_predictor.py b/benchmarks/_models/sam2/sam2_image_predictor.py index a4aa1c668c..a2c53bdf0a 100644 --- a/benchmarks/_models/sam2/sam2_image_predictor.py +++ b/benchmarks/_models/sam2/sam2_image_predictor.py @@ -11,9 +11,9 @@ import torch from PIL.Image import Image -from torchao._models.sam2.modeling.sam2_base import SAM2Base -from torchao._models.sam2.utils.misc import get_image_size -from torchao._models.sam2.utils.transforms import SAM2Transforms +from benchmarks._models.sam2.modeling.sam2_base import SAM2Base +from benchmarks._models.sam2.utils.misc import get_image_size +from benchmarks._models.sam2.utils.transforms import SAM2Transforms class SAM2ImagePredictor(torch.nn.Module): diff --git a/benchmarks/_models/sam2/sam2_video_predictor.py b/benchmarks/_models/sam2/sam2_video_predictor.py index 53b0a11d7c..6715178958 100644 --- a/benchmarks/_models/sam2/sam2_video_predictor.py +++ b/benchmarks/_models/sam2/sam2_video_predictor.py @@ -10,8 +10,8 @@ import torch from tqdm import tqdm -from torchao._models.sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base -from torchao._models.sam2.utils.misc import ( +from benchmarks._models.sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base +from benchmarks._models.sam2.utils.misc import ( concat_points, fill_holes_in_mask_scores, load_video_frames, @@ -52,7 +52,7 @@ def batch_inference_states(inference_states: list): batched_inference_state = copy.copy(inference_states[0]) - from torchao._models.sam2.map_tensor import to_map_tensor + from benchmarks._models.sam2.map_tensor import to_map_tensor # NOTE: Making a build assumption only images differ all_images = torch.stack([state["images"] for state in inference_states]) diff --git a/benchmarks/_models/sam2/utils/transforms.py b/benchmarks/_models/sam2/utils/transforms.py index c616233050..2d5e46193b 100644 --- a/benchmarks/_models/sam2/utils/transforms.py +++ b/benchmarks/_models/sam2/utils/transforms.py @@ -78,7 +78,7 @@ def postprocess_masks( """ Perform PostProcessing on output masks. 
""" - from torchao._models.sam2.utils.misc import get_connected_components + from benchmarks._models.sam2.utils.misc import get_connected_components masks = masks.float() input_masks = masks @@ -125,7 +125,7 @@ def postprocess_masks_1_channel( """ Perform PostProcessing on output masks. """ - from torchao._models.sam2.utils.misc import get_connected_components + from benchmarks._models.sam2.utils.misc import get_connected_components assert masks.dim() == 4 assert masks.size(1) == 1 diff --git a/benchmarks/quantized_training/pretrain_llama2.py b/benchmarks/quantized_training/pretrain_llama2.py index 25b37921b6..2eb66f5e6b 100644 --- a/benchmarks/quantized_training/pretrain_llama2.py +++ b/benchmarks/quantized_training/pretrain_llama2.py @@ -22,13 +22,13 @@ from torch.utils.checkpoint import checkpoint from tqdm import tqdm -from torchao import quantize_ -from torchao._models.llama.model import ( +from benchmarks._models.llama.model import ( ModelArgs, RMSNorm, Transformer, transformer_configs, ) +from torchao import quantize_ from torchao.prototype import low_bit_optim from torchao.prototype.quantized_training import ( bitnet_training, diff --git a/examples/sam2_amg_server/annotate_with_rle.py b/examples/sam2_amg_server/annotate_with_rle.py index 55e5512011..3c3bbc77b0 100644 --- a/examples/sam2_amg_server/annotate_with_rle.py +++ b/examples/sam2_amg_server/annotate_with_rle.py @@ -14,7 +14,7 @@ ) from tqdm import tqdm -from torchao._models.sam2.utils.amg import area_from_rle, rle_to_mask +from benchmarks._models.sam2.utils.amg import area_from_rle, rle_to_mask def timestamped_print(*args, **kwargs): diff --git a/examples/sam2_amg_server/cli.py b/examples/sam2_amg_server/cli.py index 2f6758b7d3..b5feac395e 100644 --- a/examples/sam2_amg_server/cli.py +++ b/examples/sam2_amg_server/cli.py @@ -12,9 +12,9 @@ show_anns, ) -from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator -from torchao._models.sam2.build_sam import build_sam2 -from torchao._models.sam2.utils.amg import rle_to_mask +from benchmarks._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator +from benchmarks._models.sam2.build_sam import build_sam2 +from benchmarks._models.sam2.utils.amg import rle_to_mask def main_docstring(): diff --git a/examples/sam2_amg_server/cli_on_modal.py b/examples/sam2_amg_server/cli_on_modal.py index 5fe56eeb1a..d44de90bf7 100644 --- a/examples/sam2_amg_server/cli_on_modal.py +++ b/examples/sam2_amg_server/cli_on_modal.py @@ -84,10 +84,10 @@ def build(self): from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator from sam2.build_sam import build_sam2 else: - from torchao._models.sam2.automatic_mask_generator import ( + from benchmarks._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from torchao._models.sam2.build_sam import build_sam2 + from benchmarks._models.sam2.build_sam import build_sam2 os.chdir(f"{TARGET}ao_src_0/examples/sam2_amg_server") import sys @@ -139,11 +139,11 @@ def build(self): from sam2.utils.amg import mask_to_rle_pytorch as mask_to_rle_pytorch_2 from sam2.utils.amg import rle_to_mask else: - from torchao._models.sam2.utils.amg import ( + from benchmarks._models.sam2.utils.amg import ( mask_to_rle_pytorch_2, rle_to_mask, ) - from torchao._models.sam2.utils.amg import area_from_rle + from benchmarks._models.sam2.utils.amg import area_from_rle self.np = np self.tio = tio diff --git a/examples/sam2_amg_server/compare_rle_lists.py b/examples/sam2_amg_server/compare_rle_lists.py index 
7a1c78b846..88be3df491 100644 --- a/examples/sam2_amg_server/compare_rle_lists.py +++ b/examples/sam2_amg_server/compare_rle_lists.py @@ -7,7 +7,7 @@ import torch -# from torchao._models.sam2.utils.amg import rle_to_mask +# from benchmarks._models.sam2.utils.amg import rle_to_mask def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray: """Compute a binary mask from an uncompressed RLE.""" h, w = rle["size"] diff --git a/examples/sam2_amg_server/compile_export_utils.py b/examples/sam2_amg_server/compile_export_utils.py index d1c6fc06fa..ab6e76a4fc 100644 --- a/examples/sam2_amg_server/compile_export_utils.py +++ b/examples/sam2_amg_server/compile_export_utils.py @@ -4,7 +4,7 @@ import torch -from torchao._models.sam2.sam2_image_predictor import SAM2ImagePredictor +from benchmarks._models.sam2.sam2_image_predictor import SAM2ImagePredictor # Tools used to avoid compilation cold start and dynamo cache lookups # We take the compiled model and export it using the largest @@ -519,12 +519,12 @@ def set_fast( # A bunch of extra compiles at module level # Note that this can cause recompilations! # We might want to guard on that - torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0 = torch.compile( + benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0 = torch.compile( fullgraph=True, dynamic=True - )(torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0) - torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1 = torch.compile( + )(benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0) + benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1 = torch.compile( fullgraph=True, dynamic=True - )(torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1) + )(benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1) mask_generator.calculate_stability_score = torch.compile( fullgraph=True, dynamic=True )(mask_generator.calculate_stability_score) diff --git a/examples/sam2_amg_server/generate_data.py b/examples/sam2_amg_server/generate_data.py index 50eeccb912..dc82348d0b 100644 --- a/examples/sam2_amg_server/generate_data.py +++ b/examples/sam2_amg_server/generate_data.py @@ -192,7 +192,7 @@ def gen_masks_ao_batch( center_points_label_torch_batch = [ torch.from_numpy(t).unsqueeze(1) for t in center_points_label_batch ] - from torchao._models.sam2.map_tensor import to_map_tensor + from benchmarks._models.sam2.map_tensor import to_map_tensor center_points_torch_batch = list(map(to_map_tensor, center_points_torch_batch)) center_points_label_torch_batch = list( @@ -255,7 +255,7 @@ def gen_masks_ao( center_points_torch = torch.from_numpy(center_points).unsqueeze(1) center_points_label_torch = torch.from_numpy(center_points_label).unsqueeze(1) - from torchao._models.sam2.map_tensor import to_map_tensor + from benchmarks._models.sam2.map_tensor import to_map_tensor center_points_torch = to_map_tensor(center_points_torch) center_points_label_torch = to_map_tensor(center_points_label_torch) @@ -532,11 +532,11 @@ def main( from sam2.build_sam import build_sam2 from sam2.utils.amg import mask_to_rle_pytorch else: - from torchao._models.sam2.automatic_mask_generator import ( + from benchmarks._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from torchao._models.sam2.build_sam import build_sam2 - from torchao._models.sam2.utils.amg import ( + from benchmarks._models.sam2.build_sam import build_sam2 + from benchmarks._models.sam2.utils.amg import ( mask_to_rle_pytorch_2 as mask_to_rle_pytorch, ) torch.manual_seed(seed) diff --git 
a/examples/sam2_amg_server/server.py b/examples/sam2_amg_server/server.py index 7e35858590..ea9953dbed 100644 --- a/examples/sam2_amg_server/server.py +++ b/examples/sam2_amg_server/server.py @@ -26,7 +26,7 @@ from fastapi.responses import StreamingResponse from torch._inductor import config as inductorconfig -from torchao._models.utils import ( +from benchmarks._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, @@ -460,11 +460,11 @@ def main( from sam2.build_sam import build_sam2 from sam2.utils.amg import rle_to_mask else: - from torchao._models.sam2.automatic_mask_generator import ( + from benchmarks._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from torchao._models.sam2.build_sam import build_sam2 - from torchao._models.sam2.utils.amg import rle_to_mask + from benchmarks._models.sam2.build_sam import build_sam2 + from benchmarks._models.sam2.utils.amg import rle_to_mask device = "cuda" sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type) diff --git a/examples/sam2_vos_example/compile_export_utils.py b/examples/sam2_vos_example/compile_export_utils.py index 7d1b3eddf3..00f1b56794 100644 --- a/examples/sam2_vos_example/compile_export_utils.py +++ b/examples/sam2_vos_example/compile_export_utils.py @@ -4,7 +4,7 @@ import torch -from torchao._models.sam2.sam2_video_predictor import SAM2VideoPredictor +from benchmarks._models.sam2.sam2_video_predictor import SAM2VideoPredictor # Tools used to avoid compilation cold start and dynamo cache lookups # We take the compiled model and export it using the largest diff --git a/examples/sam2_vos_example/video_profile.py b/examples/sam2_vos_example/video_profile.py index 8ee9151cc4..44b90bd77b 100644 --- a/examples/sam2_vos_example/video_profile.py +++ b/examples/sam2_vos_example/video_profile.py @@ -280,7 +280,7 @@ def main( if use_baseline: from sam2.build_sam import build_sam2_video_predictor else: - from torchao._models.sam2.build_sam import build_sam2_video_predictor + from benchmarks._models.sam2.build_sam import build_sam2_video_predictor device = "cuda:0" # hydra_overrides_extra = ["++model.compile_image_encoder=true"] @@ -292,7 +292,7 @@ def main( ) predictor._frame_batch_size = frame_batch_size predictor.image_encoder.trunk = predictor.image_encoder.trunk.to(torch.bfloat16) - from torchao._models.sam2.modeling.sam.transformer import RoPEAttention + from benchmarks._models.sam2.modeling.sam.transformer import RoPEAttention rope_attention_modules = [ module for module in predictor.modules() if isinstance(module, RoPEAttention) diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index e05f23da2a..1b0939c951 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -14,7 +14,7 @@ import torch from safetensors.torch import load_file as load_safetensors_file -from torchao._models.llama.model import ModelArgs +from benchmarks._models.llama.model import ModelArgs @torch.inference_mode() diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py index 42606b014e..a50b9d9cb7 100644 --- a/test/prototype/test_spinquant.py +++ b/test/prototype/test_spinquant.py @@ -1,7 +1,7 @@ import pytest import torch -from torchao._models.llama.model import Transformer +from benchmarks._models.llama.model import Transformer from torchao.prototype.spinquant import apply_spinquant diff --git a/test/quantization/test_gptq_mt.py b/test/quantization/test_gptq_mt.py index 5d4e73ed61..f82315714b 100644 
--- a/test/quantization/test_gptq_mt.py +++ b/test/quantization/test_gptq_mt.py @@ -5,8 +5,8 @@ import torch.nn.functional as F from torch.testing._internal.common_utils import run_tests -from torchao._models.llama.model import Transformer, prepare_inputs_for_model -from torchao._models.llama.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer, MultiTensor from torchao.quantization.utils import _lm_eval_available from torchao.utils import is_fbcode diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 9079fbc907..6cfb41028c 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -22,8 +22,8 @@ from torch.testing._internal.common_utils import TestCase from torchao import quantize_ -from torchao._models.llama.model import Transformer, prepare_inputs_for_model -from torchao._models.llama.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer from torchao.dtypes import AffineQuantizedTensor from torchao.quantization import LinearActivationQuantizedTensor from torchao.quantization.quant_api import ( diff --git a/test/test_ao_models.py b/test/test_ao_models.py index 49385b0a99..064e2a9a54 100644 --- a/test/test_ao_models.py +++ b/test/test_ao_models.py @@ -1,7 +1,7 @@ import pytest import torch -from torchao._models.llama.model import Transformer +from benchmarks._models.llama.model import Transformer _AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) diff --git a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py index 12fc77bd9a..19bf96d153 100644 --- a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py +++ b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py @@ -18,15 +18,15 @@ ) import torchao -from torchao._models.llama.generate import ( +from benchmarks._models.llama.generate import ( _load_model, decode_one_token, device_sync, encode_tokens, prefill, ) -from torchao._models.llama.model import Transformer, prepare_inputs_for_model -from torchao._models.llama.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer default_device = "cuda" if torch.cuda.is_available() else "cpu" @@ -99,7 +99,7 @@ def generate( _replace_with_custom_fn_if_matches_filter( model, AffineQuantizedKVCache.from_float, - lambda x, y: isinstance(x, torchao._models.llama.model.KVCache), + lambda x, y: isinstance(x, benchmarks._models.llama.model.KVCache), ) # format model input diff --git a/torchao/prototype/spinquant/spinquant.py b/torchao/prototype/spinquant/spinquant.py index 60ad1a8b41..bfa83a332a 100644 --- a/torchao/prototype/spinquant/spinquant.py +++ b/torchao/prototype/spinquant/spinquant.py @@ -10,7 +10,7 @@ import torch from torch import nn -from torchao._models.llama.model import RMSNorm, Transformer +from benchmarks._models.llama.model import RMSNorm, Transformer from torchao.prototype.spinquant.hadamard_utils import ( apply_exact_had_to_linear, get_hadK, diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py 
index b278e22b3b..763530a55c 100644
--- a/torchao/quantization/GPTQ.py
+++ b/torchao/quantization/GPTQ.py
@@ -81,7 +81,7 @@ def __init__( # needed for GPTQ on the torchao llama model import torchao - torchao._models.llama.model.use_index_put_for_kv_cache = True + benchmarks._models.llama.model.use_index_put_for_kv_cache = True exported_model = torch._dynamo.export( model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake" )(*one_input)

From daae64d8de8796ee49a6e0465f6f8210e65e5327 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Wed, 26 Feb 2025 09:24:57 -0800
Subject: [PATCH 3/7] Make benchmarks a module

---
 benchmarks/__init__.py | 0
 {torchao => benchmarks/_models}/_eval.py | 0
 benchmarks/_models/llama/eval.py | 4 ++--
 benchmarks/_models/llama/generate.py | 6 +++---
 test/quantization/test_quant_api.py | 12 ++++++------
 torchao/quantization/README.md | 2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/__init__.py
 rename {torchao => benchmarks/_models}/_eval.py (100%)

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/torchao/_eval.py b/benchmarks/_models/_eval.py
similarity index 100%
rename from torchao/_eval.py
rename to benchmarks/_models/_eval.py
diff --git a/benchmarks/_models/llama/eval.py b/benchmarks/_models/llama/eval.py
index 5c26329114..4e4d5a979f 100644
--- a/benchmarks/_models/llama/eval.py
+++ b/benchmarks/_models/llama/eval.py
@@ -120,7 +120,7 @@ def run_evaluation( quantize_(model, int4_weight_only(layout=MarlinSparseLayout())) if "int4wo" in quantization and "gptq" in quantization: # avoid circular imports - from torchao._eval import MultiTensorInputRecorder + from benchmarks._models._eval import MultiTensorInputRecorder from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer groupsize = int(quantization.split("-")[-2])
@@ -242,7 +242,7 @@ def run_evaluation( with torch.no_grad(): print("Running evaluation ...") # avoid circular imports - from torchao._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper TransformerEvalWrapper( model=model.to(device),
diff --git a/benchmarks/_models/llama/generate.py b/benchmarks/_models/llama/generate.py
index 985bc1235b..bf0aedec78 100644
--- a/benchmarks/_models/llama/generate.py
+++ b/benchmarks/_models/llama/generate.py
@@ -476,7 +476,7 @@ def ffn_or_attn_only(mod, fqn): filter_fn=lambda x, *args: isinstance(x, torch.nn.Embedding), ) elif quantization.startswith("awq"): - from torchao._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper from torchao.utils import TORCH_VERSION_AT_LEAST_2_3 if not TORCH_VERSION_AT_LEAST_2_3:
@@ -576,7 +576,7 @@ def ffn_or_attn_only(mod, fqn): ) elif "autoquant_v2" in quantization: from benchmarks._models.llama.model import prepare_inputs_for_model - from torchao._eval import InputRecorder + from benchmarks._models._eval import InputRecorder from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 calibration_seq_length = 256
@@ -666,7 +666,7 @@ def ffn_or_attn_only(mod, fqn): model.finalize_autoquant() elif "autoquant" in quantization: from benchmarks._models.llama.model import prepare_inputs_for_model - from torchao._eval import InputRecorder + from benchmarks._models._eval import InputRecorder calibration_seq_length = 256 inputs = (
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index 6cfb41028c..6fba865a39 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -278,7 +278,7 @@ def test_8da4w_quantizer(self): # https://github.com/pytorch-labs/gpt-fast/blob/6253c6bb054e658d67566150f87329b87815ae63/scripts/convert_hf_checkpoint.py @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_8da4w_gptq_quantizer(self): - from torchao._eval import InputRecorder, TransformerEvalWrapper + from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer # should be similar to TorchCompileDynamicQuantizer
@@ -348,7 +348,7 @@ def test_8da4w_gptq_quantizer(self): not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch verion is 2.4 or lower" ) def test_8da4w_quantizer_eval(self): - from torchao._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer precision = torch.bfloat16
@@ -384,7 +384,7 @@ def test_8da4w_quantizer_eval(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_gptq_quantizer_int4_weight_only(self): - from torchao._eval import ( + from benchmarks._models._eval import ( MultiTensorInputRecorder, TransformerEvalWrapper, )
@@ -454,7 +454,7 @@ def test_gptq_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_quantizer_int4_weight_only(self): - from torchao._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer precision = torch.bfloat16
@@ -492,7 +492,7 @@ def test_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper(self): - from torchao._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda"
@@ -525,7 +525,7 @@ def test_eval_wrapper(self): # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper_llama3(self): - from torchao._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda"
diff --git a/torchao/quantization/README.md b/torchao/quantization/README.md
index ed98cb1c03..36f0befa80 100644
--- a/torchao/quantization/README.md
+++ b/torchao/quantization/README.md
@@ -396,7 +396,7 @@ The `quantize_` and `autoquant` apis now automatically use our recommended induc ## (To be moved to prototype) A16W4 WeightOnly Quantization with GPTQ ```python -from torchao._eval import InputRecorder, TransformerEvalWrapper +from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 device = "cuda"

From 30567b41db7a13c3c41b34db660e3db98eb6829e Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 27 Feb 2025 10:10:58 -0800
Subject: [PATCH 4/7] Move files to _models

---
 benchmarks/_models/llama/eval.py | 8 ++++----
 benchmarks/_models/llama/generate.py | 14 +++++++-------
 benchmarks/_models/llama/perf_profile.py | 4 ++--
 benchmarks/quantized_training/pretrain_llama2.py | 2 +-
 scripts/convert_hf_checkpoint.py | 2 +-
 test/prototype/test_spinquant.py | 2 +-
 test/quantization/test_gptq_mt.py | 4 ++--
 test/quantization/test_quant_api.py | 16 ++++++++--------
test/test_ao_models.py | 2 +- torchao/_models/__init__.py | 0 {benchmarks => torchao}/_models/_eval.py | 0 .../_models/llama => torchao/_models}/model.py | 0 .../llama => torchao/_models}/tokenizer.py | 0 .../mixed_precision/scripts/BO_acc_throughput.py | 8 ++++---- torchao/prototype/spinquant/spinquant.py | 2 +- torchao/quantization/GPTQ.py | 2 +- torchao/quantization/README.md | 2 +- 17 files changed, 34 insertions(+), 34 deletions(-) create mode 100644 torchao/_models/__init__.py rename {benchmarks => torchao}/_models/_eval.py (100%) rename {benchmarks/_models/llama => torchao/_models}/model.py (100%) rename {benchmarks/_models/llama => torchao/_models}/tokenizer.py (100%) diff --git a/benchmarks/_models/llama/eval.py b/benchmarks/_models/llama/eval.py index 4e4d5a979f..3454c69014 100644 --- a/benchmarks/_models/llama/eval.py +++ b/benchmarks/_models/llama/eval.py @@ -15,7 +15,7 @@ from tokenizer import get_tokenizer import torchao -from benchmarks._models.llama.model import prepare_inputs_for_model +from torchao._models.model import prepare_inputs_for_model from torchao.quantization import ( PerRow, PerTensor, @@ -120,7 +120,7 @@ def run_evaluation( quantize_(model, int4_weight_only(layout=MarlinSparseLayout())) if "int4wo" in quantization and "gptq" in quantization: # avoid circular imports - from benchmarks._models._eval import MultiTensorInputRecorder + from torchao._models._eval import MultiTensorInputRecorder from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer groupsize = int(quantization.split("-")[-2]) @@ -172,7 +172,7 @@ def run_evaluation( if "autoround" in quantization: from transformers import AutoTokenizer - from benchmarks._models.llama.model import TransformerBlock + from torchao._models.model import TransformerBlock from torchao.prototype.autoround.autoround_llm import ( quantize_model_with_autoround_, ) @@ -242,7 +242,7 @@ def run_evaluation( with torch.no_grad(): print("Running evaluation ...") # avoid circular imports - from benchmarks._models._eval import TransformerEvalWrapper + from torchao._models._eval import TransformerEvalWrapper TransformerEvalWrapper( model=model.to(device), diff --git a/benchmarks/_models/llama/generate.py b/benchmarks/_models/llama/generate.py index bf0aedec78..e0d3a08be1 100644 --- a/benchmarks/_models/llama/generate.py +++ b/benchmarks/_models/llama/generate.py @@ -72,8 +72,8 @@ def device_sync(device): wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model -from benchmarks._models.llama.tokenizer import get_tokenizer +from torchao._models.model import Transformer, prepare_inputs_for_model +from torchao._models.tokenizer import get_tokenizer def multinomial_sample_one_no_sync( @@ -476,7 +476,7 @@ def ffn_or_attn_only(mod, fqn): filter_fn=lambda x, *args: isinstance(x, torch.nn.Embedding), ) elif quantization.startswith("awq"): - from benchmarks._models._eval import TransformerEvalWrapper + from torchao._models._eval import TransformerEvalWrapper from torchao.utils import TORCH_VERSION_AT_LEAST_2_3 if not TORCH_VERSION_AT_LEAST_2_3: @@ -575,8 +575,8 @@ def ffn_or_attn_only(mod, fqn): model, float8_dynamic_activation_float8_weight(granularity=granularity) ) elif "autoquant_v2" in quantization: - from benchmarks._models.llama.model import prepare_inputs_for_model - from benchmarks._models._eval import InputRecorder + from torchao._models.model import prepare_inputs_for_model + from torchao._models._eval import InputRecorder from 
torchao.prototype.quantization.autoquant_v2 import autoquant_v2 calibration_seq_length = 256 @@ -665,8 +665,8 @@ def ffn_or_attn_only(mod, fqn): # do autoquantization model.finalize_autoquant() elif "autoquant" in quantization: - from benchmarks._models.llama.model import prepare_inputs_for_model - from benchmarks._models._eval import InputRecorder + from torchao._models.model import prepare_inputs_for_model + from torchao._models._eval import InputRecorder calibration_seq_length = 256 inputs = ( diff --git a/benchmarks/_models/llama/perf_profile.py b/benchmarks/_models/llama/perf_profile.py index d1e9cab83c..6cd924a493 100644 --- a/benchmarks/_models/llama/perf_profile.py +++ b/benchmarks/_models/llama/perf_profile.py @@ -116,8 +116,8 @@ import torch from torch.nn.attention import SDPBackend -from benchmarks._models.llama.model import Transformer -from benchmarks._models.llama.tokenizer import get_tokenizer +from torchao._models.model import Transformer +from torchao._models.tokenizer import get_tokenizer from torchao.prototype.profiler import ( CUDADeviceSpec, TransformerPerformanceCounter, diff --git a/benchmarks/quantized_training/pretrain_llama2.py b/benchmarks/quantized_training/pretrain_llama2.py index 2eb66f5e6b..371831f4ca 100644 --- a/benchmarks/quantized_training/pretrain_llama2.py +++ b/benchmarks/quantized_training/pretrain_llama2.py @@ -22,7 +22,7 @@ from torch.utils.checkpoint import checkpoint from tqdm import tqdm -from benchmarks._models.llama.model import ( +from torchao._models.model import ( ModelArgs, RMSNorm, Transformer, diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index 1b0939c951..128e6120ef 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -14,7 +14,7 @@ import torch from safetensors.torch import load_file as load_safetensors_file -from benchmarks._models.llama.model import ModelArgs +from torchao._models.model import ModelArgs @torch.inference_mode() diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py index a50b9d9cb7..36a47a9f89 100644 --- a/test/prototype/test_spinquant.py +++ b/test/prototype/test_spinquant.py @@ -1,7 +1,7 @@ import pytest import torch -from benchmarks._models.llama.model import Transformer +from torchao._models.model import Transformer from torchao.prototype.spinquant import apply_spinquant diff --git a/test/quantization/test_gptq_mt.py b/test/quantization/test_gptq_mt.py index f82315714b..40e234e8c8 100644 --- a/test/quantization/test_gptq_mt.py +++ b/test/quantization/test_gptq_mt.py @@ -5,8 +5,8 @@ import torch.nn.functional as F from torch.testing._internal.common_utils import run_tests -from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model -from benchmarks._models.llama.tokenizer import get_tokenizer +from torchao._models.model import Transformer, prepare_inputs_for_model +from torchao._models.tokenizer import get_tokenizer from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer, MultiTensor from torchao.quantization.utils import _lm_eval_available from torchao.utils import is_fbcode diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 6fba865a39..ecb550729a 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -22,8 +22,8 @@ from torch.testing._internal.common_utils import TestCase from torchao import quantize_ -from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model -from 
benchmarks._models.llama.tokenizer import get_tokenizer +from torchao._models.model import Transformer, prepare_inputs_for_model +from torchao._models.tokenizer import get_tokenizer from torchao.dtypes import AffineQuantizedTensor from torchao.quantization import LinearActivationQuantizedTensor from torchao.quantization.quant_api import ( @@ -278,7 +278,7 @@ def test_8da4w_quantizer(self): # https://github.com/pytorch-labs/gpt-fast/blob/6253c6bb054e658d67566150f87329b87815ae63/scripts/convert_hf_checkpoint.py @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_8da4w_gptq_quantizer(self): - from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper + from torchao._models._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer # should be similar to TorchCompileDynamicQuantizer @@ -348,7 +348,7 @@ def test_8da4w_gptq_quantizer(self): not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch verion is 2.4 or lower" ) def test_8da4w_quantizer_eval(self): - from benchmarks._models._eval import TransformerEvalWrapper + from torchao._models._eval import TransformerEvalWrapper from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer precision = torch.bfloat16 @@ -384,7 +384,7 @@ def test_8da4w_quantizer_eval(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_gptq_quantizer_int4_weight_only(self): - from benchmarks._models._eval import ( + from torchao._models._eval import ( MultiTensorInputRecorder, TransformerEvalWrapper, ) @@ -454,7 +454,7 @@ def test_gptq_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_quantizer_int4_weight_only(self): - from benchmarks._models._eval import TransformerEvalWrapper + from torchao._models._eval import TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer precision = torch.bfloat16 @@ -492,7 +492,7 @@ def test_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper(self): - from benchmarks._models._eval import TransformerEvalWrapper + from torchao._models._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda" @@ -525,7 +525,7 @@ def test_eval_wrapper(self): # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper_llama3(self): - from benchmarks._models._eval import TransformerEvalWrapper + from torchao._models._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda" diff --git a/test/test_ao_models.py b/test/test_ao_models.py index 064e2a9a54..05af8ef7d9 100644 --- a/test/test_ao_models.py +++ b/test/test_ao_models.py @@ -1,7 +1,7 @@ import pytest import torch -from benchmarks._models.llama.model import Transformer +from torchao._models.model import Transformer _AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) diff --git a/torchao/_models/__init__.py b/torchao/_models/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/_models/_eval.py b/torchao/_models/_eval.py similarity index 100% rename from benchmarks/_models/_eval.py rename to torchao/_models/_eval.py diff --git a/benchmarks/_models/llama/model.py b/torchao/_models/model.py similarity index 100% rename from benchmarks/_models/llama/model.py rename to torchao/_models/model.py diff --git 
a/benchmarks/_models/llama/tokenizer.py b/torchao/_models/tokenizer.py
similarity index 100%
rename from benchmarks/_models/llama/tokenizer.py
rename to torchao/_models/tokenizer.py
diff --git a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py
index 19bf96d153..46af69f670 100644
--- a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py
+++ b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py
@@ -18,15 +18,15 @@ ) import torchao -from benchmarks._models.llama.generate import ( +from torchao._models.generate import ( _load_model, decode_one_token, device_sync, encode_tokens, prefill, ) -from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model -from benchmarks._models.llama.tokenizer import get_tokenizer +from torchao._models.model import Transformer, prepare_inputs_for_model +from torchao._models.tokenizer import get_tokenizer default_device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -99,7 +99,7 @@ def generate( _replace_with_custom_fn_if_matches_filter( model, AffineQuantizedKVCache.from_float, - lambda x, y: isinstance(x, benchmarks._models.llama.model.KVCache), + lambda x, y: isinstance(x, torchao._models.model.KVCache), ) # format model input
diff --git a/torchao/prototype/spinquant/spinquant.py b/torchao/prototype/spinquant/spinquant.py
index bfa83a332a..83a1bc0b30 100644
--- a/torchao/prototype/spinquant/spinquant.py
+++ b/torchao/prototype/spinquant/spinquant.py
@@ -10,7 +10,7 @@ import torch from torch import nn -from benchmarks._models.llama.model import RMSNorm, Transformer +from torchao._models.model import RMSNorm, Transformer from torchao.prototype.spinquant.hadamard_utils import ( apply_exact_had_to_linear, get_hadK,
diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py
index 763530a55c..dc4145d18d 100644
--- a/torchao/quantization/GPTQ.py
+++ b/torchao/quantization/GPTQ.py
@@ -81,7 +81,7 @@ def __init__( # needed for GPTQ on the torchao llama model import torchao - benchmarks._models.llama.model.use_index_put_for_kv_cache = True + torchao._models.model.use_index_put_for_kv_cache = True exported_model = torch._dynamo.export( model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake" )(*one_input)
diff --git a/torchao/quantization/README.md b/torchao/quantization/README.md
index 36f0befa80..d2b6e0c016 100644
--- a/torchao/quantization/README.md
+++ b/torchao/quantization/README.md
@@ -396,7 +396,7 @@ The `quantize_` and `autoquant` apis now automatically use our recommended induc ## (To be moved to prototype) A16W4 WeightOnly Quantization with GPTQ ```python -from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper +from torchao._models._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 device = "cuda"

From 677aca7968a45355363652e51398265fdaf87d49 Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Thu, 27 Feb 2025 10:54:29 -0800
Subject: [PATCH 5/7] Updates

---
 benchmarks/_models/llama/eval.py | 4 +-
 benchmarks/_models/llama/generate.py | 128 +++---------------
 benchmarks/_models/llama/perf_profile.py | 4 ++--
 benchmarks/_models/sam/eval_combo.py | 2 +-
 .../quantized_training/pretrain_llama2.py | 4 +-
 examples/sam2_amg_server/annotate_with_rle.py | 2 +-
 examples/sam2_amg_server/cli.py | 6 +-
 examples/sam2_amg_server/cli_on_modal.py | 8 +-
examples/sam2_amg_server/compare_rle_lists.py | 2 +- .../sam2_amg_server/compile_export_utils.py | 10 +- examples/sam2_amg_server/generate_data.py | 10 +- examples/sam2_amg_server/server.py | 8 +- .../sam2_vos_example/compile_export_utils.py | 2 +- examples/sam2_vos_example/video_profile.py | 4 +- scripts/convert_hf_checkpoint.py | 2 +- test/prototype/test_spinquant.py | 2 +- test/quantization/test_gptq_mt.py | 4 +- test/quantization/test_quant_api.py | 4 +- test/test_ao_models.py | 2 +- {benchmarks => torchao}/_models/README.md | 0 torchao/_models/llm/__init__.py | 0 torchao/_models/{ => llm}/model.py | 0 torchao/_models/{ => llm}/tokenizer.py | 0 .../_models/sam2/__init__.py | 2 +- .../_models/sam2/automatic_mask_generator.py | 8 +- .../_models/sam2/build_sam.py | 2 +- .../sam2/configs/sam2.1/sam2.1_hiera_b+.yaml | 28 ++-- .../sam2/configs/sam2.1/sam2.1_hiera_l.yaml | 28 ++-- .../sam2/configs/sam2.1/sam2.1_hiera_s.yaml | 28 ++-- .../sam2/configs/sam2.1/sam2.1_hiera_t.yaml | 28 ++-- .../sam2.1_hiera_b+_MOSE_finetune.yaml | 0 .../sam2/configs/sam2/sam2_hiera_b+.yaml | 28 ++-- .../sam2/configs/sam2/sam2_hiera_l.yaml | 28 ++-- .../sam2/configs/sam2/sam2_hiera_s.yaml | 28 ++-- .../sam2/configs/sam2/sam2_hiera_t.yaml | 28 ++-- .../_models/sam2/csrc/connected_components.cu | 0 .../_models/sam2/map_tensor.py | 0 .../_models/sam2/modeling/__init__.py | 0 .../sam2/modeling/backbones/__init__.py | 0 .../sam2/modeling/backbones/hieradet.py | 4 +- .../sam2/modeling/backbones/image_encoder.py | 2 +- .../_models/sam2/modeling/backbones/utils.py | 0 .../_models/sam2/modeling/memory_attention.py | 4 +- .../_models/sam2/modeling/memory_encoder.py | 2 +- .../sam2/modeling/position_encoding.py | 0 .../_models/sam2/modeling/sam/__init__.py | 0 .../_models/sam2/modeling/sam/mask_decoder.py | 2 +- .../sam2/modeling/sam/prompt_encoder.py | 4 +- .../_models/sam2/modeling/sam/transformer.py | 6 +- .../_models/sam2/modeling/sam2_base.py | 8 +- .../_models/sam2/modeling/sam2_utils.py | 2 +- .../_models/sam2/sam2_hiera_b+.yaml | 0 .../_models/sam2/sam2_hiera_l.yaml | 0 .../_models/sam2/sam2_hiera_s.yaml | 0 .../_models/sam2/sam2_hiera_t.yaml | 0 .../_models/sam2/sam2_image_predictor.py | 6 +- .../_models/sam2/sam2_video_predictor.py | 6 +- .../_models/sam2/utils/__init__.py | 0 .../_models/sam2/utils/amg.py | 0 .../_models/sam2/utils/misc.py | 0 .../_models/sam2/utils/transforms.py | 4 +- {benchmarks => torchao}/_models/utils.py | 89 ++++++++++++ .../scripts/BO_acc_throughput.py | 14 +- torchao/prototype/spinquant/spinquant.py | 2 +- torchao/quantization/GPTQ.py | 2 +- torchao/utils.py | 20 +++ 66 files changed, 319 insertions(+), 302 deletions(-) rename {benchmarks => torchao}/_models/README.md (100%) create mode 100644 torchao/_models/llm/__init__.py rename torchao/_models/{ => llm}/model.py (100%) rename torchao/_models/{ => llm}/tokenizer.py (100%) rename {benchmarks => torchao}/_models/sam2/__init__.py (81%) rename {benchmarks => torchao}/_models/sam2/automatic_mask_generator.py (99%) rename {benchmarks => torchao}/_models/sam2/build_sam.py (98%) rename {benchmarks => torchao}/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml (71%) rename {benchmarks => torchao}/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml (72%) rename {benchmarks => torchao}/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml (72%) rename {benchmarks => torchao}/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml (72%) rename {benchmarks => torchao}/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml (100%) rename {benchmarks => 
torchao}/_models/sam2/configs/sam2/sam2_hiera_b+.yaml (70%) rename {benchmarks => torchao}/_models/sam2/configs/sam2/sam2_hiera_l.yaml (71%) rename {benchmarks => torchao}/_models/sam2/configs/sam2/sam2_hiera_s.yaml (71%) rename {benchmarks => torchao}/_models/sam2/configs/sam2/sam2_hiera_t.yaml (72%) rename {benchmarks => torchao}/_models/sam2/csrc/connected_components.cu (100%) rename {benchmarks => torchao}/_models/sam2/map_tensor.py (100%) rename {benchmarks => torchao}/_models/sam2/modeling/__init__.py (100%) rename {benchmarks => torchao}/_models/sam2/modeling/backbones/__init__.py (100%) rename {benchmarks => torchao}/_models/sam2/modeling/backbones/hieradet.py (98%) rename {benchmarks => torchao}/_models/sam2/modeling/backbones/image_encoder.py (98%) rename {benchmarks => torchao}/_models/sam2/modeling/backbones/utils.py (100%) rename {benchmarks => torchao}/_models/sam2/modeling/memory_attention.py (97%) rename {benchmarks => torchao}/_models/sam2/modeling/memory_encoder.py (98%) rename {benchmarks => torchao}/_models/sam2/modeling/position_encoding.py (100%) rename {benchmarks => torchao}/_models/sam2/modeling/sam/__init__.py (100%) rename {benchmarks => torchao}/_models/sam2/modeling/sam/mask_decoder.py (99%) rename {benchmarks => torchao}/_models/sam2/modeling/sam/prompt_encoder.py (98%) rename {benchmarks => torchao}/_models/sam2/modeling/sam/transformer.py (98%) rename {benchmarks => torchao}/_models/sam2/modeling/sam2_base.py (99%) rename {benchmarks => torchao}/_models/sam2/modeling/sam2_utils.py (99%) rename {benchmarks => torchao}/_models/sam2/sam2_hiera_b+.yaml (100%) rename {benchmarks => torchao}/_models/sam2/sam2_hiera_l.yaml (100%) rename {benchmarks => torchao}/_models/sam2/sam2_hiera_s.yaml (100%) rename {benchmarks => torchao}/_models/sam2/sam2_hiera_t.yaml (100%) rename {benchmarks => torchao}/_models/sam2/sam2_image_predictor.py (99%) rename {benchmarks => torchao}/_models/sam2/sam2_video_predictor.py (99%) rename {benchmarks => torchao}/_models/sam2/utils/__init__.py (100%) rename {benchmarks => torchao}/_models/sam2/utils/amg.py (100%) rename {benchmarks => torchao}/_models/sam2/utils/misc.py (100%) rename {benchmarks => torchao}/_models/sam2/utils/transforms.py (97%) rename {benchmarks => torchao}/_models/utils.py (54%) diff --git a/benchmarks/_models/llama/eval.py b/benchmarks/_models/llama/eval.py index 3454c69014..615b21ec47 100644 --- a/benchmarks/_models/llama/eval.py +++ b/benchmarks/_models/llama/eval.py @@ -15,7 +15,7 @@ from tokenizer import get_tokenizer import torchao -from torchao._models.model import prepare_inputs_for_model +from torchao._models.llm.model import prepare_inputs_for_model from torchao.quantization import ( PerRow, PerTensor, @@ -172,7 +172,7 @@ def run_evaluation( if "autoround" in quantization: from transformers import AutoTokenizer - from torchao._models.model import TransformerBlock + from torchao._models.llm.model import TransformerBlock from torchao.prototype.autoround.autoround_llm import ( quantize_model_with_autoround_, ) diff --git a/benchmarks/_models/llama/generate.py b/benchmarks/_models/llama/generate.py index e0d3a08be1..d327d34962 100644 --- a/benchmarks/_models/llama/generate.py +++ b/benchmarks/_models/llama/generate.py @@ -7,20 +7,30 @@ import time from datetime import datetime from pathlib import Path -from typing import Optional, Tuple +from typing import Optional import torch import torch._dynamo.config import torch._inductor.config import torchao -from benchmarks._models.utils import ( +from 
torchao._models.utils import ( + _load_model, + decode_n_tokens, + decode_one_token, + encode_tokens, get_arch_name, + prefill, write_json_result_local, write_json_result_ossci, ) from torchao.quantization.quant_primitives import MappingType -from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes +from torchao.utils import ( + TORCH_VERSION_AT_LEAST_2_5, + default_device, + device_sync, + get_model_size_in_bytes, +) torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False torch.backends.cuda.enable_cudnn_sdp(True) @@ -49,97 +59,12 @@ def device_timer(device): print(f"device={device} is not yet suppported") -def device_sync(device): - if "cuda" in device: - torch.cuda.synchronize(device) - elif "xpu" in device: - torch.xpu.synchronize(device) - elif ("cpu" in device) or ("mps" in device): - pass - else: - print(f"device={device} is not yet suppported") - - -default_device = ( - "cuda" - if torch.cuda.is_available() - else "xpu" - if torch.xpu.is_available() - else "cpu" -) - # support running without installing as a package wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -from torchao._models.model import Transformer, prepare_inputs_for_model -from torchao._models.tokenizer import get_tokenizer - - -def multinomial_sample_one_no_sync( - probs_sort, -): # Does multinomial sampling without a cuda synchronization - q = torch.empty_like(probs_sort).exponential_(1) - return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) - - -def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): - logits = logits / max(temperature, 1e-5) - - if top_k is not None: - v, _ = torch.topk(logits, min(top_k, logits.size(-1))) - pivot = v.select(-1, -1).unsqueeze(-1) - logits = torch.where(logits < pivot, -float("Inf"), logits) - probs = torch.nn.functional.softmax(logits, dim=-1) - return probs - - -def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): - probs = logits_to_probs(logits[:, -1], temperature, top_k) - idx_next = multinomial_sample_one_no_sync(probs) - return idx_next, probs - - -def prefill( - model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs -) -> torch.Tensor: - # input_pos: [B, S] - logits = model(x, input_pos) - return sample(logits, **sampling_kwargs)[0] - - -def decode_one_token( - model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs -) -> Tuple[torch.Tensor, torch.Tensor]: - # input_pos: [B, 1] - assert input_pos.shape[-1] == 1 - logits = model(x, input_pos) - return sample(logits, **sampling_kwargs) - - -def decode_n_tokens( - model: Transformer, - cur_token: torch.Tensor, - input_pos: torch.Tensor, - num_new_tokens: int, - callback=lambda _: _, - **sampling_kwargs, -): - new_tokens, new_probs = [], [] - for i in range(num_new_tokens): - with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): - next_token, next_prob = decode_one_token( - model, cur_token, input_pos, **sampling_kwargs - ) - next_token, next_prob = next_token.clone(), next_prob.clone() - input_pos += 1 - # in some instances not having this causes weird issues with the stored tokens when you run the next decode_one_token step - new_tokens.append(next_token.clone()) - callback(new_tokens[-1]) - new_probs.append(next_prob) - cur_token = next_token - - return new_tokens, new_probs +from torchao._models.llm.model import Transformer, prepare_inputs_for_model +from torchao._models.llm.tokenizer import get_tokenizer def model_forward(model, x, 
input_pos): @@ -230,25 +155,6 @@ def generate( return seq -def encode_tokens(tokenizer, string, bos=True, device=default_device): - tokens = tokenizer.encode(string) - if bos: - tokens = [tokenizer.bos_id()] + tokens - return torch.tensor(tokens, dtype=torch.int, device=device) - - -def _load_model(checkpoint_path, device, precision): - checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) - if "model" in checkpoint and "stories" in str(checkpoint_path): - checkpoint = checkpoint["model"] - with torch.device("meta"): - model = Transformer.from_name(checkpoint_path.parent.name) - model.load_state_dict(checkpoint, assign=True) - model = model.to(device=device, dtype=precision) - - return model.eval() - - B_INST, E_INST = "[INST]", "[/INST]" @@ -575,8 +481,8 @@ def ffn_or_attn_only(mod, fqn): model, float8_dynamic_activation_float8_weight(granularity=granularity) ) elif "autoquant_v2" in quantization: - from torchao._models.model import prepare_inputs_for_model from torchao._models._eval import InputRecorder + from torchao._models.llm.model import prepare_inputs_for_model from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 calibration_seq_length = 256 @@ -665,8 +571,8 @@ def ffn_or_attn_only(mod, fqn): # do autoquantization model.finalize_autoquant() elif "autoquant" in quantization: - from torchao._models.model import prepare_inputs_for_model from torchao._models._eval import InputRecorder + from torchao._models.llm.model import prepare_inputs_for_model calibration_seq_length = 256 inputs = ( diff --git a/benchmarks/_models/llama/perf_profile.py b/benchmarks/_models/llama/perf_profile.py index 6cd924a493..ffc99be854 100644 --- a/benchmarks/_models/llama/perf_profile.py +++ b/benchmarks/_models/llama/perf_profile.py @@ -116,8 +116,8 @@ import torch from torch.nn.attention import SDPBackend -from torchao._models.model import Transformer -from torchao._models.tokenizer import get_tokenizer +from torchao._models.llm.model import Transformer +from torchao._models.llm.tokenizer import get_tokenizer from torchao.prototype.profiler import ( CUDADeviceSpec, TransformerPerformanceCounter, diff --git a/benchmarks/_models/sam/eval_combo.py b/benchmarks/_models/sam/eval_combo.py index 7f17df4f4f..781c10c935 100644 --- a/benchmarks/_models/sam/eval_combo.py +++ b/benchmarks/_models/sam/eval_combo.py @@ -9,7 +9,7 @@ from metrics import calculate_miou, create_result_entry import torchao -from benchmarks._models.utils import ( +from torchao._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, diff --git a/benchmarks/quantized_training/pretrain_llama2.py b/benchmarks/quantized_training/pretrain_llama2.py index 371831f4ca..5cc6c9ba52 100644 --- a/benchmarks/quantized_training/pretrain_llama2.py +++ b/benchmarks/quantized_training/pretrain_llama2.py @@ -22,13 +22,13 @@ from torch.utils.checkpoint import checkpoint from tqdm import tqdm -from torchao._models.model import ( +from torchao import quantize_ +from torchao._models.llm.model import ( ModelArgs, RMSNorm, Transformer, transformer_configs, ) -from torchao import quantize_ from torchao.prototype import low_bit_optim from torchao.prototype.quantized_training import ( bitnet_training, diff --git a/examples/sam2_amg_server/annotate_with_rle.py b/examples/sam2_amg_server/annotate_with_rle.py index 3c3bbc77b0..55e5512011 100644 --- a/examples/sam2_amg_server/annotate_with_rle.py +++ b/examples/sam2_amg_server/annotate_with_rle.py @@ -14,7 +14,7 @@ ) from tqdm import tqdm -from 
benchmarks._models.sam2.utils.amg import area_from_rle, rle_to_mask +from torchao._models.sam2.utils.amg import area_from_rle, rle_to_mask def timestamped_print(*args, **kwargs): diff --git a/examples/sam2_amg_server/cli.py b/examples/sam2_amg_server/cli.py index b5feac395e..2f6758b7d3 100644 --- a/examples/sam2_amg_server/cli.py +++ b/examples/sam2_amg_server/cli.py @@ -12,9 +12,9 @@ show_anns, ) -from benchmarks._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator -from benchmarks._models.sam2.build_sam import build_sam2 -from benchmarks._models.sam2.utils.amg import rle_to_mask +from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator +from torchao._models.sam2.build_sam import build_sam2 +from torchao._models.sam2.utils.amg import rle_to_mask def main_docstring(): diff --git a/examples/sam2_amg_server/cli_on_modal.py b/examples/sam2_amg_server/cli_on_modal.py index d44de90bf7..5fe56eeb1a 100644 --- a/examples/sam2_amg_server/cli_on_modal.py +++ b/examples/sam2_amg_server/cli_on_modal.py @@ -84,10 +84,10 @@ def build(self): from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator from sam2.build_sam import build_sam2 else: - from benchmarks._models.sam2.automatic_mask_generator import ( + from torchao._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from benchmarks._models.sam2.build_sam import build_sam2 + from torchao._models.sam2.build_sam import build_sam2 os.chdir(f"{TARGET}ao_src_0/examples/sam2_amg_server") import sys @@ -139,11 +139,11 @@ def build(self): from sam2.utils.amg import mask_to_rle_pytorch as mask_to_rle_pytorch_2 from sam2.utils.amg import rle_to_mask else: - from benchmarks._models.sam2.utils.amg import ( + from torchao._models.sam2.utils.amg import ( mask_to_rle_pytorch_2, rle_to_mask, ) - from benchmarks._models.sam2.utils.amg import area_from_rle + from torchao._models.sam2.utils.amg import area_from_rle self.np = np self.tio = tio diff --git a/examples/sam2_amg_server/compare_rle_lists.py b/examples/sam2_amg_server/compare_rle_lists.py index 88be3df491..7a1c78b846 100644 --- a/examples/sam2_amg_server/compare_rle_lists.py +++ b/examples/sam2_amg_server/compare_rle_lists.py @@ -7,7 +7,7 @@ import torch -# from benchmarks._models.sam2.utils.amg import rle_to_mask +# from torchao._models.sam2.utils.amg import rle_to_mask def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray: """Compute a binary mask from an uncompressed RLE.""" h, w = rle["size"] diff --git a/examples/sam2_amg_server/compile_export_utils.py b/examples/sam2_amg_server/compile_export_utils.py index ab6e76a4fc..d1c6fc06fa 100644 --- a/examples/sam2_amg_server/compile_export_utils.py +++ b/examples/sam2_amg_server/compile_export_utils.py @@ -4,7 +4,7 @@ import torch -from benchmarks._models.sam2.sam2_image_predictor import SAM2ImagePredictor +from torchao._models.sam2.sam2_image_predictor import SAM2ImagePredictor # Tools used to avoid compilation cold start and dynamo cache lookups # We take the compiled model and export it using the largest @@ -519,12 +519,12 @@ def set_fast( # A bunch of extra compiles at module level # Note that this can cause recompilations! 
# We might want to guard on that - benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0 = torch.compile( + torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0 = torch.compile( fullgraph=True, dynamic=True - )(benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0) - benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1 = torch.compile( + )(torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0) + torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1 = torch.compile( fullgraph=True, dynamic=True - )(benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1) + )(torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1) mask_generator.calculate_stability_score = torch.compile( fullgraph=True, dynamic=True )(mask_generator.calculate_stability_score) diff --git a/examples/sam2_amg_server/generate_data.py b/examples/sam2_amg_server/generate_data.py index dc82348d0b..50eeccb912 100644 --- a/examples/sam2_amg_server/generate_data.py +++ b/examples/sam2_amg_server/generate_data.py @@ -192,7 +192,7 @@ def gen_masks_ao_batch( center_points_label_torch_batch = [ torch.from_numpy(t).unsqueeze(1) for t in center_points_label_batch ] - from benchmarks._models.sam2.map_tensor import to_map_tensor + from torchao._models.sam2.map_tensor import to_map_tensor center_points_torch_batch = list(map(to_map_tensor, center_points_torch_batch)) center_points_label_torch_batch = list( @@ -255,7 +255,7 @@ def gen_masks_ao( center_points_torch = torch.from_numpy(center_points).unsqueeze(1) center_points_label_torch = torch.from_numpy(center_points_label).unsqueeze(1) - from benchmarks._models.sam2.map_tensor import to_map_tensor + from torchao._models.sam2.map_tensor import to_map_tensor center_points_torch = to_map_tensor(center_points_torch) center_points_label_torch = to_map_tensor(center_points_label_torch) @@ -532,11 +532,11 @@ def main( from sam2.build_sam import build_sam2 from sam2.utils.amg import mask_to_rle_pytorch else: - from benchmarks._models.sam2.automatic_mask_generator import ( + from torchao._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from benchmarks._models.sam2.build_sam import build_sam2 - from benchmarks._models.sam2.utils.amg import ( + from torchao._models.sam2.build_sam import build_sam2 + from torchao._models.sam2.utils.amg import ( mask_to_rle_pytorch_2 as mask_to_rle_pytorch, ) torch.manual_seed(seed) diff --git a/examples/sam2_amg_server/server.py b/examples/sam2_amg_server/server.py index ea9953dbed..7e35858590 100644 --- a/examples/sam2_amg_server/server.py +++ b/examples/sam2_amg_server/server.py @@ -26,7 +26,7 @@ from fastapi.responses import StreamingResponse from torch._inductor import config as inductorconfig -from benchmarks._models.utils import ( +from torchao._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, @@ -460,11 +460,11 @@ def main( from sam2.build_sam import build_sam2 from sam2.utils.amg import rle_to_mask else: - from benchmarks._models.sam2.automatic_mask_generator import ( + from torchao._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from benchmarks._models.sam2.build_sam import build_sam2 - from benchmarks._models.sam2.utils.amg import rle_to_mask + from torchao._models.sam2.build_sam import build_sam2 + from torchao._models.sam2.utils.amg import rle_to_mask device = "cuda" sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type) diff --git a/examples/sam2_vos_example/compile_export_utils.py 
b/examples/sam2_vos_example/compile_export_utils.py index 00f1b56794..7d1b3eddf3 100644 --- a/examples/sam2_vos_example/compile_export_utils.py +++ b/examples/sam2_vos_example/compile_export_utils.py @@ -4,7 +4,7 @@ import torch -from benchmarks._models.sam2.sam2_video_predictor import SAM2VideoPredictor +from torchao._models.sam2.sam2_video_predictor import SAM2VideoPredictor # Tools used to avoid compilation cold start and dynamo cache lookups # We take the compiled model and export it using the largest diff --git a/examples/sam2_vos_example/video_profile.py b/examples/sam2_vos_example/video_profile.py index 44b90bd77b..8ee9151cc4 100644 --- a/examples/sam2_vos_example/video_profile.py +++ b/examples/sam2_vos_example/video_profile.py @@ -280,7 +280,7 @@ def main( if use_baseline: from sam2.build_sam import build_sam2_video_predictor else: - from benchmarks._models.sam2.build_sam import build_sam2_video_predictor + from torchao._models.sam2.build_sam import build_sam2_video_predictor device = "cuda:0" # hydra_overrides_extra = ["++model.compile_image_encoder=true"] @@ -292,7 +292,7 @@ def main( ) predictor._frame_batch_size = frame_batch_size predictor.image_encoder.trunk = predictor.image_encoder.trunk.to(torch.bfloat16) - from benchmarks._models.sam2.modeling.sam.transformer import RoPEAttention + from torchao._models.sam2.modeling.sam.transformer import RoPEAttention rope_attention_modules = [ module for module in predictor.modules() if isinstance(module, RoPEAttention) diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index 128e6120ef..24ba4717e8 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -14,7 +14,7 @@ import torch from safetensors.torch import load_file as load_safetensors_file -from torchao._models.model import ModelArgs +from torchao._models.llm.model import ModelArgs @torch.inference_mode() diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py index 36a47a9f89..99c2955360 100644 --- a/test/prototype/test_spinquant.py +++ b/test/prototype/test_spinquant.py @@ -1,7 +1,7 @@ import pytest import torch -from torchao._models.model import Transformer +from torchao._models.llm.model import Transformer from torchao.prototype.spinquant import apply_spinquant diff --git a/test/quantization/test_gptq_mt.py b/test/quantization/test_gptq_mt.py index 40e234e8c8..1064c41841 100644 --- a/test/quantization/test_gptq_mt.py +++ b/test/quantization/test_gptq_mt.py @@ -5,8 +5,8 @@ import torch.nn.functional as F from torch.testing._internal.common_utils import run_tests -from torchao._models.model import Transformer, prepare_inputs_for_model -from torchao._models.tokenizer import get_tokenizer +from torchao._models.llm.model import Transformer, prepare_inputs_for_model +from torchao._models.llm.tokenizer import get_tokenizer from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer, MultiTensor from torchao.quantization.utils import _lm_eval_available from torchao.utils import is_fbcode diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index ecb550729a..72359c78af 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -22,8 +22,8 @@ from torch.testing._internal.common_utils import TestCase from torchao import quantize_ -from torchao._models.model import Transformer, prepare_inputs_for_model -from torchao._models.tokenizer import get_tokenizer +from torchao._models.llm.model import Transformer, 
prepare_inputs_for_model +from torchao._models.llm.tokenizer import get_tokenizer from torchao.dtypes import AffineQuantizedTensor from torchao.quantization import LinearActivationQuantizedTensor from torchao.quantization.quant_api import ( diff --git a/test/test_ao_models.py b/test/test_ao_models.py index 05af8ef7d9..f31188802b 100644 --- a/test/test_ao_models.py +++ b/test/test_ao_models.py @@ -1,7 +1,7 @@ import pytest import torch -from torchao._models.model import Transformer +from torchao._models.llm.model import Transformer _AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) diff --git a/benchmarks/_models/README.md b/torchao/_models/README.md similarity index 100% rename from benchmarks/_models/README.md rename to torchao/_models/README.md diff --git a/torchao/_models/llm/__init__.py b/torchao/_models/llm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/torchao/_models/model.py b/torchao/_models/llm/model.py similarity index 100% rename from torchao/_models/model.py rename to torchao/_models/llm/model.py diff --git a/torchao/_models/tokenizer.py b/torchao/_models/llm/tokenizer.py similarity index 100% rename from torchao/_models/tokenizer.py rename to torchao/_models/llm/tokenizer.py diff --git a/benchmarks/_models/sam2/__init__.py b/torchao/_models/sam2/__init__.py similarity index 81% rename from benchmarks/_models/sam2/__init__.py rename to torchao/_models/sam2/__init__.py index f49e12ba4e..0dc11c2fde 100644 --- a/benchmarks/_models/sam2/__init__.py +++ b/torchao/_models/sam2/__init__.py @@ -8,4 +8,4 @@ from hydra.core.global_hydra import GlobalHydra if not GlobalHydra.instance().is_initialized(): - initialize_config_module("benchmarks._models.sam2", version_base="1.2") + initialize_config_module("torchao._models.sam2", version_base="1.2") diff --git a/benchmarks/_models/sam2/automatic_mask_generator.py b/torchao/_models/sam2/automatic_mask_generator.py similarity index 99% rename from benchmarks/_models/sam2/automatic_mask_generator.py rename to torchao/_models/sam2/automatic_mask_generator.py index 4e82f3ef04..6f4f1d3e7b 100644 --- a/benchmarks/_models/sam2/automatic_mask_generator.py +++ b/torchao/_models/sam2/automatic_mask_generator.py @@ -11,9 +11,9 @@ import torch from torchvision.ops.boxes import batched_nms, box_area # type: ignore -from benchmarks._models.sam2.modeling.sam2_base import SAM2Base -from benchmarks._models.sam2.sam2_image_predictor import SAM2ImagePredictor -from benchmarks._models.sam2.utils.amg import ( +from torchao._models.sam2.modeling.sam2_base import SAM2Base +from torchao._models.sam2.sam2_image_predictor import SAM2ImagePredictor +from torchao._models.sam2.utils.amg import ( MaskData, _mask_to_rle_pytorch_2_0, _mask_to_rle_pytorch_2_1, @@ -33,7 +33,7 @@ uncrop_masks, uncrop_points, ) -from benchmarks._models.sam2.utils.misc import ( +from torchao._models.sam2.utils.misc import ( crop_image, get_image_size, ) diff --git a/benchmarks/_models/sam2/build_sam.py b/torchao/_models/sam2/build_sam.py similarity index 98% rename from benchmarks/_models/sam2/build_sam.py rename to torchao/_models/sam2/build_sam.py index eea26ccee4..70c4b81d09 100644 --- a/benchmarks/_models/sam2/build_sam.py +++ b/torchao/_models/sam2/build_sam.py @@ -106,7 +106,7 @@ def build_sam2_video_predictor( **kwargs, ): hydra_overrides = [ - "++model._target_=benchmarks._models.sam2.sam2_video_predictor.SAM2VideoPredictor", + "++model._target_=torchao._models.sam2.sam2_video_predictor.SAM2VideoPredictor", ] if 
apply_postprocessing: hydra_overrides_extra = hydra_overrides_extra.copy() diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml similarity index 71% rename from benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml rename to torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml index 1742a20e95..42cd897c67 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml +++ b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml @@ -2,18 +2,18 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 112 num_heads: 2 neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -24,17 +24,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -45,7 +45,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -57,23 +57,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml 
b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml similarity index 72% rename from benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml rename to torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml index 17bf334745..ba9dafd489 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml +++ b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml @@ -2,12 +2,12 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 144 num_heads: 2 stages: [2, 6, 36, 4] @@ -15,9 +15,9 @@ model: window_pos_embed_bkg_spatial_size: [7, 7] window_spec: [8, 4, 16, 8] neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -28,17 +28,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -49,7 +49,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -61,23 +61,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml 
b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml similarity index 72% rename from benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml rename to torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml index 7b5f000254..898898b158 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml +++ b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 11, 2] global_att_blocks: [7, 10, 13] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml similarity index 
72% rename from benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml rename to torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml index 84c6e92e9c..c6318f843b 100644 --- a/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml +++ b/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 7, 2] global_att_blocks: [5, 7, 9] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml b/torchao/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml similarity index 100% rename from 
benchmarks/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml rename to torchao/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml b/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml similarity index 70% rename from benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml rename to torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml index 0f6c1c56cc..b3ba469471 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml +++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml @@ -2,18 +2,18 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 112 num_heads: 2 neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -24,17 +24,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -45,7 +45,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -57,23 +57,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git 
a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml b/torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml similarity index 71% rename from benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml rename to torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml index 4baf4e38eb..59a8a1e36b 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml +++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml @@ -2,12 +2,12 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 144 num_heads: 2 stages: [2, 6, 36, 4] @@ -15,9 +15,9 @@ model: window_pos_embed_bkg_spatial_size: [7, 7] window_spec: [8, 4, 16, 8] neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -28,17 +28,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -49,7 +49,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -61,23 +61,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml 
b/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml similarity index 71% rename from benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml rename to torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml index 84b4b52a8e..b051d3be63 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml +++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 11, 2] global_att_blocks: [7, 10, 13] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml b/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml similarity index 72% rename from 
benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml rename to torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml index b572a7e4ee..6b108e708f 100644 --- a/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml +++ b/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base + _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera + _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 7, 2] global_att_blocks: [5, 7, 9] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser + _target_: torchao._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock + _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/benchmarks/_models/sam2/csrc/connected_components.cu b/torchao/_models/sam2/csrc/connected_components.cu similarity index 100% rename from benchmarks/_models/sam2/csrc/connected_components.cu rename to 
torchao/_models/sam2/csrc/connected_components.cu diff --git a/benchmarks/_models/sam2/map_tensor.py b/torchao/_models/sam2/map_tensor.py similarity index 100% rename from benchmarks/_models/sam2/map_tensor.py rename to torchao/_models/sam2/map_tensor.py diff --git a/benchmarks/_models/sam2/modeling/__init__.py b/torchao/_models/sam2/modeling/__init__.py similarity index 100% rename from benchmarks/_models/sam2/modeling/__init__.py rename to torchao/_models/sam2/modeling/__init__.py diff --git a/benchmarks/_models/sam2/modeling/backbones/__init__.py b/torchao/_models/sam2/modeling/backbones/__init__.py similarity index 100% rename from benchmarks/_models/sam2/modeling/backbones/__init__.py rename to torchao/_models/sam2/modeling/backbones/__init__.py diff --git a/benchmarks/_models/sam2/modeling/backbones/hieradet.py b/torchao/_models/sam2/modeling/backbones/hieradet.py similarity index 98% rename from benchmarks/_models/sam2/modeling/backbones/hieradet.py rename to torchao/_models/sam2/modeling/backbones/hieradet.py index b56c983c8f..91e98f795e 100644 --- a/benchmarks/_models/sam2/modeling/backbones/hieradet.py +++ b/torchao/_models/sam2/modeling/backbones/hieradet.py @@ -13,12 +13,12 @@ import torch.nn.functional as F from iopath.common.file_io import g_pathmgr -from benchmarks._models.sam2.modeling.backbones.utils import ( +from torchao._models.sam2.modeling.backbones.utils import ( PatchEmbed, window_partition, window_unpartition, ) -from benchmarks._models.sam2.modeling.sam2_utils import MLP, DropPath +from torchao._models.sam2.modeling.sam2_utils import MLP, DropPath def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor: diff --git a/benchmarks/_models/sam2/modeling/backbones/image_encoder.py b/torchao/_models/sam2/modeling/backbones/image_encoder.py similarity index 98% rename from benchmarks/_models/sam2/modeling/backbones/image_encoder.py rename to torchao/_models/sam2/modeling/backbones/image_encoder.py index efa1d963e4..0f0a256867 100644 --- a/benchmarks/_models/sam2/modeling/backbones/image_encoder.py +++ b/torchao/_models/sam2/modeling/backbones/image_encoder.py @@ -29,7 +29,7 @@ def __init__( def forward(self, sample: torch.Tensor): # Forward through backbone with torch.autograd.profiler.record_function("self.neck(self.trunk(sample))"): - from benchmarks._models.sam2.map_tensor import MapTensor, to_map_tensor + from torchao._models.sam2.map_tensor import MapTensor, to_map_tensor if isinstance(sample, MapTensor): features, pos = self.neck(self.trunk(sample.elems.flatten(0, 1))) diff --git a/benchmarks/_models/sam2/modeling/backbones/utils.py b/torchao/_models/sam2/modeling/backbones/utils.py similarity index 100% rename from benchmarks/_models/sam2/modeling/backbones/utils.py rename to torchao/_models/sam2/modeling/backbones/utils.py diff --git a/benchmarks/_models/sam2/modeling/memory_attention.py b/torchao/_models/sam2/modeling/memory_attention.py similarity index 97% rename from benchmarks/_models/sam2/modeling/memory_attention.py rename to torchao/_models/sam2/modeling/memory_attention.py index c32707cf31..5ac6288af0 100644 --- a/benchmarks/_models/sam2/modeling/memory_attention.py +++ b/torchao/_models/sam2/modeling/memory_attention.py @@ -9,8 +9,8 @@ import torch from torch import Tensor, nn -from benchmarks._models.sam2.modeling.sam.transformer import RoPEAttention -from benchmarks._models.sam2.modeling.sam2_utils import get_activation_fn, get_clones +from torchao._models.sam2.modeling.sam.transformer import RoPEAttention +from 
torchao._models.sam2.modeling.sam2_utils import get_activation_fn, get_clones class MemoryAttentionLayer(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/memory_encoder.py b/torchao/_models/sam2/modeling/memory_encoder.py similarity index 98% rename from benchmarks/_models/sam2/modeling/memory_encoder.py rename to torchao/_models/sam2/modeling/memory_encoder.py index 84116aa225..a13f4077cd 100644 --- a/benchmarks/_models/sam2/modeling/memory_encoder.py +++ b/torchao/_models/sam2/modeling/memory_encoder.py @@ -11,7 +11,7 @@ import torch.nn as nn import torch.nn.functional as F -from benchmarks._models.sam2.modeling.sam2_utils import ( +from torchao._models.sam2.modeling.sam2_utils import ( DropPath, LayerNorm2d, get_clones, diff --git a/benchmarks/_models/sam2/modeling/position_encoding.py b/torchao/_models/sam2/modeling/position_encoding.py similarity index 100% rename from benchmarks/_models/sam2/modeling/position_encoding.py rename to torchao/_models/sam2/modeling/position_encoding.py diff --git a/benchmarks/_models/sam2/modeling/sam/__init__.py b/torchao/_models/sam2/modeling/sam/__init__.py similarity index 100% rename from benchmarks/_models/sam2/modeling/sam/__init__.py rename to torchao/_models/sam2/modeling/sam/__init__.py diff --git a/benchmarks/_models/sam2/modeling/sam/mask_decoder.py b/torchao/_models/sam2/modeling/sam/mask_decoder.py similarity index 99% rename from benchmarks/_models/sam2/modeling/sam/mask_decoder.py rename to torchao/_models/sam2/modeling/sam/mask_decoder.py index 1c29113197..7d25697018 100644 --- a/benchmarks/_models/sam2/modeling/sam/mask_decoder.py +++ b/torchao/_models/sam2/modeling/sam/mask_decoder.py @@ -9,7 +9,7 @@ import torch from torch import nn -from benchmarks._models.sam2.modeling.sam2_utils import MLP, LayerNorm2d +from torchao._models.sam2.modeling.sam2_utils import MLP, LayerNorm2d class MaskDecoder(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py b/torchao/_models/sam2/modeling/sam/prompt_encoder.py similarity index 98% rename from benchmarks/_models/sam2/modeling/sam/prompt_encoder.py rename to torchao/_models/sam2/modeling/sam/prompt_encoder.py index 2c3abbfa34..94b7fda8b2 100644 --- a/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py +++ b/torchao/_models/sam2/modeling/sam/prompt_encoder.py @@ -9,8 +9,8 @@ import torch from torch import nn -from benchmarks._models.sam2.modeling.position_encoding import PositionEmbeddingRandom -from benchmarks._models.sam2.modeling.sam2_utils import LayerNorm2d +from torchao._models.sam2.modeling.position_encoding import PositionEmbeddingRandom +from torchao._models.sam2.modeling.sam2_utils import LayerNorm2d class PromptEncoder(nn.Module): diff --git a/benchmarks/_models/sam2/modeling/sam/transformer.py b/torchao/_models/sam2/modeling/sam/transformer.py similarity index 98% rename from benchmarks/_models/sam2/modeling/sam/transformer.py rename to torchao/_models/sam2/modeling/sam/transformer.py index 3c6d3b83cd..bf0b58d6fd 100644 --- a/benchmarks/_models/sam2/modeling/sam/transformer.py +++ b/torchao/_models/sam2/modeling/sam/transformer.py @@ -14,12 +14,12 @@ import torch.nn.functional as F from torch import Tensor, nn -from benchmarks._models.sam2.modeling.position_encoding import ( +from torchao._models.sam2.modeling.position_encoding import ( apply_rotary_enc, compute_axial_cis, ) -from benchmarks._models.sam2.modeling.sam2_utils import MLP -from benchmarks._models.sam2.utils.misc import get_sdpa_settings +from torchao._models.sam2.modeling.sam2_utils 
import MLP +from torchao._models.sam2.utils.misc import get_sdpa_settings warnings.simplefilter(action="ignore", category=FutureWarning) # Check whether Flash Attention is available (and use it by default) diff --git a/benchmarks/_models/sam2/modeling/sam2_base.py b/torchao/_models/sam2/modeling/sam2_base.py similarity index 99% rename from benchmarks/_models/sam2/modeling/sam2_base.py rename to torchao/_models/sam2/modeling/sam2_base.py index c5d1f54829..4c2a24a0ef 100644 --- a/benchmarks/_models/sam2/modeling/sam2_base.py +++ b/torchao/_models/sam2/modeling/sam2_base.py @@ -9,10 +9,10 @@ import torch.nn.functional as F from torch.nn.init import trunc_normal_ -from benchmarks._models.sam2.modeling.sam.mask_decoder import MaskDecoder -from benchmarks._models.sam2.modeling.sam.prompt_encoder import PromptEncoder -from benchmarks._models.sam2.modeling.sam.transformer import TwoWayTransformer -from benchmarks._models.sam2.modeling.sam2_utils import ( +from torchao._models.sam2.modeling.sam.mask_decoder import MaskDecoder +from torchao._models.sam2.modeling.sam.prompt_encoder import PromptEncoder +from torchao._models.sam2.modeling.sam.transformer import TwoWayTransformer +from torchao._models.sam2.modeling.sam2_utils import ( MLP, get_1d_sine_pe, select_closest_cond_frames, diff --git a/benchmarks/_models/sam2/modeling/sam2_utils.py b/torchao/_models/sam2/modeling/sam2_utils.py similarity index 99% rename from benchmarks/_models/sam2/modeling/sam2_utils.py rename to torchao/_models/sam2/modeling/sam2_utils.py index 1c00f534e3..579bfc671a 100644 --- a/benchmarks/_models/sam2/modeling/sam2_utils.py +++ b/torchao/_models/sam2/modeling/sam2_utils.py @@ -13,7 +13,7 @@ import torch.nn as nn import torch.nn.functional as F -from benchmarks._models.sam2.utils.misc import mask_to_box +from torchao._models.sam2.utils.misc import mask_to_box def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num): diff --git a/benchmarks/_models/sam2/sam2_hiera_b+.yaml b/torchao/_models/sam2/sam2_hiera_b+.yaml similarity index 100% rename from benchmarks/_models/sam2/sam2_hiera_b+.yaml rename to torchao/_models/sam2/sam2_hiera_b+.yaml diff --git a/benchmarks/_models/sam2/sam2_hiera_l.yaml b/torchao/_models/sam2/sam2_hiera_l.yaml similarity index 100% rename from benchmarks/_models/sam2/sam2_hiera_l.yaml rename to torchao/_models/sam2/sam2_hiera_l.yaml diff --git a/benchmarks/_models/sam2/sam2_hiera_s.yaml b/torchao/_models/sam2/sam2_hiera_s.yaml similarity index 100% rename from benchmarks/_models/sam2/sam2_hiera_s.yaml rename to torchao/_models/sam2/sam2_hiera_s.yaml diff --git a/benchmarks/_models/sam2/sam2_hiera_t.yaml b/torchao/_models/sam2/sam2_hiera_t.yaml similarity index 100% rename from benchmarks/_models/sam2/sam2_hiera_t.yaml rename to torchao/_models/sam2/sam2_hiera_t.yaml diff --git a/benchmarks/_models/sam2/sam2_image_predictor.py b/torchao/_models/sam2/sam2_image_predictor.py similarity index 99% rename from benchmarks/_models/sam2/sam2_image_predictor.py rename to torchao/_models/sam2/sam2_image_predictor.py index a2c53bdf0a..a4aa1c668c 100644 --- a/benchmarks/_models/sam2/sam2_image_predictor.py +++ b/torchao/_models/sam2/sam2_image_predictor.py @@ -11,9 +11,9 @@ import torch from PIL.Image import Image -from benchmarks._models.sam2.modeling.sam2_base import SAM2Base -from benchmarks._models.sam2.utils.misc import get_image_size -from benchmarks._models.sam2.utils.transforms import SAM2Transforms +from torchao._models.sam2.modeling.sam2_base import SAM2Base +from 
torchao._models.sam2.utils.misc import get_image_size +from torchao._models.sam2.utils.transforms import SAM2Transforms class SAM2ImagePredictor(torch.nn.Module): diff --git a/benchmarks/_models/sam2/sam2_video_predictor.py b/torchao/_models/sam2/sam2_video_predictor.py similarity index 99% rename from benchmarks/_models/sam2/sam2_video_predictor.py rename to torchao/_models/sam2/sam2_video_predictor.py index 6715178958..53b0a11d7c 100644 --- a/benchmarks/_models/sam2/sam2_video_predictor.py +++ b/torchao/_models/sam2/sam2_video_predictor.py @@ -10,8 +10,8 @@ import torch from tqdm import tqdm -from benchmarks._models.sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base -from benchmarks._models.sam2.utils.misc import ( +from torchao._models.sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base +from torchao._models.sam2.utils.misc import ( concat_points, fill_holes_in_mask_scores, load_video_frames, @@ -52,7 +52,7 @@ def batch_inference_states(inference_states: list): batched_inference_state = copy.copy(inference_states[0]) - from benchmarks._models.sam2.map_tensor import to_map_tensor + from torchao._models.sam2.map_tensor import to_map_tensor # NOTE: Making a build assumption only images differ all_images = torch.stack([state["images"] for state in inference_states]) diff --git a/benchmarks/_models/sam2/utils/__init__.py b/torchao/_models/sam2/utils/__init__.py similarity index 100% rename from benchmarks/_models/sam2/utils/__init__.py rename to torchao/_models/sam2/utils/__init__.py diff --git a/benchmarks/_models/sam2/utils/amg.py b/torchao/_models/sam2/utils/amg.py similarity index 100% rename from benchmarks/_models/sam2/utils/amg.py rename to torchao/_models/sam2/utils/amg.py diff --git a/benchmarks/_models/sam2/utils/misc.py b/torchao/_models/sam2/utils/misc.py similarity index 100% rename from benchmarks/_models/sam2/utils/misc.py rename to torchao/_models/sam2/utils/misc.py diff --git a/benchmarks/_models/sam2/utils/transforms.py b/torchao/_models/sam2/utils/transforms.py similarity index 97% rename from benchmarks/_models/sam2/utils/transforms.py rename to torchao/_models/sam2/utils/transforms.py index 2d5e46193b..c616233050 100644 --- a/benchmarks/_models/sam2/utils/transforms.py +++ b/torchao/_models/sam2/utils/transforms.py @@ -78,7 +78,7 @@ def postprocess_masks( """ Perform PostProcessing on output masks. """ - from benchmarks._models.sam2.utils.misc import get_connected_components + from torchao._models.sam2.utils.misc import get_connected_components masks = masks.float() input_masks = masks @@ -125,7 +125,7 @@ def postprocess_masks_1_channel( """ Perform PostProcessing on output masks. 
""" - from benchmarks._models.sam2.utils.misc import get_connected_components + from torchao._models.sam2.utils.misc import get_connected_components assert masks.dim() == 4 assert masks.size(1) == 1 diff --git a/benchmarks/_models/utils.py b/torchao/_models/utils.py similarity index 54% rename from benchmarks/_models/utils.py rename to torchao/_models/utils.py index 346feb57ae..bffe33aacc 100644 --- a/benchmarks/_models/utils.py +++ b/torchao/_models/utils.py @@ -4,9 +4,13 @@ import os import platform import time +from typing import Optional, Tuple import torch +from torchao._models.llm.model import Transformer +from torchao.utils import default_device + def get_arch_name() -> str: if torch.cuda.is_available(): @@ -104,3 +108,88 @@ def write_json_result_local(output_json_path, headers, row): with open(f"{os.path.splitext(output_json_path)[0]}.json", "a") as f: print(json.dumps(record), file=f) + + +def encode_tokens(tokenizer, string, bos=True, device=default_device): + tokens = tokenizer.encode(string) + if bos: + tokens = [tokenizer.bos_id()] + tokens + return torch.tensor(tokens, dtype=torch.int, device=device) + + +def _load_model(checkpoint_path, device, precision): + checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True) + if "model" in checkpoint and "stories" in str(checkpoint_path): + checkpoint = checkpoint["model"] + with torch.device("meta"): + model = Transformer.from_name(checkpoint_path.parent.name) + model.load_state_dict(checkpoint, assign=True) + model = model.to(device=device, dtype=precision) + + return model.eval() + + +def multinomial_sample_one_no_sync( + probs_sort, +): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + +def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + +def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[:, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + +def prefill( + model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs +) -> torch.Tensor: + # input_pos: [B, S] + logits = model(x, input_pos) + return sample(logits, **sampling_kwargs)[0] + + +def decode_one_token( + model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs +) -> Tuple[torch.Tensor, torch.Tensor]: + # input_pos: [B, 1] + assert input_pos.shape[-1] == 1 + logits = model(x, input_pos) + return sample(logits, **sampling_kwargs) + + +def decode_n_tokens( + model: Transformer, + cur_token: torch.Tensor, + input_pos: torch.Tensor, + num_new_tokens: int, + callback=lambda _: _, + **sampling_kwargs, +): + new_tokens, new_probs = [], [] + for i in range(num_new_tokens): + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): + next_token, next_prob = decode_one_token( + model, cur_token, input_pos, **sampling_kwargs + ) + next_token, next_prob = next_token.clone(), next_prob.clone() + input_pos += 1 + # in some instances not having this causes weird issues with the stored tokens when you run the next decode_one_token step 
+        new_tokens.append(next_token.clone())
+        callback(new_tokens[-1])
+        new_probs.append(next_prob)
+        cur_token = next_token
+
+    return new_tokens, new_probs
diff --git a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py
index 46af69f670..a0e62b9ebd 100644
--- a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py
+++ b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py
@@ -19,14 +19,16 @@
 import torchao
 from torchao._models.generate import (
-    _load_model,
     decode_one_token,
-    device_sync,
-    encode_tokens,
     prefill,
 )
-from torchao._models.model import Transformer, prepare_inputs_for_model
-from torchao._models.tokenizer import get_tokenizer
+from torchao._models.llm.model import Transformer, prepare_inputs_for_model
+from torchao._models.llm.tokenizer import get_tokenizer
+from torchao._models.utils import (
+    _load_model,
+    encode_tokens,
+)
+from torchao.utils import device_sync
 
 default_device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
@@ -99,7 +101,7 @@ def generate(
         _replace_with_custom_fn_if_matches_filter(
             model,
             AffineQuantizedKVCache.from_float,
-            lambda x, y: isinstance(x, torchao._models.model.KVCache),
+            lambda x, y: isinstance(x, torchao._models.llm.model.KVCache),
         )
 
     # format model input
diff --git a/torchao/prototype/spinquant/spinquant.py b/torchao/prototype/spinquant/spinquant.py
index 83a1bc0b30..ce78cc0cc5 100644
--- a/torchao/prototype/spinquant/spinquant.py
+++ b/torchao/prototype/spinquant/spinquant.py
@@ -10,7 +10,7 @@
 import torch
 from torch import nn
 
-from torchao._models.model import RMSNorm, Transformer
+from torchao._models.llm.model import RMSNorm, Transformer
 from torchao.prototype.spinquant.hadamard_utils import (
     apply_exact_had_to_linear,
     get_hadK,
diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py
index dc4145d18d..88febdd532 100644
--- a/torchao/quantization/GPTQ.py
+++ b/torchao/quantization/GPTQ.py
@@ -81,7 +81,7 @@ def __init__(
         # needed for GPTQ on the torchao llama model
         import torchao
 
-        torchao._models.model.use_index_put_for_kv_cache = True
+        torchao._models.llm.model.use_index_put_for_kv_cache = True
         exported_model = torch._dynamo.export(
             model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake"
         )(*one_input)
diff --git a/torchao/utils.py b/torchao/utils.py
index 2a67f8a9c9..c814fd7b27 100644
--- a/torchao/utils.py
+++ b/torchao/utils.py
@@ -641,6 +641,26 @@ def is_sm_at_least_100():
     )
 
 
+default_device = (
+    "cuda"
+    if torch.cuda.is_available()
+    else "xpu"
+    if torch.xpu.is_available()
+    else "cpu"
+)
+
+
+def device_sync(device):
+    if "cuda" in device:
+        torch.cuda.synchronize(device)
+    elif "xpu" in device:
+        torch.xpu.synchronize(device)
+    elif ("cpu" in device) or ("mps" in device):
+        pass
+    else:
+        print(f"device={device} is not yet supported")
+
+
 TORCH_VERSION_AFTER_2_5 = _torch_version_at_least("2.5.0.dev")
 TORCH_VERSION_AFTER_2_4 = _torch_version_at_least("2.4.0.dev")
 TORCH_VERSION_AFTER_2_3 = _torch_version_at_least("2.3.0.dev")

From c7db10c1147a79c887739e54ed9208d2953252aa Mon Sep 17 00:00:00 2001
From: jainapurva
Date: Fri, 28 Feb 2025 12:01:40 -0800
Subject: [PATCH 6/7] Updates

---
 {torchao => benchmarks}/_models/README.md | 0
 {torchao => benchmarks}/_models/_eval.py | 0
 benchmarks/_models/llama/eval.py | 21 ++++++++------
 benchmarks/_models/llama/generate.py | 16 +++++------
 .../llm => benchmarks/_models/llama}/model.py | 0
benchmarks/_models/llama/perf_profile.py | 4 +-- .../_models/llama}/tokenizer.py | 0 .../_models/sam}/__init__.py | 0 benchmarks/_models/sam/eval_combo.py | 2 +- .../_models/sam2/__init__.py | 2 +- .../_models/sam2/automatic_mask_generator.py | 8 +++--- .../_models/sam2/build_sam.py | 2 +- .../sam2/configs/sam2.1/sam2.1_hiera_b+.yaml | 28 +++++++++---------- .../sam2/configs/sam2.1/sam2.1_hiera_l.yaml | 28 +++++++++---------- .../sam2/configs/sam2.1/sam2.1_hiera_s.yaml | 28 +++++++++---------- .../sam2/configs/sam2.1/sam2.1_hiera_t.yaml | 28 +++++++++---------- .../sam2.1_hiera_b+_MOSE_finetune.yaml | 0 .../sam2/configs/sam2/sam2_hiera_b+.yaml | 28 +++++++++---------- .../sam2/configs/sam2/sam2_hiera_l.yaml | 28 +++++++++---------- .../sam2/configs/sam2/sam2_hiera_s.yaml | 28 +++++++++---------- .../sam2/configs/sam2/sam2_hiera_t.yaml | 28 +++++++++---------- .../_models/sam2/csrc/connected_components.cu | 0 .../_models/sam2/map_tensor.py | 0 .../_models/sam2/modeling/__init__.py | 0 .../sam2/modeling/backbones/__init__.py | 0 .../sam2/modeling/backbones/hieradet.py | 4 +-- .../sam2/modeling/backbones/image_encoder.py | 2 +- .../_models/sam2/modeling/backbones/utils.py | 0 .../_models/sam2/modeling/memory_attention.py | 4 +-- .../_models/sam2/modeling/memory_encoder.py | 2 +- .../sam2/modeling/position_encoding.py | 0 .../_models/sam2/modeling/sam/__init__.py | 0 .../_models/sam2/modeling/sam/mask_decoder.py | 2 +- .../sam2/modeling/sam/prompt_encoder.py | 4 +-- .../_models/sam2/modeling/sam/transformer.py | 6 ++-- .../_models/sam2/modeling/sam2_base.py | 8 +++--- .../_models/sam2/modeling/sam2_utils.py | 2 +- .../_models/sam2/sam2_hiera_b+.yaml | 0 .../_models/sam2/sam2_hiera_l.yaml | 0 .../_models/sam2/sam2_hiera_s.yaml | 0 .../_models/sam2/sam2_hiera_t.yaml | 0 .../_models/sam2/sam2_image_predictor.py | 6 ++-- .../_models/sam2/sam2_video_predictor.py | 6 ++-- .../_models/sam2/utils/__init__.py | 0 .../_models/sam2/utils/amg.py | 0 .../_models/sam2/utils/misc.py | 0 .../_models/sam2/utils/transforms.py | 4 +-- {torchao => benchmarks}/_models/utils.py | 2 +- .../quantized_training/pretrain_llama2.py | 4 +-- examples/sam2_amg_server/annotate_with_rle.py | 2 +- examples/sam2_amg_server/cli.py | 6 ++-- examples/sam2_amg_server/cli_on_modal.py | 8 +++--- examples/sam2_amg_server/compare_rle_lists.py | 2 +- .../sam2_amg_server/compile_export_utils.py | 12 ++++---- examples/sam2_amg_server/generate_data.py | 10 +++---- examples/sam2_amg_server/server.py | 8 +++--- .../sam2_vos_example/compile_export_utils.py | 2 +- examples/sam2_vos_example/video_profile.py | 4 +-- scripts/convert_hf_checkpoint.py | 2 +- test/prototype/test_spinquant.py | 2 +- test/quantization/test_gptq_mt.py | 4 +-- test/quantization/test_quant_api.py | 16 +++++------ test/test_ao_models.py | 2 +- torchao/_models/llama/__init__.py | 0 .../scripts/BO_acc_throughput.py | 16 ++++++----- torchao/prototype/spinquant/spinquant.py | 2 +- torchao/quantization/GPTQ.py | 4 +-- torchao/quantization/README.md | 2 +- 68 files changed, 222 insertions(+), 217 deletions(-) rename {torchao => benchmarks}/_models/README.md (100%) rename {torchao => benchmarks}/_models/_eval.py (100%) rename {torchao/_models/llm => benchmarks/_models/llama}/model.py (100%) rename {torchao/_models/llm => benchmarks/_models/llama}/tokenizer.py (100%) rename {torchao/_models => benchmarks/_models/sam}/__init__.py (100%) rename {torchao => benchmarks}/_models/sam2/__init__.py (81%) rename {torchao => benchmarks}/_models/sam2/automatic_mask_generator.py (99%) rename 
{torchao => benchmarks}/_models/sam2/build_sam.py (98%) rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml (71%) rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml (72%) rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml (72%) rename {torchao => benchmarks}/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml (72%) rename {torchao => benchmarks}/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml (100%) rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_b+.yaml (70%) rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_l.yaml (71%) rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_s.yaml (71%) rename {torchao => benchmarks}/_models/sam2/configs/sam2/sam2_hiera_t.yaml (72%) rename {torchao => benchmarks}/_models/sam2/csrc/connected_components.cu (100%) rename {torchao => benchmarks}/_models/sam2/map_tensor.py (100%) rename {torchao => benchmarks}/_models/sam2/modeling/__init__.py (100%) rename {torchao => benchmarks}/_models/sam2/modeling/backbones/__init__.py (100%) rename {torchao => benchmarks}/_models/sam2/modeling/backbones/hieradet.py (98%) rename {torchao => benchmarks}/_models/sam2/modeling/backbones/image_encoder.py (98%) rename {torchao => benchmarks}/_models/sam2/modeling/backbones/utils.py (100%) rename {torchao => benchmarks}/_models/sam2/modeling/memory_attention.py (97%) rename {torchao => benchmarks}/_models/sam2/modeling/memory_encoder.py (98%) rename {torchao => benchmarks}/_models/sam2/modeling/position_encoding.py (100%) rename {torchao => benchmarks}/_models/sam2/modeling/sam/__init__.py (100%) rename {torchao => benchmarks}/_models/sam2/modeling/sam/mask_decoder.py (99%) rename {torchao => benchmarks}/_models/sam2/modeling/sam/prompt_encoder.py (98%) rename {torchao => benchmarks}/_models/sam2/modeling/sam/transformer.py (98%) rename {torchao => benchmarks}/_models/sam2/modeling/sam2_base.py (99%) rename {torchao => benchmarks}/_models/sam2/modeling/sam2_utils.py (99%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_b+.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_l.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_s.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_hiera_t.yaml (100%) rename {torchao => benchmarks}/_models/sam2/sam2_image_predictor.py (99%) rename {torchao => benchmarks}/_models/sam2/sam2_video_predictor.py (99%) rename {torchao => benchmarks}/_models/sam2/utils/__init__.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/amg.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/misc.py (100%) rename {torchao => benchmarks}/_models/sam2/utils/transforms.py (97%) rename {torchao => benchmarks}/_models/utils.py (99%) create mode 100644 torchao/_models/llama/__init__.py diff --git a/torchao/_models/README.md b/benchmarks/_models/README.md similarity index 100% rename from torchao/_models/README.md rename to benchmarks/_models/README.md diff --git a/torchao/_models/_eval.py b/benchmarks/_models/_eval.py similarity index 100% rename from torchao/_models/_eval.py rename to benchmarks/_models/_eval.py diff --git a/benchmarks/_models/llama/eval.py b/benchmarks/_models/llama/eval.py index 615b21ec47..4c077c92a0 100644 --- a/benchmarks/_models/llama/eval.py +++ b/benchmarks/_models/llama/eval.py @@ -8,14 +8,13 @@ from typing import List, Optional import torch -from generate import ( - _load_model, - device_sync, -) from tokenizer import 
get_tokenizer import torchao -from torchao._models.llm.model import prepare_inputs_for_model +from benchmarks._models.llama.model import prepare_inputs_for_model +from benchmarks._models.utils import ( + _load_model, +) from torchao.quantization import ( PerRow, PerTensor, @@ -28,7 +27,11 @@ quantize_, uintx_weight_only, ) -from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, unwrap_tensor_subclass +from torchao.utils import ( + TORCH_VERSION_AT_LEAST_2_5, + device_sync, + unwrap_tensor_subclass, +) def run_evaluation( @@ -120,7 +123,7 @@ def run_evaluation( quantize_(model, int4_weight_only(layout=MarlinSparseLayout())) if "int4wo" in quantization and "gptq" in quantization: # avoid circular imports - from torchao._models._eval import MultiTensorInputRecorder + from benchmarks._models._eval import MultiTensorInputRecorder from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer groupsize = int(quantization.split("-")[-2]) @@ -172,7 +175,7 @@ def run_evaluation( if "autoround" in quantization: from transformers import AutoTokenizer - from torchao._models.llm.model import TransformerBlock + from benchmarks._models.llama.model import TransformerBlock from torchao.prototype.autoround.autoround_llm import ( quantize_model_with_autoround_, ) @@ -242,7 +245,7 @@ def run_evaluation( with torch.no_grad(): print("Running evaluation ...") # avoid circular imports - from torchao._models._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper TransformerEvalWrapper( model=model.to(device), diff --git a/benchmarks/_models/llama/generate.py b/benchmarks/_models/llama/generate.py index d327d34962..9f527c31ba 100644 --- a/benchmarks/_models/llama/generate.py +++ b/benchmarks/_models/llama/generate.py @@ -14,7 +14,7 @@ import torch._inductor.config import torchao -from torchao._models.utils import ( +from benchmarks._models.utils import ( _load_model, decode_n_tokens, decode_one_token, @@ -63,8 +63,8 @@ def device_timer(device): wd = Path(__file__).parent.parent.resolve() sys.path.append(str(wd)) -from torchao._models.llm.model import Transformer, prepare_inputs_for_model -from torchao._models.llm.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer def model_forward(model, x, input_pos): @@ -382,7 +382,7 @@ def ffn_or_attn_only(mod, fqn): filter_fn=lambda x, *args: isinstance(x, torch.nn.Embedding), ) elif quantization.startswith("awq"): - from torchao._models._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper from torchao.utils import TORCH_VERSION_AT_LEAST_2_3 if not TORCH_VERSION_AT_LEAST_2_3: @@ -481,8 +481,8 @@ def ffn_or_attn_only(mod, fqn): model, float8_dynamic_activation_float8_weight(granularity=granularity) ) elif "autoquant_v2" in quantization: - from torchao._models._eval import InputRecorder - from torchao._models.llm.model import prepare_inputs_for_model + from benchmarks._models._eval import InputRecorder + from benchmarks._models.llama.model import prepare_inputs_for_model from torchao.prototype.quantization.autoquant_v2 import autoquant_v2 calibration_seq_length = 256 @@ -571,8 +571,8 @@ def ffn_or_attn_only(mod, fqn): # do autoquantization model.finalize_autoquant() elif "autoquant" in quantization: - from torchao._models._eval import InputRecorder - from torchao._models.llm.model import prepare_inputs_for_model + from benchmarks._models._eval import InputRecorder + 
from benchmarks._models.llama.model import prepare_inputs_for_model calibration_seq_length = 256 inputs = ( diff --git a/torchao/_models/llm/model.py b/benchmarks/_models/llama/model.py similarity index 100% rename from torchao/_models/llm/model.py rename to benchmarks/_models/llama/model.py diff --git a/benchmarks/_models/llama/perf_profile.py b/benchmarks/_models/llama/perf_profile.py index ffc99be854..d1e9cab83c 100644 --- a/benchmarks/_models/llama/perf_profile.py +++ b/benchmarks/_models/llama/perf_profile.py @@ -116,8 +116,8 @@ import torch from torch.nn.attention import SDPBackend -from torchao._models.llm.model import Transformer -from torchao._models.llm.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer +from benchmarks._models.llama.tokenizer import get_tokenizer from torchao.prototype.profiler import ( CUDADeviceSpec, TransformerPerformanceCounter, diff --git a/torchao/_models/llm/tokenizer.py b/benchmarks/_models/llama/tokenizer.py similarity index 100% rename from torchao/_models/llm/tokenizer.py rename to benchmarks/_models/llama/tokenizer.py diff --git a/torchao/_models/__init__.py b/benchmarks/_models/sam/__init__.py similarity index 100% rename from torchao/_models/__init__.py rename to benchmarks/_models/sam/__init__.py diff --git a/benchmarks/_models/sam/eval_combo.py b/benchmarks/_models/sam/eval_combo.py index 781c10c935..7f17df4f4f 100644 --- a/benchmarks/_models/sam/eval_combo.py +++ b/benchmarks/_models/sam/eval_combo.py @@ -9,7 +9,7 @@ from metrics import calculate_miou, create_result_entry import torchao -from torchao._models.utils import ( +from benchmarks._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, diff --git a/torchao/_models/sam2/__init__.py b/benchmarks/_models/sam2/__init__.py similarity index 81% rename from torchao/_models/sam2/__init__.py rename to benchmarks/_models/sam2/__init__.py index 0dc11c2fde..f49e12ba4e 100644 --- a/torchao/_models/sam2/__init__.py +++ b/benchmarks/_models/sam2/__init__.py @@ -8,4 +8,4 @@ from hydra.core.global_hydra import GlobalHydra if not GlobalHydra.instance().is_initialized(): - initialize_config_module("torchao._models.sam2", version_base="1.2") + initialize_config_module("benchmarks._models.sam2", version_base="1.2") diff --git a/torchao/_models/sam2/automatic_mask_generator.py b/benchmarks/_models/sam2/automatic_mask_generator.py similarity index 99% rename from torchao/_models/sam2/automatic_mask_generator.py rename to benchmarks/_models/sam2/automatic_mask_generator.py index 6f4f1d3e7b..4e82f3ef04 100644 --- a/torchao/_models/sam2/automatic_mask_generator.py +++ b/benchmarks/_models/sam2/automatic_mask_generator.py @@ -11,9 +11,9 @@ import torch from torchvision.ops.boxes import batched_nms, box_area # type: ignore -from torchao._models.sam2.modeling.sam2_base import SAM2Base -from torchao._models.sam2.sam2_image_predictor import SAM2ImagePredictor -from torchao._models.sam2.utils.amg import ( +from benchmarks._models.sam2.modeling.sam2_base import SAM2Base +from benchmarks._models.sam2.sam2_image_predictor import SAM2ImagePredictor +from benchmarks._models.sam2.utils.amg import ( MaskData, _mask_to_rle_pytorch_2_0, _mask_to_rle_pytorch_2_1, @@ -33,7 +33,7 @@ uncrop_masks, uncrop_points, ) -from torchao._models.sam2.utils.misc import ( +from benchmarks._models.sam2.utils.misc import ( crop_image, get_image_size, ) diff --git a/torchao/_models/sam2/build_sam.py b/benchmarks/_models/sam2/build_sam.py similarity index 98% rename from 
torchao/_models/sam2/build_sam.py rename to benchmarks/_models/sam2/build_sam.py index 70c4b81d09..eea26ccee4 100644 --- a/torchao/_models/sam2/build_sam.py +++ b/benchmarks/_models/sam2/build_sam.py @@ -106,7 +106,7 @@ def build_sam2_video_predictor( **kwargs, ): hydra_overrides = [ - "++model._target_=torchao._models.sam2.sam2_video_predictor.SAM2VideoPredictor", + "++model._target_=benchmarks._models.sam2.sam2_video_predictor.SAM2VideoPredictor", ] if apply_postprocessing: hydra_overrides_extra = hydra_overrides_extra.copy() diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml similarity index 71% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml index 42cd897c67..1742a20e95 100644 --- a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml @@ -2,18 +2,18 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 112 num_heads: 2 neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -24,17 +24,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -45,7 +45,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -57,23 +57,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: 
benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml similarity index 72% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml index ba9dafd489..17bf334745 100644 --- a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_l.yaml @@ -2,12 +2,12 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 144 num_heads: 2 stages: [2, 6, 36, 4] @@ -15,9 +15,9 @@ model: window_pos_embed_bkg_spatial_size: [7, 7] window_spec: [8, 4, 16, 8] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -28,17 +28,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -49,7 +49,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -61,23 +61,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: 
benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml similarity index 72% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml index 898898b158..7b5f000254 100644 --- a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_s.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 11, 2] global_att_blocks: [7, 10, 13] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 
3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml similarity index 72% rename from torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml rename to benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml index c6318f843b..84c6e92e9c 100644 --- a/torchao/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml +++ b/benchmarks/_models/sam2/configs/sam2.1/sam2.1_hiera_t.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 7, 2] global_att_blocks: [5, 7, 9] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: 
torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml b/benchmarks/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml similarity index 100% rename from torchao/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml rename to benchmarks/_models/sam2/configs/sam2.1_training/sam2.1_hiera_b+_MOSE_finetune.yaml diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml similarity index 70% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml index b3ba469471..0f6c1c56cc 100644 --- a/torchao/_models/sam2/configs/sam2/sam2_hiera_b+.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_b+.yaml @@ -2,18 +2,18 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 112 num_heads: 2 neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -24,17 +24,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -45,7 +45,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -57,23 +57,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - 
_target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml similarity index 71% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml index 59a8a1e36b..4baf4e38eb 100644 --- a/torchao/_models/sam2/configs/sam2/sam2_hiera_l.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_l.yaml @@ -2,12 +2,12 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 144 num_heads: 2 stages: [2, 6, 36, 4] @@ -15,9 +15,9 @@ model: window_pos_embed_bkg_spatial_size: [7, 7] window_spec: [8, 4, 16, 8] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -28,17 +28,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -49,7 +49,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -61,23 +61,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + 
_target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml similarity index 71% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml index b051d3be63..84b4b52a8e 100644 --- a/torchao/_models/sam2/configs/sam2/sam2_hiera_s.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_s.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 11, 2] global_att_blocks: [7, 10, 13] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 
padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml similarity index 72% rename from torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml rename to benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml index 6b108e708f..b572a7e4ee 100644 --- a/torchao/_models/sam2/configs/sam2/sam2_hiera_t.yaml +++ b/benchmarks/_models/sam2/configs/sam2/sam2_hiera_t.yaml @@ -2,21 +2,21 @@ # Model model: - _target_: torchao._models.sam2.modeling.sam2_base.SAM2Base + _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base image_encoder: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.ImageEncoder + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder scalp: 1 trunk: - _target_: torchao._models.sam2.modeling.backbones.hieradet.Hiera + _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera embed_dim: 96 num_heads: 1 stages: [1, 2, 7, 2] global_att_blocks: [5, 7, 9] window_pos_embed_bkg_spatial_size: [7, 7] neck: - _target_: torchao._models.sam2.modeling.backbones.image_encoder.FpnNeck + _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 256 normalize: true scale: null @@ -27,17 +27,17 @@ model: fpn_interp_model: nearest memory_attention: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttention + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention d_model: 256 pos_enc_at_input: true layer: - _target_: torchao._models.sam2.modeling.memory_attention.MemoryAttentionLayer + _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer activation: relu dim_feedforward: 2048 dropout: 0.1 pos_enc_at_attn: false self_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] embedding_dim: 256 @@ -48,7 +48,7 @@ model: pos_enc_at_cross_attn_keys: true pos_enc_at_cross_attn_queries: false cross_attention: - _target_: torchao._models.sam2.modeling.sam.transformer.RoPEAttention + _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention rope_theta: 10000.0 feat_sizes: [32, 32] rope_k_repeat: True @@ -60,23 +60,23 @@ model: num_layers: 4 memory_encoder: - _target_: torchao._models.sam2.modeling.memory_encoder.MemoryEncoder + _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder out_dim: 64 position_encoding: - _target_: torchao._models.sam2.modeling.position_encoding.PositionEmbeddingSine + _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine num_pos_feats: 64 normalize: true scale: null temperature: 10000 mask_downsampler: - _target_: torchao._models.sam2.modeling.memory_encoder.MaskDownSampler + _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler kernel_size: 3 stride: 2 padding: 1 fuser: - _target_: torchao._models.sam2.modeling.memory_encoder.Fuser + _target_: 
benchmarks._models.sam2.modeling.memory_encoder.Fuser layer: - _target_: torchao._models.sam2.modeling.memory_encoder.CXBlock + _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock dim: 256 kernel_size: 7 padding: 3 diff --git a/torchao/_models/sam2/csrc/connected_components.cu b/benchmarks/_models/sam2/csrc/connected_components.cu similarity index 100% rename from torchao/_models/sam2/csrc/connected_components.cu rename to benchmarks/_models/sam2/csrc/connected_components.cu diff --git a/torchao/_models/sam2/map_tensor.py b/benchmarks/_models/sam2/map_tensor.py similarity index 100% rename from torchao/_models/sam2/map_tensor.py rename to benchmarks/_models/sam2/map_tensor.py diff --git a/torchao/_models/sam2/modeling/__init__.py b/benchmarks/_models/sam2/modeling/__init__.py similarity index 100% rename from torchao/_models/sam2/modeling/__init__.py rename to benchmarks/_models/sam2/modeling/__init__.py diff --git a/torchao/_models/sam2/modeling/backbones/__init__.py b/benchmarks/_models/sam2/modeling/backbones/__init__.py similarity index 100% rename from torchao/_models/sam2/modeling/backbones/__init__.py rename to benchmarks/_models/sam2/modeling/backbones/__init__.py diff --git a/torchao/_models/sam2/modeling/backbones/hieradet.py b/benchmarks/_models/sam2/modeling/backbones/hieradet.py similarity index 98% rename from torchao/_models/sam2/modeling/backbones/hieradet.py rename to benchmarks/_models/sam2/modeling/backbones/hieradet.py index 91e98f795e..b56c983c8f 100644 --- a/torchao/_models/sam2/modeling/backbones/hieradet.py +++ b/benchmarks/_models/sam2/modeling/backbones/hieradet.py @@ -13,12 +13,12 @@ import torch.nn.functional as F from iopath.common.file_io import g_pathmgr -from torchao._models.sam2.modeling.backbones.utils import ( +from benchmarks._models.sam2.modeling.backbones.utils import ( PatchEmbed, window_partition, window_unpartition, ) -from torchao._models.sam2.modeling.sam2_utils import MLP, DropPath +from benchmarks._models.sam2.modeling.sam2_utils import MLP, DropPath def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor: diff --git a/torchao/_models/sam2/modeling/backbones/image_encoder.py b/benchmarks/_models/sam2/modeling/backbones/image_encoder.py similarity index 98% rename from torchao/_models/sam2/modeling/backbones/image_encoder.py rename to benchmarks/_models/sam2/modeling/backbones/image_encoder.py index 0f0a256867..efa1d963e4 100644 --- a/torchao/_models/sam2/modeling/backbones/image_encoder.py +++ b/benchmarks/_models/sam2/modeling/backbones/image_encoder.py @@ -29,7 +29,7 @@ def __init__( def forward(self, sample: torch.Tensor): # Forward through backbone with torch.autograd.profiler.record_function("self.neck(self.trunk(sample))"): - from torchao._models.sam2.map_tensor import MapTensor, to_map_tensor + from benchmarks._models.sam2.map_tensor import MapTensor, to_map_tensor if isinstance(sample, MapTensor): features, pos = self.neck(self.trunk(sample.elems.flatten(0, 1))) diff --git a/torchao/_models/sam2/modeling/backbones/utils.py b/benchmarks/_models/sam2/modeling/backbones/utils.py similarity index 100% rename from torchao/_models/sam2/modeling/backbones/utils.py rename to benchmarks/_models/sam2/modeling/backbones/utils.py diff --git a/torchao/_models/sam2/modeling/memory_attention.py b/benchmarks/_models/sam2/modeling/memory_attention.py similarity index 97% rename from torchao/_models/sam2/modeling/memory_attention.py rename to benchmarks/_models/sam2/modeling/memory_attention.py index 
5ac6288af0..c32707cf31 100644 --- a/torchao/_models/sam2/modeling/memory_attention.py +++ b/benchmarks/_models/sam2/modeling/memory_attention.py @@ -9,8 +9,8 @@ import torch from torch import Tensor, nn -from torchao._models.sam2.modeling.sam.transformer import RoPEAttention -from torchao._models.sam2.modeling.sam2_utils import get_activation_fn, get_clones +from benchmarks._models.sam2.modeling.sam.transformer import RoPEAttention +from benchmarks._models.sam2.modeling.sam2_utils import get_activation_fn, get_clones class MemoryAttentionLayer(nn.Module): diff --git a/torchao/_models/sam2/modeling/memory_encoder.py b/benchmarks/_models/sam2/modeling/memory_encoder.py similarity index 98% rename from torchao/_models/sam2/modeling/memory_encoder.py rename to benchmarks/_models/sam2/modeling/memory_encoder.py index a13f4077cd..84116aa225 100644 --- a/torchao/_models/sam2/modeling/memory_encoder.py +++ b/benchmarks/_models/sam2/modeling/memory_encoder.py @@ -11,7 +11,7 @@ import torch.nn as nn import torch.nn.functional as F -from torchao._models.sam2.modeling.sam2_utils import ( +from benchmarks._models.sam2.modeling.sam2_utils import ( DropPath, LayerNorm2d, get_clones, diff --git a/torchao/_models/sam2/modeling/position_encoding.py b/benchmarks/_models/sam2/modeling/position_encoding.py similarity index 100% rename from torchao/_models/sam2/modeling/position_encoding.py rename to benchmarks/_models/sam2/modeling/position_encoding.py diff --git a/torchao/_models/sam2/modeling/sam/__init__.py b/benchmarks/_models/sam2/modeling/sam/__init__.py similarity index 100% rename from torchao/_models/sam2/modeling/sam/__init__.py rename to benchmarks/_models/sam2/modeling/sam/__init__.py diff --git a/torchao/_models/sam2/modeling/sam/mask_decoder.py b/benchmarks/_models/sam2/modeling/sam/mask_decoder.py similarity index 99% rename from torchao/_models/sam2/modeling/sam/mask_decoder.py rename to benchmarks/_models/sam2/modeling/sam/mask_decoder.py index 7d25697018..1c29113197 100644 --- a/torchao/_models/sam2/modeling/sam/mask_decoder.py +++ b/benchmarks/_models/sam2/modeling/sam/mask_decoder.py @@ -9,7 +9,7 @@ import torch from torch import nn -from torchao._models.sam2.modeling.sam2_utils import MLP, LayerNorm2d +from benchmarks._models.sam2.modeling.sam2_utils import MLP, LayerNorm2d class MaskDecoder(nn.Module): diff --git a/torchao/_models/sam2/modeling/sam/prompt_encoder.py b/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py similarity index 98% rename from torchao/_models/sam2/modeling/sam/prompt_encoder.py rename to benchmarks/_models/sam2/modeling/sam/prompt_encoder.py index 94b7fda8b2..2c3abbfa34 100644 --- a/torchao/_models/sam2/modeling/sam/prompt_encoder.py +++ b/benchmarks/_models/sam2/modeling/sam/prompt_encoder.py @@ -9,8 +9,8 @@ import torch from torch import nn -from torchao._models.sam2.modeling.position_encoding import PositionEmbeddingRandom -from torchao._models.sam2.modeling.sam2_utils import LayerNorm2d +from benchmarks._models.sam2.modeling.position_encoding import PositionEmbeddingRandom +from benchmarks._models.sam2.modeling.sam2_utils import LayerNorm2d class PromptEncoder(nn.Module): diff --git a/torchao/_models/sam2/modeling/sam/transformer.py b/benchmarks/_models/sam2/modeling/sam/transformer.py similarity index 98% rename from torchao/_models/sam2/modeling/sam/transformer.py rename to benchmarks/_models/sam2/modeling/sam/transformer.py index bf0b58d6fd..3c6d3b83cd 100644 --- a/torchao/_models/sam2/modeling/sam/transformer.py +++ 
b/benchmarks/_models/sam2/modeling/sam/transformer.py @@ -14,12 +14,12 @@ import torch.nn.functional as F from torch import Tensor, nn -from torchao._models.sam2.modeling.position_encoding import ( +from benchmarks._models.sam2.modeling.position_encoding import ( apply_rotary_enc, compute_axial_cis, ) -from torchao._models.sam2.modeling.sam2_utils import MLP -from torchao._models.sam2.utils.misc import get_sdpa_settings +from benchmarks._models.sam2.modeling.sam2_utils import MLP +from benchmarks._models.sam2.utils.misc import get_sdpa_settings warnings.simplefilter(action="ignore", category=FutureWarning) # Check whether Flash Attention is available (and use it by default) diff --git a/torchao/_models/sam2/modeling/sam2_base.py b/benchmarks/_models/sam2/modeling/sam2_base.py similarity index 99% rename from torchao/_models/sam2/modeling/sam2_base.py rename to benchmarks/_models/sam2/modeling/sam2_base.py index 4c2a24a0ef..c5d1f54829 100644 --- a/torchao/_models/sam2/modeling/sam2_base.py +++ b/benchmarks/_models/sam2/modeling/sam2_base.py @@ -9,10 +9,10 @@ import torch.nn.functional as F from torch.nn.init import trunc_normal_ -from torchao._models.sam2.modeling.sam.mask_decoder import MaskDecoder -from torchao._models.sam2.modeling.sam.prompt_encoder import PromptEncoder -from torchao._models.sam2.modeling.sam.transformer import TwoWayTransformer -from torchao._models.sam2.modeling.sam2_utils import ( +from benchmarks._models.sam2.modeling.sam.mask_decoder import MaskDecoder +from benchmarks._models.sam2.modeling.sam.prompt_encoder import PromptEncoder +from benchmarks._models.sam2.modeling.sam.transformer import TwoWayTransformer +from benchmarks._models.sam2.modeling.sam2_utils import ( MLP, get_1d_sine_pe, select_closest_cond_frames, diff --git a/torchao/_models/sam2/modeling/sam2_utils.py b/benchmarks/_models/sam2/modeling/sam2_utils.py similarity index 99% rename from torchao/_models/sam2/modeling/sam2_utils.py rename to benchmarks/_models/sam2/modeling/sam2_utils.py index 579bfc671a..1c00f534e3 100644 --- a/torchao/_models/sam2/modeling/sam2_utils.py +++ b/benchmarks/_models/sam2/modeling/sam2_utils.py @@ -13,7 +13,7 @@ import torch.nn as nn import torch.nn.functional as F -from torchao._models.sam2.utils.misc import mask_to_box +from benchmarks._models.sam2.utils.misc import mask_to_box def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num): diff --git a/torchao/_models/sam2/sam2_hiera_b+.yaml b/benchmarks/_models/sam2/sam2_hiera_b+.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_b+.yaml rename to benchmarks/_models/sam2/sam2_hiera_b+.yaml diff --git a/torchao/_models/sam2/sam2_hiera_l.yaml b/benchmarks/_models/sam2/sam2_hiera_l.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_l.yaml rename to benchmarks/_models/sam2/sam2_hiera_l.yaml diff --git a/torchao/_models/sam2/sam2_hiera_s.yaml b/benchmarks/_models/sam2/sam2_hiera_s.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_s.yaml rename to benchmarks/_models/sam2/sam2_hiera_s.yaml diff --git a/torchao/_models/sam2/sam2_hiera_t.yaml b/benchmarks/_models/sam2/sam2_hiera_t.yaml similarity index 100% rename from torchao/_models/sam2/sam2_hiera_t.yaml rename to benchmarks/_models/sam2/sam2_hiera_t.yaml diff --git a/torchao/_models/sam2/sam2_image_predictor.py b/benchmarks/_models/sam2/sam2_image_predictor.py similarity index 99% rename from torchao/_models/sam2/sam2_image_predictor.py rename to benchmarks/_models/sam2/sam2_image_predictor.py 
index a4aa1c668c..a2c53bdf0a 100644 --- a/torchao/_models/sam2/sam2_image_predictor.py +++ b/benchmarks/_models/sam2/sam2_image_predictor.py @@ -11,9 +11,9 @@ import torch from PIL.Image import Image -from torchao._models.sam2.modeling.sam2_base import SAM2Base -from torchao._models.sam2.utils.misc import get_image_size -from torchao._models.sam2.utils.transforms import SAM2Transforms +from benchmarks._models.sam2.modeling.sam2_base import SAM2Base +from benchmarks._models.sam2.utils.misc import get_image_size +from benchmarks._models.sam2.utils.transforms import SAM2Transforms class SAM2ImagePredictor(torch.nn.Module): diff --git a/torchao/_models/sam2/sam2_video_predictor.py b/benchmarks/_models/sam2/sam2_video_predictor.py similarity index 99% rename from torchao/_models/sam2/sam2_video_predictor.py rename to benchmarks/_models/sam2/sam2_video_predictor.py index 53b0a11d7c..6715178958 100644 --- a/torchao/_models/sam2/sam2_video_predictor.py +++ b/benchmarks/_models/sam2/sam2_video_predictor.py @@ -10,8 +10,8 @@ import torch from tqdm import tqdm -from torchao._models.sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base -from torchao._models.sam2.utils.misc import ( +from benchmarks._models.sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base +from benchmarks._models.sam2.utils.misc import ( concat_points, fill_holes_in_mask_scores, load_video_frames, @@ -52,7 +52,7 @@ def batch_inference_states(inference_states: list): batched_inference_state = copy.copy(inference_states[0]) - from torchao._models.sam2.map_tensor import to_map_tensor + from benchmarks._models.sam2.map_tensor import to_map_tensor # NOTE: Making a build assumption only images differ all_images = torch.stack([state["images"] for state in inference_states]) diff --git a/torchao/_models/sam2/utils/__init__.py b/benchmarks/_models/sam2/utils/__init__.py similarity index 100% rename from torchao/_models/sam2/utils/__init__.py rename to benchmarks/_models/sam2/utils/__init__.py diff --git a/torchao/_models/sam2/utils/amg.py b/benchmarks/_models/sam2/utils/amg.py similarity index 100% rename from torchao/_models/sam2/utils/amg.py rename to benchmarks/_models/sam2/utils/amg.py diff --git a/torchao/_models/sam2/utils/misc.py b/benchmarks/_models/sam2/utils/misc.py similarity index 100% rename from torchao/_models/sam2/utils/misc.py rename to benchmarks/_models/sam2/utils/misc.py diff --git a/torchao/_models/sam2/utils/transforms.py b/benchmarks/_models/sam2/utils/transforms.py similarity index 97% rename from torchao/_models/sam2/utils/transforms.py rename to benchmarks/_models/sam2/utils/transforms.py index c616233050..2d5e46193b 100644 --- a/torchao/_models/sam2/utils/transforms.py +++ b/benchmarks/_models/sam2/utils/transforms.py @@ -78,7 +78,7 @@ def postprocess_masks( """ Perform PostProcessing on output masks. """ - from torchao._models.sam2.utils.misc import get_connected_components + from benchmarks._models.sam2.utils.misc import get_connected_components masks = masks.float() input_masks = masks @@ -125,7 +125,7 @@ def postprocess_masks_1_channel( """ Perform PostProcessing on output masks. 
""" - from torchao._models.sam2.utils.misc import get_connected_components + from benchmarks._models.sam2.utils.misc import get_connected_components assert masks.dim() == 4 assert masks.size(1) == 1 diff --git a/torchao/_models/utils.py b/benchmarks/_models/utils.py similarity index 99% rename from torchao/_models/utils.py rename to benchmarks/_models/utils.py index bffe33aacc..dc2648a209 100644 --- a/torchao/_models/utils.py +++ b/benchmarks/_models/utils.py @@ -8,7 +8,7 @@ import torch -from torchao._models.llm.model import Transformer +from benchmarks._models.llama.model import Transformer from torchao.utils import default_device diff --git a/benchmarks/quantized_training/pretrain_llama2.py b/benchmarks/quantized_training/pretrain_llama2.py index 5cc6c9ba52..2eb66f5e6b 100644 --- a/benchmarks/quantized_training/pretrain_llama2.py +++ b/benchmarks/quantized_training/pretrain_llama2.py @@ -22,13 +22,13 @@ from torch.utils.checkpoint import checkpoint from tqdm import tqdm -from torchao import quantize_ -from torchao._models.llm.model import ( +from benchmarks._models.llama.model import ( ModelArgs, RMSNorm, Transformer, transformer_configs, ) +from torchao import quantize_ from torchao.prototype import low_bit_optim from torchao.prototype.quantized_training import ( bitnet_training, diff --git a/examples/sam2_amg_server/annotate_with_rle.py b/examples/sam2_amg_server/annotate_with_rle.py index 55e5512011..3c3bbc77b0 100644 --- a/examples/sam2_amg_server/annotate_with_rle.py +++ b/examples/sam2_amg_server/annotate_with_rle.py @@ -14,7 +14,7 @@ ) from tqdm import tqdm -from torchao._models.sam2.utils.amg import area_from_rle, rle_to_mask +from benchmarks._models.sam2.utils.amg import area_from_rle, rle_to_mask def timestamped_print(*args, **kwargs): diff --git a/examples/sam2_amg_server/cli.py b/examples/sam2_amg_server/cli.py index 2f6758b7d3..b5feac395e 100644 --- a/examples/sam2_amg_server/cli.py +++ b/examples/sam2_amg_server/cli.py @@ -12,9 +12,9 @@ show_anns, ) -from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator -from torchao._models.sam2.build_sam import build_sam2 -from torchao._models.sam2.utils.amg import rle_to_mask +from benchmarks._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator +from benchmarks._models.sam2.build_sam import build_sam2 +from benchmarks._models.sam2.utils.amg import rle_to_mask def main_docstring(): diff --git a/examples/sam2_amg_server/cli_on_modal.py b/examples/sam2_amg_server/cli_on_modal.py index 5fe56eeb1a..d44de90bf7 100644 --- a/examples/sam2_amg_server/cli_on_modal.py +++ b/examples/sam2_amg_server/cli_on_modal.py @@ -84,10 +84,10 @@ def build(self): from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator from sam2.build_sam import build_sam2 else: - from torchao._models.sam2.automatic_mask_generator import ( + from benchmarks._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from torchao._models.sam2.build_sam import build_sam2 + from benchmarks._models.sam2.build_sam import build_sam2 os.chdir(f"{TARGET}ao_src_0/examples/sam2_amg_server") import sys @@ -139,11 +139,11 @@ def build(self): from sam2.utils.amg import mask_to_rle_pytorch as mask_to_rle_pytorch_2 from sam2.utils.amg import rle_to_mask else: - from torchao._models.sam2.utils.amg import ( + from benchmarks._models.sam2.utils.amg import ( mask_to_rle_pytorch_2, rle_to_mask, ) - from torchao._models.sam2.utils.amg import area_from_rle + from benchmarks._models.sam2.utils.amg import area_from_rle 
self.np = np self.tio = tio diff --git a/examples/sam2_amg_server/compare_rle_lists.py b/examples/sam2_amg_server/compare_rle_lists.py index 7a1c78b846..88be3df491 100644 --- a/examples/sam2_amg_server/compare_rle_lists.py +++ b/examples/sam2_amg_server/compare_rle_lists.py @@ -7,7 +7,7 @@ import torch -# from torchao._models.sam2.utils.amg import rle_to_mask +# from benchmarks._models.sam2.utils.amg import rle_to_mask def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray: """Compute a binary mask from an uncompressed RLE.""" h, w = rle["size"] diff --git a/examples/sam2_amg_server/compile_export_utils.py b/examples/sam2_amg_server/compile_export_utils.py index d1c6fc06fa..a1b6b5f891 100644 --- a/examples/sam2_amg_server/compile_export_utils.py +++ b/examples/sam2_amg_server/compile_export_utils.py @@ -4,7 +4,7 @@ import torch -from torchao._models.sam2.sam2_image_predictor import SAM2ImagePredictor +from benchmarks._models.sam2.sam2_image_predictor import SAM2ImagePredictor # Tools used to avoid compilation cold start and dynamo cache lookups # We take the compiled model and export it using the largest @@ -513,18 +513,18 @@ def set_fast( dynamic=True, ) - import torchao + import benchmarks if allow_recompiles: # A bunch of extra compiles at module level # Note that this can cause recompilations! # We might want to guard on that - torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0 = torch.compile( + benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0 = torch.compile( fullgraph=True, dynamic=True - )(torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0) - torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1 = torch.compile( + )(benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_0) + benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1 = torch.compile( fullgraph=True, dynamic=True - )(torchao._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1) + )(benchmarks._models.sam2.utils.amg._mask_to_rle_pytorch_2_0_1) mask_generator.calculate_stability_score = torch.compile( fullgraph=True, dynamic=True )(mask_generator.calculate_stability_score) diff --git a/examples/sam2_amg_server/generate_data.py b/examples/sam2_amg_server/generate_data.py index 50eeccb912..dc82348d0b 100644 --- a/examples/sam2_amg_server/generate_data.py +++ b/examples/sam2_amg_server/generate_data.py @@ -192,7 +192,7 @@ def gen_masks_ao_batch( center_points_label_torch_batch = [ torch.from_numpy(t).unsqueeze(1) for t in center_points_label_batch ] - from torchao._models.sam2.map_tensor import to_map_tensor + from benchmarks._models.sam2.map_tensor import to_map_tensor center_points_torch_batch = list(map(to_map_tensor, center_points_torch_batch)) center_points_label_torch_batch = list( @@ -255,7 +255,7 @@ def gen_masks_ao( center_points_torch = torch.from_numpy(center_points).unsqueeze(1) center_points_label_torch = torch.from_numpy(center_points_label).unsqueeze(1) - from torchao._models.sam2.map_tensor import to_map_tensor + from benchmarks._models.sam2.map_tensor import to_map_tensor center_points_torch = to_map_tensor(center_points_torch) center_points_label_torch = to_map_tensor(center_points_label_torch) @@ -532,11 +532,11 @@ def main( from sam2.build_sam import build_sam2 from sam2.utils.amg import mask_to_rle_pytorch else: - from torchao._models.sam2.automatic_mask_generator import ( + from benchmarks._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from torchao._models.sam2.build_sam import build_sam2 - from torchao._models.sam2.utils.amg import ( + 
from benchmarks._models.sam2.build_sam import build_sam2 + from benchmarks._models.sam2.utils.amg import ( mask_to_rle_pytorch_2 as mask_to_rle_pytorch, ) torch.manual_seed(seed) diff --git a/examples/sam2_amg_server/server.py b/examples/sam2_amg_server/server.py index 7e35858590..ea9953dbed 100644 --- a/examples/sam2_amg_server/server.py +++ b/examples/sam2_amg_server/server.py @@ -26,7 +26,7 @@ from fastapi.responses import StreamingResponse from torch._inductor import config as inductorconfig -from torchao._models.utils import ( +from benchmarks._models.utils import ( get_arch_name, write_json_result_local, write_json_result_ossci, @@ -460,11 +460,11 @@ def main( from sam2.build_sam import build_sam2 from sam2.utils.amg import rle_to_mask else: - from torchao._models.sam2.automatic_mask_generator import ( + from benchmarks._models.sam2.automatic_mask_generator import ( SAM2AutomaticMaskGenerator, ) - from torchao._models.sam2.build_sam import build_sam2 - from torchao._models.sam2.utils.amg import rle_to_mask + from benchmarks._models.sam2.build_sam import build_sam2 + from benchmarks._models.sam2.utils.amg import rle_to_mask device = "cuda" sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type) diff --git a/examples/sam2_vos_example/compile_export_utils.py b/examples/sam2_vos_example/compile_export_utils.py index 7d1b3eddf3..00f1b56794 100644 --- a/examples/sam2_vos_example/compile_export_utils.py +++ b/examples/sam2_vos_example/compile_export_utils.py @@ -4,7 +4,7 @@ import torch -from torchao._models.sam2.sam2_video_predictor import SAM2VideoPredictor +from benchmarks._models.sam2.sam2_video_predictor import SAM2VideoPredictor # Tools used to avoid compilation cold start and dynamo cache lookups # We take the compiled model and export it using the largest diff --git a/examples/sam2_vos_example/video_profile.py b/examples/sam2_vos_example/video_profile.py index 8ee9151cc4..44b90bd77b 100644 --- a/examples/sam2_vos_example/video_profile.py +++ b/examples/sam2_vos_example/video_profile.py @@ -280,7 +280,7 @@ def main( if use_baseline: from sam2.build_sam import build_sam2_video_predictor else: - from torchao._models.sam2.build_sam import build_sam2_video_predictor + from benchmarks._models.sam2.build_sam import build_sam2_video_predictor device = "cuda:0" # hydra_overrides_extra = ["++model.compile_image_encoder=true"] @@ -292,7 +292,7 @@ def main( ) predictor._frame_batch_size = frame_batch_size predictor.image_encoder.trunk = predictor.image_encoder.trunk.to(torch.bfloat16) - from torchao._models.sam2.modeling.sam.transformer import RoPEAttention + from benchmarks._models.sam2.modeling.sam.transformer import RoPEAttention rope_attention_modules = [ module for module in predictor.modules() if isinstance(module, RoPEAttention) diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index 24ba4717e8..1b0939c951 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -14,7 +14,7 @@ import torch from safetensors.torch import load_file as load_safetensors_file -from torchao._models.llm.model import ModelArgs +from benchmarks._models.llama.model import ModelArgs @torch.inference_mode() diff --git a/test/prototype/test_spinquant.py b/test/prototype/test_spinquant.py index 99c2955360..a50b9d9cb7 100644 --- a/test/prototype/test_spinquant.py +++ b/test/prototype/test_spinquant.py @@ -1,7 +1,7 @@ import pytest import torch -from torchao._models.llm.model import Transformer +from benchmarks._models.llama.model import 
Transformer from torchao.prototype.spinquant import apply_spinquant diff --git a/test/quantization/test_gptq_mt.py b/test/quantization/test_gptq_mt.py index 1064c41841..f82315714b 100644 --- a/test/quantization/test_gptq_mt.py +++ b/test/quantization/test_gptq_mt.py @@ -5,8 +5,8 @@ import torch.nn.functional as F from torch.testing._internal.common_utils import run_tests -from torchao._models.llm.model import Transformer, prepare_inputs_for_model -from torchao._models.llm.tokenizer import get_tokenizer +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer from torchao.quantization.GPTQ_MT import Int4WeightOnlyGPTQQuantizer, MultiTensor from torchao.quantization.utils import _lm_eval_available from torchao.utils import is_fbcode diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py index 72359c78af..1176367a3d 100644 --- a/test/quantization/test_quant_api.py +++ b/test/quantization/test_quant_api.py @@ -21,9 +21,9 @@ from torch.testing._internal import common_utils from torch.testing._internal.common_utils import TestCase +from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model +from benchmarks._models.llama.tokenizer import get_tokenizer from torchao import quantize_ -from torchao._models.llm.model import Transformer, prepare_inputs_for_model -from torchao._models.llm.tokenizer import get_tokenizer from torchao.dtypes import AffineQuantizedTensor from torchao.quantization import LinearActivationQuantizedTensor from torchao.quantization.quant_api import ( @@ -278,7 +278,7 @@ def test_8da4w_quantizer(self): # https://github.com/pytorch-labs/gpt-fast/blob/6253c6bb054e658d67566150f87329b87815ae63/scripts/convert_hf_checkpoint.py @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_8da4w_gptq_quantizer(self): - from torchao._models._eval import InputRecorder, TransformerEvalWrapper + from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer # should be similar to TorchCompileDynamicQuantizer @@ -348,7 +348,7 @@ def test_8da4w_gptq_quantizer(self): not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch verion is 2.4 or lower" ) def test_8da4w_quantizer_eval(self): - from torchao._models._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer precision = torch.bfloat16 @@ -384,7 +384,7 @@ def test_8da4w_quantizer_eval(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_gptq_quantizer_int4_weight_only(self): - from torchao._models._eval import ( + from benchmarks._models._eval import ( MultiTensorInputRecorder, TransformerEvalWrapper, ) @@ -454,7 +454,7 @@ def test_gptq_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_quantizer_int4_weight_only(self): - from torchao._models._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer precision = torch.bfloat16 @@ -492,7 +492,7 @@ def test_quantizer_int4_weight_only(self): @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper(self): - from torchao._models._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper precision = 
torch.bfloat16 device = "cuda" @@ -525,7 +525,7 @@ def test_eval_wrapper(self): # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY @unittest.skip("skipping until we get checkpoints for gpt-fast") def test_eval_wrapper_llama3(self): - from torchao._models._eval import TransformerEvalWrapper + from benchmarks._models._eval import TransformerEvalWrapper precision = torch.bfloat16 device = "cuda" diff --git a/test/test_ao_models.py b/test/test_ao_models.py index f31188802b..064e2a9a54 100644 --- a/test/test_ao_models.py +++ b/test/test_ao_models.py @@ -1,7 +1,7 @@ import pytest import torch -from torchao._models.llm.model import Transformer +from benchmarks._models.llama.model import Transformer _AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) diff --git a/torchao/_models/llama/__init__.py b/torchao/_models/llama/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py index a0e62b9ebd..0fc875d44d 100644 --- a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py +++ b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py @@ -18,15 +18,17 @@ ) import torchao -from torchao._models.generate import ( - decode_one_token, - prefill, +from benchmarks._models.llama.model import ( + KVCache, + Transformer, + prepare_inputs_for_model, ) -from torchao._models.llm.model import Transformer, prepare_inputs_for_model -from torchao._models.llm.tokenizer import get_tokenizer -from torchao._models.utils import ( +from benchmarks._models.llama.tokenizer import get_tokenizer +from benchmarks._models.utils import ( _load_model, + decode_one_token, encode_tokens, + prefill, ) from torchao.utils import device_sync @@ -101,7 +103,7 @@ def generate( _replace_with_custom_fn_if_matches_filter( model, AffineQuantizedKVCache.from_float, - lambda x, y: isinstance(x, torchao._models.llm.model.KVCache), + lambda x, y: isinstance(x, KVCache), ) # format model input diff --git a/torchao/prototype/spinquant/spinquant.py b/torchao/prototype/spinquant/spinquant.py index ce78cc0cc5..bfa83a332a 100644 --- a/torchao/prototype/spinquant/spinquant.py +++ b/torchao/prototype/spinquant/spinquant.py @@ -10,7 +10,7 @@ import torch from torch import nn -from torchao._models.llm.model import RMSNorm, Transformer +from benchmarks._models.llama.model import RMSNorm, Transformer from torchao.prototype.spinquant.hadamard_utils import ( apply_exact_had_to_linear, get_hadK, diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py index 88febdd532..02bb73a903 100644 --- a/torchao/quantization/GPTQ.py +++ b/torchao/quantization/GPTQ.py @@ -79,9 +79,9 @@ def __init__( # trace model for one input one_input = [multi.values[0].cpu() for multi in inputs] # pyre-ignore[16] # needed for GPTQ on the torchao llama model - import torchao + import benchmarks - torchao._models.llm.model.use_index_put_for_kv_cache = True + benchmarks._models.llama.model.use_index_put_for_kv_cache = True exported_model = torch._dynamo.export( model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake" )(*one_input) diff --git a/torchao/quantization/README.md b/torchao/quantization/README.md index d2b6e0c016..36f0befa80 100644 --- a/torchao/quantization/README.md +++ b/torchao/quantization/README.md @@ -396,7 +396,7 @@ The `quantize_` and `autoquant` apis now automatically use our recommended induc ## (To 
be moved to prototype) A16W4 WeightOnly Quantization with GPTQ ```python -from torchao._models._eval import InputRecorder, TransformerEvalWrapper +from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer precision = torch.bfloat16 device = "cuda" From e88f03e903077b2b32d669ee3809c425c4bcbb29 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 3 Mar 2025 13:21:34 -0800 Subject: [PATCH 7/7] Replace all torchao/_models -> benchmarks/_models --- .github/workflows/dashboard_perf_test.yml | 10 +- README.md | 4 +- benchmarks/_models/llama/README.md | 2 +- docs/source/contributor_guide.rst | 10 +- .../sam2_amg_server/result_batch_size_16.csv | 154 +++++++++--------- torchao/_models/llama/__init__.py | 0 torchao/_models/llm/__init__.py | 0 torchao/prototype/awq/README.md | 8 +- .../scripts/BO_acc_throughput.py | 2 +- torchao/quantization/README.md | 10 +- torchao/sparsity/README.md | 2 +- 11 files changed, 98 insertions(+), 104 deletions(-) delete mode 100644 torchao/_models/llama/__init__.py delete mode 100644 torchao/_models/llm/__init__.py diff --git a/.github/workflows/dashboard_perf_test.yml b/.github/workflows/dashboard_perf_test.yml index 81ea40d341..64338aff7a 100644 --- a/.github/workflows/dashboard_perf_test.yml +++ b/.github/workflows/dashboard_perf_test.yml @@ -42,19 +42,19 @@ jobs: mkdir -p ${{ runner.temp }}/benchmark-results # llama3 - compile baseline - ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json + ${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json # llama3 - autoquant - ${CONDA_RUN} python torchao/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json + ${CONDA_RUN} python benchmarks/_models/llama/generate.py --checkpoint_path "${CHECKPOINT_PATH}/${MODEL_REPO}/model.pth" --compile --compile_prefill --quantization autoquant --output_json_path ${{ runner.temp }}/benchmark-results/llama3-benchmark-results.json # skipping SAM because of https://hud.pytorch.org/pr/pytorch/ao/1407 # # SAM # ${CONDA_RUN} pip install git+https://github.com/pytorch-labs/segment-anything-fast.git@main # # SAM compile baselilne - # ${CONDA_RUN} sh torchao/_models/sam/setup.sh - # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json + # ${CONDA_RUN} sh benchmarks/_models/sam/setup.sh + # ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 
--use_compile max-autotune --use_half bfloat16 --device cuda --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json - # ${CONDA_RUN} python torchao/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json + # ${CONDA_RUN} python benchmarks/_models/sam/eval_combo.py --coco_root_dir datasets/coco2017 --coco_slice_name val2017 --sam_checkpoint_base_path checkpoints --sam_model_type vit_h --point_sampling_cache_dir tmp/sam_coco_mask_center_cache --mask_debug_out_dir tmp/sam_eval_masks_out --batch_size 32 --num_workers 8 --use_compile max-autotune --use_half bfloat16 --device cuda --compression autoquant --output_json_path ${{ runner.temp }}/benchmark-results/sam-benchmark-results.json # SAM 2.1 # ${CONDA_RUN} sh scripts/download_sam2_ckpts.sh ${CHECKPOINT_PATH}/sam2 diff --git a/README.md b/README.md index 606b48986d..a48899e123 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ torchao just works with `torch.compile()` and `FSDP2` over most PyTorch models o ### Post Training Quantization -Quantizing and Sparsifying your models is a 1 liner that should work on any model with an `nn.Linear` including your favorite HuggingFace model. You can find a more comprehensive usage instructions [here](torchao/quantization/), sparsity [here](/torchao/_models/sam/README.md) and a HuggingFace inference example [here](scripts/hf_eval.py) +Quantizing and Sparsifying your models is a 1 liner that should work on any model with an `nn.Linear` including your favorite HuggingFace model. You can find a more comprehensive usage instructions [here](torchao/quantization/), sparsity [here](/benchmarks/_models/sam/README.md) and a HuggingFace inference example [here](scripts/hf_eval.py) For inference, we have the option of 1. Quantize only the weights: works best for memory bound models @@ -52,7 +52,7 @@ We also provide a developer facing API so you can implement your own quantizatio We've added kv cache quantization and other features in order to enable long context length (and necessarily memory efficient) inference. -In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](torchao/_models/llama/README.md) +In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](benchmarks/_models/llama/README.md) ## Training diff --git a/benchmarks/_models/llama/README.md b/benchmarks/_models/llama/README.md index 99f1919fc9..9e1bd2b062 100644 --- a/benchmarks/_models/llama/README.md +++ b/benchmarks/_models/llama/README.md @@ -8,7 +8,7 @@ and follow the steps to gain access. Then from the torchao root directory use `huggingface-cli login` and follow the steps to login, then `sh ./scripts/prepare.sh` to download and convert the model weights -once done you can execute benchmarks from the torchao/_models/llama dir with `sh benchmarks.sh`. 
You can perform and benchmarking or evaluation +once done you can execute benchmarks from the benchmarks/_models/llama dir with `sh benchmarks.sh`. You can perform and benchmarking or evaluation directly using `generate.py` or `eval.py`. ## KV Cache Quantization - Memory Efficient Inference diff --git a/docs/source/contributor_guide.rst b/docs/source/contributor_guide.rst index ab6d433e27..c204fdc67d 100644 --- a/docs/source/contributor_guide.rst +++ b/docs/source/contributor_guide.rst @@ -125,11 +125,11 @@ After you have the quantization flow implemented, you can run benchmark and eval Note: llama model (llama2/llama3) is our representative model for memory bound models and sam is our representative model for compute bound models. -* `llama `__ - * `benchmark `__ - * `eval `__ -* `sam `__ - * `benchmark and eval `__ +* `llama `__ + * `benchmark `__ + * `eval `__ +* `sam `__ + * `benchmark and eval `__ Please checkout the ``--help`` option for each of the script to understand the supported options, e.g. you can use ``--profile=profile_path`` to get the chrome trace of the run to understand detailed `chrome trace `__. diff --git a/examples/sam2_amg_server/result_batch_size_16.csv b/examples/sam2_amg_server/result_batch_size_16.csv index 4e8c338df4..0d59b0a6cf 100644 --- a/examples/sam2_amg_server/result_batch_size_16.csv +++ b/examples/sam2_amg_server/result_batch_size_16.csv @@ -32,21 +32,21 @@ num-images,total_time,first,p99,baseline,max,export-model,second,furious,environ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch data = self._generate_masks_batch(images) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch all_data = self._process_crop_batch(images, all_crop_boxes, all_layer_idxs, all_orig_size) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch self.predictor.set_image_batch(all_cropped_im) File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -90,21 +90,21 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch data = self._generate_masks_batch(images) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch all_data = self._process_crop_batch(images, all_crop_boxes, all_layer_idxs, all_orig_size) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch self.predictor.set_image_batch(all_cropped_im) File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -186,21 +186,21 @@ Traceback (most recent call last): File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch data = self._generate_masks_batch(images) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch all_data = self._process_crop_batch(images, all_crop_boxes, all_layer_idxs, all_orig_size) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 384, in 
_process_crop_batch self.predictor.set_image_batch(all_cropped_im) File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image backbone_out = self.image_encoder(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -244,21 +244,21 @@ RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles. File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch data = self._generate_masks_batch(images) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch all_data = self._process_crop_batch(images, all_crop_boxes, all_layer_idxs, all_orig_size) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch self.predictor.set_image_batch(all_cropped_im) File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -302,21 +302,21 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File 
""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch data = self._generate_masks_batch(images) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch all_data = self._process_crop_batch(images, all_crop_boxes, all_layer_idxs, all_orig_size) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch self.predictor.set_image_batch(all_cropped_im) File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -343,7 +343,7 @@ W0104 14:58:02.413000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:03.167000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:04.568000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(4194304x256, 256x128) - mm 8.7354 ms 100.0% + mm 8.7354 ms 100.0% triton_mm_146 13.3706 ms 65.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_139 17.0872 ms 51.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_141 17.6846 ms 49.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -361,7 +361,7 @@ W0104 14:58:07.799000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:08.210000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
W0104 14:58:08.894000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(8192x256, 256x2048) - mm 0.2846 ms 100.0% + mm 0.2846 ms 100.0% triton_mm_184 0.4445 ms 64.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_177 0.5668 ms 50.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_179 0.5790 ms 49.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -379,7 +379,7 @@ W0104 14:58:11.387000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:11.755000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:12.364000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(1024x256, 256x256) - mm 0.0186 ms 100.0% + mm 0.0186 ms 100.0% triton_mm_626 0.0359 ms 51.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_627 0.0361 ms 51.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_625 0.0365 ms 50.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 @@ -397,7 +397,7 @@ W0104 14:58:14.841000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:15.202000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:15.806000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(1024x256, 256x256) - mm 0.0180 ms 100.0% + mm 0.0180 ms 100.0% triton_mm_646 0.0357 ms 50.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_645 0.0360 ms 49.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_644 0.0370 ms 48.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 @@ -415,7 +415,7 @@ W0104 14:58:16.861000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:17.223000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
W0104 14:58:17.833000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(1024x256, 256x256) - mm 0.0185 ms 100.0% + mm 0.0185 ms 100.0% triton_mm_682 0.0360 ms 51.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_681 0.0364 ms 50.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_680 0.0365 ms 50.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 @@ -433,7 +433,7 @@ W0104 14:58:18.895000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:19.255000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:19.866000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(1024x256, 256x256) - mm 0.0186 ms 100.0% + mm 0.0186 ms 100.0% triton_mm_736 0.0360 ms 51.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_737 0.0360 ms 51.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_735 0.0365 ms 50.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 @@ -451,7 +451,7 @@ W0104 14:58:20.929000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:21.292000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:21.909000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(1024x256, 256x256) - mm 0.0180 ms 100.0% + mm 0.0180 ms 100.0% triton_mm_792 0.0361 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_791 0.0363 ms 49.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_790 0.0370 ms 48.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 @@ -469,7 +469,7 @@ W0104 14:58:22.960000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:23.317000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
W0104 14:58:23.931000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(1024x256, 256x256) - mm 0.0185 ms 100.0% + mm 0.0185 ms 100.0% triton_mm_847 0.0361 ms 51.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_846 0.0363 ms 51.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_845 0.0368 ms 50.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 @@ -483,7 +483,7 @@ SingleProcess AUTOTUNE benchmarking takes 2.0045 seconds and 0.0040 seconds prec AUTOTUNE mm(1024x256, 256x4) triton_mm_883 0.0162 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 triton_mm_884 0.0162 ms 99.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 - mm 0.0166 ms 97.5% + mm 0.0166 ms 97.5% triton_mm_885 0.0232 ms 69.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 triton_mm_889 0.0233 ms 69.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_890 0.0235 ms 68.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=16, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -501,7 +501,7 @@ AUTOTUNE mm(2048x2, 2x128) triton_mm_5 0.0073 ms 91.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=2, num_warps=4 triton_mm_7 0.0077 ms 86.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=5, num_warps=4 triton_mm_6 0.0078 ms 85.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=8 - mm 0.0079 ms 84.2% + mm 0.0079 ms 84.2% triton_mm_8 0.0083 ms 80.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=False, GROUP_M=8, num_stages=3, num_warps=4 SingleProcess AUTOTUNE benchmarking takes 2.3704 seconds and 0.0024 seconds precompiling for 17 choices E0104 14:58:30.506000 1111794 site-packages/torch/_inductor/select_algorithm.py:1400] [0/0] Exception out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
for benchmark choice TritonTemplateCaller(/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_10/amg_fast_export_inductor_cache_dir/cz/cczuf4mbz67rz32kb4erom4hh3extdrznp22adm5ibnzg5hixbva.py, ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4) @@ -511,8 +511,8 @@ W0104 14:58:31.755000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:32.124000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:32.745000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE addmm(8192x256, 8192x256, 256x256) - bias_addmm 0.0492 ms 100.0% - addmm 0.0681 ms 72.3% + bias_addmm 0.0492 ms 100.0% + addmm 0.0681 ms 72.3% triton_mm_27 0.0801 ms 61.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_29 0.0805 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_25 0.0822 ms 59.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -529,8 +529,8 @@ W0104 14:58:33.985000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:34.346000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:34.965000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE addmm(8192x128, 8192x256, 256x128) - bias_addmm 0.0313 ms 100.0% - addmm 0.0400 ms 78.1% + bias_addmm 0.0313 ms 100.0% + addmm 0.0400 ms 78.1% triton_mm_101 0.0577 ms 54.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_105 0.0588 ms 53.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_103 0.0625 ms 50.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -543,7 +543,7 @@ SingleProcess AUTOTUNE benchmarking takes 2.1995 seconds and 0.0039 seconds prec E0104 14:58:34.979000 1111794 site-packages/torch/_inductor/select_algorithm.py:1400] [0/0] Exception out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
for benchmark choice TritonTemplateCaller(/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_10/amg_fast_export_inductor_cache_dir/f5/cf54lpxyskhyrlnsvgwdvrzswqz4avvyso3u2cqlseqwgbpj7pgv.py, ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8) W0104 14:58:37.259000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(8192x128, 128x256) - mm 0.0332 ms 100.0% + mm 0.0332 ms 100.0% triton_mm_162 0.0454 ms 73.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_160 0.0457 ms 72.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_158 0.0467 ms 71.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -561,7 +561,7 @@ W0104 14:58:38.545000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:38.959000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:39.649000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(8192x2048, 2048x256) - mm 0.2634 ms 100.0% + mm 0.2634 ms 100.0% triton_mm_198 0.5623 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_200 0.5694 ms 46.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_196 0.5824 ms 45.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -579,7 +579,7 @@ W0104 14:58:40.825000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:41.198000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:41.816000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
AUTOTUNE mm(8192x256, 256x256) - mm 0.0553 ms 100.0% + mm 0.0553 ms 100.0% triton_mm_350 0.0801 ms 69.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_352 0.0803 ms 68.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_348 0.0818 ms 67.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -593,7 +593,7 @@ SingleProcess AUTOTUNE benchmarking takes 2.1333 seconds and 0.0039 seconds prec E0104 14:58:41.828000 1111794 site-packages/torch/_inductor/select_algorithm.py:1400] [0/0] Exception out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. for benchmark choice TritonTemplateCaller(/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_10/amg_fast_export_inductor_cache_dir/sn/csnohx66tfenmoj7n2bmwgbic34up2jtkkpubt6ri3ulzzs65i4x.py, ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8) W0104 14:58:48.250000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. AUTOTUNE mm(4194304x128, 128x256) - mm 9.4713 ms 100.0% + mm 9.4713 ms 100.0% triton_mm_279 13.9709 ms 67.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_272 17.6967 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_274 18.6221 ms 50.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -611,8 +611,8 @@ W0104 14:58:52.143000 1111794 site-packages/torch/_inductor/select_algorithm.py: W0104 14:58:52.895000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 294912, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. W0104 14:58:54.313000 1111794 site-packages/torch/_inductor/select_algorithm.py:1619] [0/0] out of resource: shared memory, Required: 262144, Hardware limit: 232448. Reducing block sizes or `num_stages` may help. 
AUTOTUNE addmm(4194304x128, 4194304x256, 256x128) - bias_addmm 8.5930 ms 100.0% - addmm 11.2420 ms 76.4% + bias_addmm 8.5930 ms 100.0% + addmm 11.2420 ms 76.4% triton_mm_393 13.5410 ms 63.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_386 17.1705 ms 50.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_388 17.8044 ms 48.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 @@ -623,10 +623,10 @@ AUTOTUNE addmm(4194304x128, 4194304x256, 256x128) triton_mm_391 28.6740 ms 30.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=8 SingleProcess AUTOTUNE benchmarking takes 6.0558 seconds and 0.0034 seconds precompiling for 21 choices AUTOTUNE addmm(1024x32, 1024x256, 256x32) - bias_addmm 0.0174 ms 100.0% + bias_addmm 0.0174 ms 100.0% triton_mm_664 0.0227 ms 76.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 triton_mm_663 0.0227 ms 76.6% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=4 - addmm 0.0228 ms 76.3% + addmm 0.0228 ms 76.3% triton_mm_662 0.0333 ms 52.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=16, BLOCK_M=32, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=1, num_warps=2 triton_mm_665 0.0354 ms 49.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_669 0.0354 ms 49.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=8 @@ -699,21 +699,21 @@ Traceback (most recent call last): File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 243, in generate_batch data = self._generate_masks_batch(images) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 292, in _generate_masks_batch all_data = self._process_crop_batch(images, all_crop_boxes, all_layer_idxs, all_orig_size) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/automatic_mask_generator.py"", line 384, in _process_crop_batch self.predictor.set_image_batch(all_cropped_im) File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return 
func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image backbone_out = self.image_encoder(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -773,10 +773,10 @@ RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles. File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -819,10 +819,10 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -903,10 +903,10 @@ Traceback (most recent call last): File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image + File 
""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image backbone_out = self.image_encoder(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -949,10 +949,10 @@ RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles. File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -995,10 +995,10 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1079,10 +1079,10 @@ Traceback (most recent call last): File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image backbone_out = self.image_encoder(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1142,10 +1142,10 @@ RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles. 
File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1188,10 +1188,10 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1272,10 +1272,10 @@ Traceback (most recent call last): File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image backbone_out = self.image_encoder(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1318,10 +1318,10 @@ RuntimeError: run_func_( container_handle_, input_handles.data(), input_handles. 
File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1364,10 +1364,10 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 474, in forward_image backbone_out[""backbone_fpn""][0] = self.sam_mask_decoder.conv_s0( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl @@ -1387,7 +1387,7 @@ RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR ,,,,,,,,,{'TORCHINDUCTOR_CACHE_DIR': '/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_10/mps_fast_export_inductor_cache_dir'},mps_ao_ppb_None_fast_export_gpu_preproc,82.75403904914856,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_10/exported_models/mps_ao_fast,,,16,,,,,,/home/cpuhrsch/blogs/tmp/sam2_amg_example_run_10/amg_baseline_annotations,,,,"W0104 18:14:14.202000 2235960 site-packages/torch/_logging/_internal.py:1084] [0/0] Profiler function will be ignored /home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/_inductor/compile_fx.py:222: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance. 
warnings.warn( -V0104 18:14:58.688000 2235960 site-packages/torch/_dynamo/guards.py:2760] [0/1] [__recompiles] Recompiling function _predict_masks in /home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py:432 +V0104 18:14:58.688000 2235960 site-packages/torch/_dynamo/guards.py:2760] [0/1] [__recompiles] Recompiling function _predict_masks in /home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py:432 V0104 18:14:58.688000 2235960 site-packages/torch/_dynamo/guards.py:2760] [0/1] [__recompiles] triggered by the following guard failure(s): V0104 18:14:58.688000 2235960 site-packages/torch/_dynamo/guards.py:2760] [0/1] [__recompiles] - 0/0: Ne(L['self']._modules['model']._modules['sam_mask_decoder']._modules['transformer']._modules['final_attn_token_to_image'].num_heads*((128//L['self']._modules['model']._modules['sam_mask_decoder']._modules['transformer']._modules['final_attn_token_to_image'].num_heads)), 8*L['point_coords'].elems.size()[0]) # (_inductor/pattern_matcher.py:1288 in ) [E104 18:15:24.766972949 shim_common.cpp:376] Exception in aoti_torch: CUDA out of memory. Tried to allocate 576.00 MiB. GPU 0 has a total capacity of 94.99 GiB of which 498.44 MiB is free. Including non-PyTorch memory, this process has 94.49 GiB memory in use. Of the allocated memory 91.63 GiB is allocated by PyTorch, and 1.31 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) @@ -1454,10 +1454,10 @@ Traceback (most recent call last): File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/utils/_contextlib.py"", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/sam2_image_predictor.py"", line 172, in set_image_batch backbone_out = self.model.forward_image(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File ""/home/cpuhrsch/dev/ao/torchao/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image + File ""/home/cpuhrsch/dev/ao/benchmarks/_models/sam2/modeling/sam2_base.py"", line 469, in forward_image backbone_out = self.image_encoder(img_batch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File ""/home/cpuhrsch/.conda/envs/nightly20241126py312/lib/python3.12/site-packages/torch/nn/modules/module.py"", line 1740, in _wrapped_call_impl diff --git a/torchao/_models/llama/__init__.py b/torchao/_models/llama/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/torchao/_models/llm/__init__.py b/torchao/_models/llm/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/torchao/prototype/awq/README.md b/torchao/prototype/awq/README.md index 1040610db5..5f50f2703c 100644 --- a/torchao/prototype/awq/README.md +++ b/torchao/prototype/awq/README.md @@ -2,7 +2,7 @@ Adapted from https://github.com/mit-han-lab/llm-awq ## Benchmarks -Evaluation perplexity numbers were calculated using the script in awq/example.py Group size of 64 was used for all quantization methods. For Llama-2-7b-chat-hf, performance benchmarks were calculated using the torchao/_models/llama/generate.py script and run on a 1xA100 80GB SXM4 instance. 
The awq-uint4 quantization method does not use an efficient fused kernel which is why performance is not great. awq-hqq uses tinygemm int4->bf16 kernel + hqq to provide better performance. +Evaluation perplexity numbers were calculated using the script in awq/example.py. Group size of 64 was used for all quantization methods. For Llama-2-7b-chat-hf, performance benchmarks were calculated using the benchmarks/_models/llama/generate.py script and run on a 1xA100 80GB SXM4 instance. The awq-uint4 quantization method does not use an efficient fused kernel, which is why performance is not great. awq-hqq uses tinygemm int4->bf16 kernel + hqq to provide better performance. | Model | Quantization | Tokens/sec | Throughput (GB/sec) | Peak Mem (GB) | Model Size (GB) | |--------------------|--------------|------------|---------------------|---------------|-----------------| @@ -23,9 +23,3 @@ The following tests were performed using LM eval and groupsize = 128 | | awq-uint4 | 11.409 | 0.519 | 0.756 | 0.577 | | | int4wo-hqq | 11.905 | 0.528 | 0.757 | 0.563 | | | int4wo-128 | 12.380 | 0.502 | 0.753 | 0.548 | - - - - - - diff --git a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py index 0fc875d44d..251dff5ba0 100644 --- a/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py +++ b/torchao/prototype/quantization/mixed_precision/scripts/BO_acc_throughput.py @@ -400,7 +400,7 @@ def run_sequential_BO( args, ): """ - currently use the loader and benchmark code from torchao/_models/llama/generate, + currently use the loader and benchmark code from benchmarks/_models/llama/generate, and use lm_eval for ppl evaluation """ # load tokenizers diff --git a/torchao/quantization/README.md b/torchao/quantization/README.md index 36f0befa80..5610779bfe 100644 --- a/torchao/quantization/README.md +++ b/torchao/quantization/README.md @@ -320,7 +320,7 @@ Note that the workaround is also required for `torch.compile` with `freezing` (` ### KV Cache Quantization We've added kv cache quantization and other features in order to enable long context length (and necessarily memory efficient) inference. -In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](../../torchao/_models/llama/README.md#KV-Cache-Quantization-Memory-Efficient-Inference) +In practice these features alongside int4 weight only quantization allow us to **reduce peak memory by ~55%**, meaning we can run Llama3.1-8B inference with a **130k context length with only 18.9 GB of peak memory.** More details can be found [here](../../benchmarks/_models/llama/README.md#KV-Cache-Quantization-Memory-Efficient-Inference) ### Sparse-Marlin @@ -346,7 +346,7 @@ Marlin QQQ is an optimized GPU kernel that supports W4A8 mixed precision GEMM. F | | w4a8-g128 | 187.62 | 640.32 | 4.82 | 3.41 | ### Gemlite Triton -Int4 and Int8 quantization using the [Gemlite Triton](https://github.com/mobiusml/gemlite) kernels. You can try it out with the `quantize_` api as above alongside the constructor `gemlite_uintx_weight_only`. An example can be found in `torchao/_models/llama/generate.py`. +Int4 and Int8 quantization using the [Gemlite Triton](https://github.com/mobiusml/gemlite) kernels.
You can try it out with the `quantize_` api as above alongside the constructor `gemlite_uintx_weight_only`. An example can be found in `benchmarks/_models/llama/generate.py`. Note: we test on gemlite 0.4.1, but should be able to use any version after that, we'd recommend to use the latest release to get the most recent performance improvements. @@ -362,7 +362,7 @@ We're trying to develop kernels for low bit quantization for intx quantization f | | uintx-4-64-hqq | 8.124 | 47.85 | 213.24 | 11.85 | 4.46 | | | uintx-2-8-hqq | 39.605 | 34.83 | 261.42 | 14.99 | 7.51 | -You try can out these apis with the `quantize_` api as above alongside the config `UIntXWeightOnlyConfig`. An example can be found in in `torchao/_models/llama/generate.py`. +You can try out these apis with the `quantize_` api as above alongside the config `UIntXWeightOnlyConfig`. An example can be found in `benchmarks/_models/llama/generate.py`. ### int8_dynamic_activation_intx_weight Quantization We have kernels that do 8-bit dynamic quantization of activations and uintx groupwise quantization of weights. These kernels are experimental and can only be run on a device with an ARM CPU (e.g., a Mac computers with Apple silicon). The benchmarks below were run on an M1 Mac Pro, with 8 perf cores, and 2 efficiency cores, and 32GB of RAM. In all cases, torch.compile was used. @@ -373,7 +373,7 @@ We have kernels that do 8-bit dynamic quantization of activations and uintx grou | | int8_dynamic_activation_intx_weight-4-256-false | 16.03 | 65.81 | NA | 4.11 | | | int8_dynamic_activation_intx_weight-3-256-false | 18.94 | 59.97 | NA | 3.17 | -You can try out these apis with the `quantize_` api as above alongside the constructor `int8_dynamic_activation_intx_weight`. An example can be found in `torchao/_models/llama/generate.py`. +You can try out these apis with the `quantize_` api as above alongside the constructor `int8_dynamic_activation_intx_weight`. An example can be found in `benchmarks/_models/llama/generate.py`. ### Codebook Quantization The benchmarks below were run on a single NVIDIA-A6000 GPU. @@ -385,7 +385,7 @@ The benchmarks below were run on a single NVIDIA-A6000 GPU. | Llama-3.1-8B| Base (bfloat16) | 7.713 | 32.16 | 482.70 | 16.35 | 15.01 | | | codebook-4-64 | 10.095 | 1.73 | 8.63 | 23.11 | 4.98 | -You try can out these apis with the `quantize_` api as above alongside the constructor `codebook_weight_only` an example can be found in in `torchao/_models/llama/generate.py`. +You can try out these apis with the `quantize_` api as above alongside the constructor `codebook_weight_only`. An example can be found in `benchmarks/_models/llama/generate.py`. ### Automatic Inductor Configuration diff --git a/torchao/sparsity/README.md b/torchao/sparsity/README.md index b689a3adf4..fced804b65 100644 --- a/torchao/sparsity/README.md +++ b/torchao/sparsity/README.md @@ -28,7 +28,7 @@ The following benchmarks we ran for sam ViT-h on an NVIDIA-A100-80GB, with batch | | 2:4 sparsity (attn + mlp) | 24.30 | 13429 | 0.5306 | **1.07x** | **91.31%** | | | int8 dynamic quant (attn)
int8 dynamic quant + 2:4 sparsity (mlp lin1)
2:4 sparsity (mlp lin2) | 26.46 | 14865 | 0.5668 | **1.16x** | **97.54%** | -To reproduce our benchmarks please follow these [instructions](/torchao/_models/sam/README.md). +To reproduce our benchmarks please follow these [instructions](/benchmarks/_models/sam/README.md). ### LLama3