22 | 22 | from torch.testing._internal.common_utils import TestCase |
23 | 23 |
24 | 24 | from torchao import quantize_ |
25 | | -from benchmarks._models.llama.model import Transformer, prepare_inputs_for_model |
26 | | -from benchmarks._models.llama.tokenizer import get_tokenizer |
| 25 | +from torchao._models.model import Transformer, prepare_inputs_for_model |
| 26 | +from torchao._models.tokenizer import get_tokenizer |
27 | 27 | from torchao.dtypes import AffineQuantizedTensor |
28 | 28 | from torchao.quantization import LinearActivationQuantizedTensor |
29 | 29 | from torchao.quantization.quant_api import ( |
@@ -278,7 +278,7 @@ def test_8da4w_quantizer(self): |
278 | 278 | # https://github.com/pytorch-labs/gpt-fast/blob/6253c6bb054e658d67566150f87329b87815ae63/scripts/convert_hf_checkpoint.py |
279 | 279 | @unittest.skip("skipping until we get checkpoints for gpt-fast") |
280 | 280 | def test_8da4w_gptq_quantizer(self): |
281 | | - from benchmarks._models._eval import InputRecorder, TransformerEvalWrapper |
| 281 | + from torchao._models._eval import InputRecorder, TransformerEvalWrapper |
282 | 282 | from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer |
283 | 283 |
284 | 284 | # should be similar to TorchCompileDynamicQuantizer |
@@ -348,7 +348,7 @@ def test_8da4w_gptq_quantizer(self): |
348 | 348 | not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is lower than 2.4"
349 | 349 | ) |
350 | 350 | def test_8da4w_quantizer_eval(self): |
351 | | - from benchmarks._models._eval import TransformerEvalWrapper |
| 351 | + from torchao._models._eval import TransformerEvalWrapper |
352 | 352 | from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer |
353 | 353 |
354 | 354 | precision = torch.bfloat16 |
@@ -384,7 +384,7 @@ def test_8da4w_quantizer_eval(self): |
384 | 384 |
385 | 385 | @unittest.skip("skipping until we get checkpoints for gpt-fast") |
386 | 386 | def test_gptq_quantizer_int4_weight_only(self): |
387 | | - from benchmarks._models._eval import ( |
| 387 | + from torchao._models._eval import ( |
388 | 388 | MultiTensorInputRecorder, |
389 | 389 | TransformerEvalWrapper, |
390 | 390 | ) |
@@ -454,7 +454,7 @@ def test_gptq_quantizer_int4_weight_only(self): |
454 | 454 |
455 | 455 | @unittest.skip("skipping until we get checkpoints for gpt-fast") |
456 | 456 | def test_quantizer_int4_weight_only(self): |
457 | | - from benchmarks._models._eval import TransformerEvalWrapper |
| 457 | + from torchao._models._eval import TransformerEvalWrapper |
458 | 458 | from torchao.quantization.GPTQ import Int4WeightOnlyQuantizer |
459 | 459 |
460 | 460 | precision = torch.bfloat16 |
@@ -492,7 +492,7 @@ def test_quantizer_int4_weight_only(self): |
492 | 492 |
493 | 493 | @unittest.skip("skipping until we get checkpoints for gpt-fast") |
494 | 494 | def test_eval_wrapper(self): |
495 | | - from benchmarks._models._eval import TransformerEvalWrapper |
| 495 | + from torchao._models._eval import TransformerEvalWrapper |
496 | 496 |
497 | 497 | precision = torch.bfloat16 |
498 | 498 | device = "cuda" |
@@ -525,7 +525,7 @@ def test_eval_wrapper(self): |
525 | 525 | # EVAL IS CURRENTLY BROKEN FOR LLAMA 3, VERY LOW ACCURACY |
526 | 526 | @unittest.skip("skipping until we get checkpoints for gpt-fast") |
527 | 527 | def test_eval_wrapper_llama3(self): |
528 | | - from benchmarks._models._eval import TransformerEvalWrapper |
| 528 | + from torchao._models._eval import TransformerEvalWrapper |
529 | 529 |
530 | 530 | precision = torch.bfloat16 |
531 | 531 | device = "cuda" |