 import time
 from datetime import datetime
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Optional

 import torch
 import torch._dynamo.config
 import torch._inductor.config

 import torchao
-from benchmarks._models.utils import (
+from torchao._models.utils import (
+    _load_model,
+    decode_n_tokens,
+    decode_one_token,
+    encode_tokens,
     get_arch_name,
+    prefill,
     write_json_result_local,
     write_json_result_ossci,
 )
 from torchao.quantization.quant_primitives import MappingType
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, get_model_size_in_bytes
+from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_5,
+    default_device,
+    device_sync,
+    get_model_size_in_bytes,
+)

 torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False
 torch.backends.cuda.enable_cudnn_sdp(True)
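
A note on the `torchao.utils` imports above: `default_device` and `device_sync` were previously defined inline in this script (they are deleted in the next hunk) and are now consumed from the library. A minimal sketch of what those helpers do, mirroring the deleted block, with a `hasattr` guard added as a defensive assumption for builds without XPU support:

```python
import torch

# Probe backends in priority order: CUDA, then XPU, then CPU fallback.
default_device = (
    "cuda"
    if torch.cuda.is_available()
    else "xpu"
    if hasattr(torch, "xpu") and torch.xpu.is_available()
    else "cpu"
)


def device_sync(device: str) -> None:
    # Block the host until all queued kernels on `device` finish; required
    # for accurate wall-clock timing of asynchronous GPU work.
    if "cuda" in device:
        torch.cuda.synchronize(device)
    elif "xpu" in device:
        torch.xpu.synchronize(device)
    elif ("cpu" in device) or ("mps" in device):
        pass  # CPU ops are synchronous already
    else:
        print(f"device={device} is not yet supported")
```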
@@ -49,97 +59,12 @@ def device_timer(device):
         print(f"device={device} is not yet supported")


-def device_sync(device):
-    if "cuda" in device:
-        torch.cuda.synchronize(device)
-    elif "xpu" in device:
-        torch.xpu.synchronize(device)
-    elif ("cpu" in device) or ("mps" in device):
-        pass
-    else:
-        print(f"device={device} is not yet supported")
-
-
-default_device = (
-    "cuda"
-    if torch.cuda.is_available()
-    else "xpu"
-    if torch.xpu.is_available()
-    else "cpu"
-)
-
 # support running without installing as a package
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))

-from torchao._models.model import Transformer, prepare_inputs_for_model
-from torchao._models.tokenizer import get_tokenizer
-
-
-def multinomial_sample_one_no_sync(
-    probs_sort,
-):  # Does multinomial sampling without a cuda synchronization
-    q = torch.empty_like(probs_sort).exponential_(1)
-    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
-
-
-def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
-    logits = logits / max(temperature, 1e-5)
-
-    if top_k is not None:
-        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-        pivot = v.select(-1, -1).unsqueeze(-1)
-        logits = torch.where(logits < pivot, -float("Inf"), logits)
-    probs = torch.nn.functional.softmax(logits, dim=-1)
-    return probs
-
-
-def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
-    probs = logits_to_probs(logits[:, -1], temperature, top_k)
-    idx_next = multinomial_sample_one_no_sync(probs)
-    return idx_next, probs
-
-
-def prefill(
-    model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
-) -> torch.Tensor:
-    # input_pos: [B, S]
-    logits = model(x, input_pos)
-    return sample(logits, **sampling_kwargs)[0]
-
-
-def decode_one_token(
-    model: Transformer, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    # input_pos: [B, 1]
-    assert input_pos.shape[-1] == 1
-    logits = model(x, input_pos)
-    return sample(logits, **sampling_kwargs)
-
-
-def decode_n_tokens(
-    model: Transformer,
-    cur_token: torch.Tensor,
-    input_pos: torch.Tensor,
-    num_new_tokens: int,
-    callback=lambda _: _,
-    **sampling_kwargs,
-):
-    new_tokens, new_probs = [], []
-    for i in range(num_new_tokens):
-        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH):
-            next_token, next_prob = decode_one_token(
-                model, cur_token, input_pos, **sampling_kwargs
-            )
-            next_token, next_prob = next_token.clone(), next_prob.clone()
-        input_pos += 1
-        # in some instances not having this causes weird issues with the stored tokens when you run the next decode_one_token step
-        new_tokens.append(next_token.clone())
-        callback(new_tokens[-1])
-        new_probs.append(next_prob)
-        cur_token = next_token
-
-    return new_tokens, new_probs
+from torchao._models.llm.model import Transformer, prepare_inputs_for_model
+from torchao._models.llm.tokenizer import get_tokenizer


 def model_forward(model, x, input_pos):
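
The generation helpers deleted above now live in `torchao._models.utils`. The non-obvious piece is `multinomial_sample_one_no_sync`: instead of `torch.multinomial`, which forces a host-device synchronization, it uses the exponential-race (Gumbel-max style) trick: dividing the probabilities by i.i.d. `Exp(1)` noise and taking the argmax selects index `i` with probability `p_i`, entirely on-device. A standalone sketch of the relocated sampling path, matching the deleted code apart from the added comments:

```python
from typing import Optional, Tuple

import torch


def multinomial_sample_one_no_sync(probs: torch.Tensor) -> torch.Tensor:
    # argmax(p_i / E_i) with E_i ~ Exp(1) draws index i with probability p_i,
    # without the CUDA sync that torch.multinomial would incur.
    q = torch.empty_like(probs).exponential_(1)
    return torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int)


def logits_to_probs(
    logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None
) -> torch.Tensor:
    logits = logits / max(temperature, 1e-5)  # clamp to avoid division by zero
    if top_k is not None:
        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
        pivot = v.select(-1, -1).unsqueeze(-1)  # smallest logit kept by top-k
        logits = torch.where(logits < pivot, -float("Inf"), logits)
    return torch.nn.functional.softmax(logits, dim=-1)


def sample(
    logits: torch.Tensor, temperature: float = 1.0, top_k: Optional[int] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Sample the next token from the logits of the final position only.
    probs = logits_to_probs(logits[:, -1], temperature, top_k)
    return multinomial_sample_one_no_sync(probs), probs
```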
@@ -230,25 +155,6 @@ def generate(
     return seq


-def encode_tokens(tokenizer, string, bos=True, device=default_device):
-    tokens = tokenizer.encode(string)
-    if bos:
-        tokens = [tokenizer.bos_id()] + tokens
-    return torch.tensor(tokens, dtype=torch.int, device=device)
-
-
-def _load_model(checkpoint_path, device, precision):
-    checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
-    if "model" in checkpoint and "stories" in str(checkpoint_path):
-        checkpoint = checkpoint["model"]
-    with torch.device("meta"):
-        model = Transformer.from_name(checkpoint_path.parent.name)
-    model.load_state_dict(checkpoint, assign=True)
-    model = model.to(device=device, dtype=precision)
-
-    return model.eval()
-
-
 B_INST, E_INST = "[INST]", "[/INST]"


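
Worth noting in the relocated `_load_model`: the `Transformer` is instantiated under `torch.device("meta")`, so no parameter memory is allocated at construction time, and `load_state_dict(..., assign=True)` then adopts the mmap'd checkpoint tensors directly rather than copying into preallocated storage. A generic sketch of the pattern, where `build_model` is a hypothetical constructor standing in for `Transformer.from_name(checkpoint_path.parent.name)`:

```python
import torch
from pathlib import Path


def load_mmap_checkpoint(build_model, checkpoint_path: Path, device: str,
                         precision: torch.dtype) -> torch.nn.Module:
    # mmap=True leaves tensor data on disk until first touch; weights_only=True
    # restricts unpickling to tensor payloads for safety.
    checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
    # Build the module skeleton on the meta device: shapes and dtypes only,
    # no real storage, so construction is cheap even for multi-GB models.
    with torch.device("meta"):
        model = build_model()
    # assign=True replaces the meta parameters with the checkpoint tensors
    # outright; copying into meta storage would fail.
    model.load_state_dict(checkpoint, assign=True)
    return model.to(device=device, dtype=precision).eval()
```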
@@ -575,8 +481,8 @@ def ffn_or_attn_only(mod, fqn):
             model, float8_dynamic_activation_float8_weight(granularity=granularity)
         )
     elif "autoquant_v2" in quantization:
-        from torchao._models.model import prepare_inputs_for_model
         from torchao._models._eval import InputRecorder
+        from torchao._models.llm.model import prepare_inputs_for_model
         from torchao.prototype.quantization.autoquant_v2 import autoquant_v2

         calibration_seq_length = 256
@@ -665,8 +571,8 @@ def ffn_or_attn_only(mod, fqn):
         # do autoquantization
         model.finalize_autoquant()
     elif "autoquant" in quantization:
-        from torchao._models.model import prepare_inputs_for_model
         from torchao._models._eval import InputRecorder
+        from torchao._models.llm.model import prepare_inputs_for_model

         calibration_seq_length = 256
         inputs = (