@@ -230,6 +230,7 @@ class GeneratorArgs:
     max_autotune: bool = False
     # (Misnomer) See Issue: https://github.com/pytorch/torchchat/issues/1273
     is_torchtune_model: bool = False
+    accumulate_tokens: int = 8
 
     def __post_init__(self):
         if self.compile_prefill and self.sequential_prefill:
@@ -294,6 +295,7 @@ def from_args(cls, args):
             sequential_prefill=sequential_prefill,
             max_autotune=args.max_autotune,
             is_torchtune_model=args.model and args.model.endswith("tune"),
+            accumulate_tokens=getattr(args, "accumulate_tokens", 8),
         )
 
 
@@ -530,12 +532,13 @@ def decode_n_tokens(
         need_probs: bool,
         batch=Optional[Dict[str, Any]],  # Inputs for multimodal models
         callback=lambda _: _,
+        accumulate_tokens: int = 8,
         eos_token_id: int = 2,
         eot_id: Optional[int] = None,
         attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         **sampling_kwargs,
     ):
-        new_tokens, new_probs = [], []
+        new_tokens = []
         encountered_eos = False
         for _i in range(
             num_new_tokens - 1
@@ -554,38 +557,58 @@ def decode_n_tokens(
                 )
                 input_pos += 1
                 new_tokens.append(next_token.clone())
-                callback(new_tokens[-1], done_generating=_i == num_new_tokens - 2)
-                if need_probs or next_prob is None:
+
+                done_generating = _i == num_new_tokens - 2
+                if need_probs:
+                    callback(new_tokens[-1], done_generating=done_generating)
+                if not need_probs or next_prob is None:
                     yield out_token, None
                 else:
-                    new_probs.append(next_prob.clone())
                     yield out_token, next_prob.clone()
                 cur_token = next_token
 
-                # encountered eos
-                if next_token.item() == eos_token_id or (
-                    eot_id is not None and next_token.item() == eot_id
-                ):
-                    encountered_eos = True
-                    final_token, next_prob = self.decode_one_token(
-                        model,
-                        cur_token,
-                        input_pos,
-                        need_probs,
-                        batch=batch,
-                        **sampling_kwargs,
-                    )
-                    input_pos += 1
-                    yield cur_token.clone(), next_prob.clone()
-                    break
+                if need_probs:
+                    # encountered eos
+                    if next_token.item() == eos_token_id or (
+                        eot_id is not None and next_token.item() == eot_id
+                    ):
+                        encountered_eos = True
+                        final_token, next_prob = self.decode_one_token(
+                            model,
+                            cur_token,
+                            input_pos,
+                            need_probs,
+                            batch=batch,
+                            **sampling_kwargs,
+                        )
+                        input_pos += 1
+                        yield cur_token.clone(), next_prob.clone()
+                        break
+                else:
+                    callback_pos = _i % accumulate_tokens + 1
+                    if done_generating or callback_pos == accumulate_tokens:
+                        callback_num = min(accumulate_tokens, callback_pos)
+                        for i in range(callback_num, 0, -1):
+                            callback(new_tokens[-i], done_generating=done_generating)
+
+                            token_item = new_tokens[-i].item()
+                            # encountered eos
+                            if token_item == eos_token_id or (
+                                eot_id is not None and token_item == eot_id
+                            ):
+                                encountered_eos = True
+                                input_pos += 1
+                                yield new_tokens[-i].clone(), None
+                                break
+                        if encountered_eos:
+                            break
 
         if not encountered_eos:
             eos_token = torch.tensor(
                 [eos_token_id if eot_id is None else eot_id],
                 dtype=cur_token.dtype,
                 device=cur_token.device,
             )
-            new_tokens.append(eos_token.clone())
             eos_token, next_prob = self.decode_one_token(
                 model,
                 eos_token.view(1, -1),
@@ -685,6 +708,7 @@ def generate(
         speculate_k: Optional[int] = 8,
         sequential_prefill=True,
         callback=lambda x: x,
+        accumulate_tokens: int,
         max_seq_length: int,
         attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
         seed: Optional[int] = None,
@@ -788,14 +812,14 @@ def generate(
             input_pos = input_pos + num_added
             next_token = next_tokens[-1]
         else:
-            generated_tokens = []
             for generated_token, _ in self.decode_n_tokens(
                 model,
                 next_token,
                 input_pos,
                 max_new_tokens - 1,
                 batch=batch,
                 callback=callback,
+                accumulate_tokens=accumulate_tokens,
                 need_probs=False,
                 eos_token_id=self.tokenizer.eos_id() if self.tokenizer else 2,
                 eot_id=(
@@ -806,7 +830,6 @@ def generate(
                 attention_backend=attention_backend,
                 **sampling_kwargs,
             ):
-                generated_tokens.append(generated_token.view(-1))
                 yield generated_token, None
 
         generate_stats = {
@@ -1185,6 +1208,7 @@ def callback(x, *, done_generating=False):
                 chat_mode=generator_args.chat_mode,
                 batch=batch,
                 callback=callback,
+                accumulate_tokens=generator_args.accumulate_tokens,
                 temperature=generator_args.temperature,
                 top_k=generator_args.top_k,
                 sequential_prefill=generator_args.sequential_prefill,
@@ -1213,8 +1237,10 @@ def callback(x, *, done_generating=False):
                     print(prof.key_averages().table(sort_by="self_cpu_time_total"))
                 elif self.builder_args.device == "cuda":
                     print(prof.key_averages().table(sort_by="self_cuda_time_total"))
-                else:
+                elif self.builder_args.device == "xpu":
                     print(prof.key_averages().table(sort_by="self_xpu_time_total"))
+                elif self.builder_args.device == "npu":
+                    print(prof.key_averages().table(sort_by="self_npu_time_total"))
                 prof.export_chrome_trace(f"{self.profile}.json")
 
             if start_pos >= max_seq_length:
@@ -1229,11 +1255,7 @@ def callback(x, *, done_generating=False):
                 t - aggregate_metrics.get("time_to_first_token", 0)
             )
 
-            if jit_compile:
-                print(
-                    f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds"
-                )
-            else:
+            if not jit_compile:
                 # aggregate_metrics will not append when is jit_compile, which will affect the average numbers.
                 aggregate_metrics["tokens_per_sec"].append(tokens_sec)
                 aggregate_metrics["first_token_per_sec"].append(first_token_sec)
@@ -1257,6 +1279,10 @@ def callback(x, *, done_generating=False):
                 logging.info(
                     f"*** This first iteration will include cold start effects for dynamic import, hardware caches{', JIT compilation' if jit_compile else ''}. ***"
                 )
+                if jit_compile:
+                    logging.info(
+                        f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds"
+                    )
             print("\n========================================\n")
             if start_pos >= max_seq_length:
                 if generator_args.chat_mode:
@@ -1299,8 +1325,10 @@ def callback(x, *, done_generating=False):
         )
         if torch.cuda.is_available():
             print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
-        if torch.xpu.is_available():
+        elif torch.xpu.is_available():
             print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB")
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            print(f"Memory used: {torch.npu.max_memory_reserved() / 1e9:.02f} GB")
 
 
 
@@ -1595,7 +1623,6 @@ def sample(
 
     return idx_next, probs
 
-
 def run_generator(
     args,
     rank: Optional[int] = None
@@ -1628,8 +1655,10 @@ def run_generator(
     )
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
-    if torch.xpu.is_available():
+    elif torch.xpu.is_available():
         torch.xpu.reset_peak_memory_stats()
+    elif hasattr(torch, "npu") and torch.npu.is_available():
+        torch.npu.reset_peak_memory_stats()
 
     for _ in gen.chat(generator_args):
         pass
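
When `need_probs` is False, the rewritten `decode_n_tokens` above defers the per-token `callback` and flushes it in groups of `accumulate_tokens` (or at the end of generation, or when EOS is hit mid-batch). Below is a minimal, self-contained sketch of that batching pattern in plain Python; the function name, the integer "tokens", and the callback signature are illustrative stand-ins, not torchchat's actual API.

```python
from typing import Callable, Iterable, Iterator


def decode_with_batched_callback(
    tokens: Iterable[int],
    callback: Callable[[int, bool], None],
    accumulate_tokens: int = 8,
    eos_token_id: int = 2,
) -> Iterator[int]:
    """Yield tokens while invoking `callback` in batches of `accumulate_tokens`."""
    tokens = list(tokens)
    pending: list[int] = []
    for i, tok in enumerate(tokens):
        pending.append(tok)
        done = i == len(tokens) - 1
        # Flush the buffered callbacks when the buffer is full or generation ends.
        if done or len(pending) == accumulate_tokens:
            for j, buffered in enumerate(pending):
                callback(buffered, done and j == len(pending) - 1)
                yield buffered
                if buffered == eos_token_id:
                    return  # stop once EOS is seen, mirroring encountered_eos above
            pending.clear()


if __name__ == "__main__":
    out = list(
        decode_with_batched_callback(
            [5, 7, 9, 11, 2, 13],
            callback=lambda t, done: print(f"token={t} done={done}"),
            accumulate_tokens=3,
        )
    )
    print("emitted:", out)  # EOS (2) ends emission before 13
```

With `accumulate_tokens=3` and an EOS id of 2, the example flushes callbacks in groups of three and stops emitting as soon as the EOS token is reached, which is the same effect the `callback_pos` / `encountered_eos` logic in the diff achieves for the non-probability decode path.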