
Commit 70874be

llm: register sdpa variant (#3802)
Parent commit: 9241476

File tree: 7 files changed, +428 −101 lines

docsrc/tutorials/compile_hf_models.rst

Lines changed: 6 additions & 2 deletions
@@ -59,6 +59,10 @@ We have officially verified support for the following LLM families:
       | Qwen/Qwen2.5-7B-Instruct
     - FP16, FP32
     - Yes
+   * - Gemma 3
+     - | google/gemma-3-1b-it
+     - FP16, FP32
+     - Yes

 Getting Started with run_llm.py
 -------------------------------
@@ -185,8 +189,8 @@ The number of key/value cache tensors is equal to the number of attention heads

 Generating Outputs
 -------------------
-We use custom `generate <https://github.com/pytorch/TensorRT/blob/main/tools/llm/utils.py#L112>`_ function to generate the outputs. This function performs standard autoregressive decoding without KV caching.
-There is also a `generate_with_static_cache <https://github.com/pytorch/TensorRT/blob/main/tools/llm/utils.py#L141>`_ function that performs autoregressive decoding with KV caching.
+We use custom `generate <https://github.com/pytorch/TensorRT/blob/9241476a868af46169348ab730d18907365a66ee/tools/llm/utils.py#L112>`_ function to generate the outputs. This function performs standard autoregressive decoding without KV caching.
+There is also a `generate_with_static_cache <https://github.com/pytorch/TensorRT/blob/9241476a868af46169348ab730d18907365a66ee/tools/llm/utils.py#L141>`_ function that performs autoregressive decoding with KV caching.

 The ``generate_with_static_cache`` function takes care of preparing the inputs to the model compiled with static KV cache.
 The model inputs are ``input_ids``, ``position_ids``, ``key_cache_0``, ``value_cache_0``, ...., ``start_idx``, ``end_idx``.
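
For readers of this tutorial section, here is a minimal sketch of what standard autoregressive decoding without KV caching looks like. This is an illustrative greedy-decoding loop, not the actual `generate` implementation in `tools/llm/utils.py`; the helper name `greedy_generate` and the exact argument set are assumptions.

```python
import torch


def greedy_generate(model, input_ids, max_new_tokens, eos_token_id):
    """Illustrative greedy autoregressive decoding without KV caching.

    Each step re-feeds the full sequence, so attention over all
    previous tokens is recomputed on every forward pass.
    """
    seq = input_ids
    for _ in range(max_new_tokens):
        position_ids = torch.arange(seq.shape[1], device=seq.device).unsqueeze(0)
        logits = model(seq, position_ids=position_ids).logits
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        seq = torch.cat([seq, next_token], dim=-1)
        if next_token.item() == eos_token_id:
            break
    return seq
```

A KV-cached variant such as ``generate_with_static_cache`` avoids this recomputation by threading the cache tensors and ``start_idx``/``end_idx`` markers through each step, as described above.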

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 23 additions & 3 deletions
@@ -1,5 +1,5 @@
 import logging
-from typing import Callable, Optional, Sequence, Union
+from typing import Any, Callable, Optional, Sequence, Union

 import torch
 from torch_tensorrt.dynamo._settings import CompilationSettings
@@ -53,20 +53,28 @@
 def _aten_lowering_pass(
     *args: LoweringPassSignature,
     index: Optional[int] = None,
+    **kwargs: Any,
 ) -> Union[
     LoweringPassSignature, Callable[[LoweringPassSignature], LoweringPassSignature]
 ]:
     """Adds a lowering pass to the registry, at a specified index if desired

     If no index is specified, the lowering pass is inserted at the end of the list
+
+    Additional keyword arguments can be passed to configure the lowering pass behavior.
+    These will be stored as metadata on the pass function.
     """

     def add_lowering_pass(
         lowering_pass: LoweringPassSignature,
     ) -> LoweringPassSignature:
+        # Store additional parameters as metadata on the function
+        if kwargs:
+            lowering_pass._lowering_pass_config = kwargs
+
         ATEN_POST_LOWERING_PASSES.add_pass_with_index(lowering_pass, index)
         logger.debug(
-            f"Added lowering pass {lowering_pass} to list at index {index}, current passlist: {ATEN_POST_LOWERING_PASSES}"
+            f"Added lowering pass {lowering_pass} to list at index {index} with config {kwargs}, current passlist: {ATEN_POST_LOWERING_PASSES}"
         )
         return lowering_pass

@@ -81,7 +89,7 @@ def add_lowering_pass(
             f"aten_lowering_pass decorator called with invalid arguments {args} "
             "To specify an index to insert the pass, use the keyword 'index='"
         )
-    # If no arguments are specified, the decorator was called with an index keyword
+    # If no arguments are specified, the decorator was called with keyword arguments
     else:
         return add_lowering_pass

@@ -95,6 +103,18 @@ def _remove_lowering_pass(*, index: int) -> None:
     return


+def get_lowering_pass_config(lowering_pass: LoweringPassSignature) -> dict[str, Any]:
+    """Get the configuration parameters for a lowering pass function
+
+    Args:
+        lowering_pass: The lowering pass function
+
+    Returns:
+        Dictionary containing the configuration parameters, or empty dict if none
+    """
+    return getattr(lowering_pass, "_lowering_pass_config", {})
+
+
 def post_lowering(
     gm: torch.fx.GraphModule, settings: CompilationSettings = CompilationSettings()
 ) -> torch.fx.GraphModule:
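
As a hedged sketch of how the extended decorator and ``get_lowering_pass_config`` could be used together: the pass name ``gemma_specific_pass`` and the ``model_name`` keyword below are hypothetical examples, and the ``(gm, settings)`` pass signature is assumed from ``LoweringPassSignature``.

```python
import torch
from torch_tensorrt.dynamo._settings import CompilationSettings
from torch_tensorrt.dynamo.lowering.passes._aten_lowering_pass import (
    _aten_lowering_pass,
    get_lowering_pass_config,
)


# Hypothetical pass: the extra keyword argument is stored as metadata on the function
@_aten_lowering_pass(index=0, model_name="google/gemma-3-1b-it")
def gemma_specific_pass(
    gm: torch.fx.GraphModule, settings: CompilationSettings
) -> torch.fx.GraphModule:
    # Retrieve the configuration the decorator attached via _lowering_pass_config
    config = get_lowering_pass_config(gemma_specific_pass)
    if config.get("model_name") == "google/gemma-3-1b-it":
        pass  # model-specific graph rewriting would go here
    return gm
```

Because the configuration travels with the function object itself (as ``_lowering_pass_config``), the pass registry and ``post_lowering`` flow do not need to change to support per-pass options.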
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+import os
+import sys
+
+import pytest
+import torch
+import torch_tensorrt
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../tools/llm"))
+import argparse
+
+from run_llm import compile_torchtrt
+from torchtrt_ext import register_sdpa
+
+
+@pytest.mark.unit
+@pytest.mark.parametrize("precision", ["FP16", "BF16", "FP32"])
+def test_gemma3_decoder_layer(precision):
+
+    with torch.inference_mode():
+        args = argparse.Namespace()
+        args.debug = False
+        args.num_tokens = 128
+        args.model = "google/gemma-3-1b-it"
+        args.precision = precision
+        args.min_block_size = 1
+        args.prompt = "What is parallel programming ?"
+        if args.precision == "FP16":
+            dtype = torch.float16
+        elif args.precision == "BF16":
+            dtype = torch.bfloat16
+        else:
+            args.precision = "FP32"
+            dtype = torch.float32
+
+        model = (
+            AutoModelForCausalLM.from_pretrained(
+                args.model,
+                use_cache=False,
+                attn_implementation="sdpa",
+                num_hidden_layers=1,
+            )
+            .eval()
+            .to("cuda")
+        )
+
+        register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
+        model = model.to(dtype)
+        # use randint will generate nan values in the logits, use a fixed input_ids for now
+        # input_ids = torch.randint(0, model.config.vocab_size, (1, args.num_tokens)).to("cuda")
+        input_ids = torch.tensor([[2, 3689, 563, 10616, 14929, 2360]]).to("cuda")
+
+        position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).to("cuda")
+        pyt_outputs = model(input_ids.clone(), position_ids=position_ids.clone())
+        trt_model = compile_torchtrt(model, input_ids, args)
+        trt_outputs = trt_model(input_ids, position_ids=position_ids)
+
+        torch.testing.assert_close(
+            pyt_outputs.logits, trt_outputs.logits, rtol=5e-1, atol=5e-1
+        )

tools/llm/README.md

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ We have officially verified support for the following models:
 | LLaMA 3.2 | meta-llama/Llama-3.2-1B-Instruct<br>meta-llama/Llama-3.2-3B-Instruct | FP16, FP32 | Yes |
 | Qwen 2.5 | Qwen/Qwen2.5-0.5B-Instruct<br>Qwen/Qwen2.5-1.5B-Instruct<br>Qwen/Qwen2.5-4B-Instruct<br>Qwen/Qwen2.5-7B-Instruct | FP16, FP32 | Yes |
 | Qwen 3 | Qwen/Qwen3-0.6B<br>Qwen/Qwen3-1.7B<br>Qwen/Qwen3-4B<br>Qwen/Qwen3-8B | FP16, FP32 | Yes |
+| Gemma 3 | google/gemma-3-1b-it | FP16, FP32 | Yes |


 ### Usage

tools/llm/run_llm.py

Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,11 @@ def get_model(args):
         .eval()
         .cuda()
     )
+    # register SDPA variant for the model
+    if register_sdpa._SDPA_MAPPING.get(args.model, None) is not None:
+        register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
+    else:
+        register_sdpa._SDPA_MAPPING["default"](model_config=model.config)

     if args.precision == "FP16":
         model = model.to(torch.float16)
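
A hedged sketch of what this mapping-with-fallback pattern enables outside ``run_llm.py``: only the shape of ``_SDPA_MAPPING`` (a model id or ``"default"`` key mapping to a callable that accepts ``model_config=``) is taken from the diff; the model id below and the ``sys.path`` setup (assuming the repository root as the working directory) are illustrative.

```python
import os
import sys

from transformers import AutoConfig

# The SDPA registration helpers live under tools/llm in this repo
sys.path.append(os.path.join("tools", "llm"))
from torchtrt_ext import register_sdpa  # noqa: E402

# Illustrative model id; any supported Hugging Face causal-LM id could appear here
model_id = "google/gemma-3-1b-it"
config = AutoConfig.from_pretrained(model_id)

# Resolve the model-specific SDPA variant, falling back to the generic one,
# mirroring the lookup added to get_model() above
register_fn = register_sdpa._SDPA_MAPPING.get(
    model_id, register_sdpa._SDPA_MAPPING["default"]
)
register_fn(model_config=config)
```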
