Commit d0cd469

change llm model test from gemma3 to qwen to skip auth (#3807)
1 parent d76574f · commit d0cd469

File tree

2 files changed: +10 −6 lines


tests/py/dynamo/models/test_llm_models.py

Lines changed: 6 additions & 3 deletions

@@ -15,13 +15,13 @@
 
 @pytest.mark.unit
 @pytest.mark.parametrize("precision", ["FP16", "BF16", "FP32"])
-def test_gemma3_decoder_layer(precision):
+def test_llm_decoder_layer(precision):
 
     with torch.inference_mode():
         args = argparse.Namespace()
         args.debug = False
         args.num_tokens = 128
-        args.model = "google/gemma-3-1b-it"
+        args.model = "Qwen/Qwen2.5-0.5B-Instruct"
         args.precision = precision
         args.min_block_size = 1
         args.prompt = "What is parallel programming ?"
@@ -44,7 +44,10 @@ def test_gemma3_decoder_layer(precision):
             .to("cuda")
         )
 
-        register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
+        if register_sdpa._SDPA_MAPPING.get(args.model, None) is not None:
+            register_sdpa._SDPA_MAPPING[args.model](model_config=model.config)
+        else:
+            register_sdpa._SDPA_MAPPING["default"](model_config=model.config)
         model = model.to(dtype)
         # use randint will generate nan values in the logits, use a fixed input_ids for now
         # input_ids = torch.randint(0, model.config.vocab_size, (1, args.num_tokens)).to("cuda")
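
The new lookup in the test follows a registry-with-default pattern: _SDPA_MAPPING keys model names to registration callables, and "default" covers any model without a dedicated entry. A minimal self-contained sketch of that pattern, with a hypothetical registry (the real one lives in tools/llm/torchtrt_ext/register_sdpa.py and its contents are not shown in this diff):

from typing import Callable, Dict

# Hypothetical stand-in for the real registry in
# tools/llm/torchtrt_ext/register_sdpa.py.
_SDPA_MAPPING: Dict[str, Callable] = {
    "google/gemma-3-1b-it": lambda model_config: print("gemma3-specific SDPA registration"),
    "default": lambda model_config: print("generic SDPA registration"),
}

def register_model_sdpa(model_name: str, model_config: object) -> None:
    # Prefer a model-specific entry; fall back to "default", mirroring
    # the if/else added in the diff above.
    handler = _SDPA_MAPPING.get(model_name) or _SDPA_MAPPING["default"]
    handler(model_config=model_config)

# Qwen has no dedicated entry, so the "default" handler fires, which is
# why the test can switch models without touching the registry.
register_model_sdpa("Qwen/Qwen2.5-0.5B-Instruct", model_config=None)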

tools/llm/torchtrt_ext/sdpa_converter.py

Lines changed: 4 additions & 3 deletions

@@ -257,9 +257,10 @@ def scaled_dot_product_attention(
         attn_bias = impl.unary.log(
             ctx, target, source_ir, name + "_log", one_minus_temp_mask
         )
-        scaled_add_attn_bias = impl.elementwise.add(
-            ctx, target, source_ir, name + "_attn_bias_add", mm, attn_bias
-        )
+
+    scaled_add_attn_bias = impl.elementwise.add(
+        ctx, target, source_ir, name + "_attn_bias_add", mm, attn_bias
+    )
     softmax = impl.normalization.softmax(
         ctx, target, source_ir, name + "_softmax", scaled_add_attn_bias, -1, False
     )
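
For reference, the converter calls above are the TensorRT-side spelling of standard attention math: mm holds the scaled Q·Kᵀ product, the elementwise add applies attn_bias, and softmax normalizes over the last dimension before the value matmul. A minimal eager-PyTorch sketch of that sequence, with names matching the converter's intermediates; the shapes and causal mask are purely illustrative assumptions:

import torch
import torch.nn.functional as F

# Illustrative shapes: batch=1, heads=2, seq_len=4, head_dim=8.
q = torch.randn(1, 2, 4, 8)
k = torch.randn(1, 2, 4, 8)
v = torch.randn(1, 2, 4, 8)

# Additive causal bias: 0 where attending is allowed, -inf where masked.
causal = torch.triu(torch.ones(4, 4, dtype=torch.bool), diagonal=1)
attn_bias = torch.zeros(4, 4).masked_fill(causal, float("-inf"))

mm = (q @ k.transpose(-2, -1)) / (q.size(-1) ** 0.5)  # scaled Q @ K^T
scaled_add_attn_bias = mm + attn_bias                 # the elementwise add above
softmax = F.softmax(scaled_add_attn_bias, dim=-1)     # softmax over last dim
out = softmax @ v

# Cross-check against PyTorch's fused implementation.
ref = F.scaled_dot_product_attention(q, k, v, is_causal=True)
assert torch.allclose(out, ref, atol=1e-5)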
