
Commit f4b6184

Fixed SDPA perf gap
1 parent e9b099a commit f4b6184

File tree

3 files changed (+13, -9 lines)

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py

Lines changed: 1 addition & 1 deletion
@@ -531,7 +531,7 @@ def aten_ops_gelu(
     )
 
 
-@dynamo_tensorrt_converter(torch.ops.aten.matmul, supports_dynamic_shapes=True)
+@dynamo_tensorrt_converter(torch.ops.aten.matmul.default, supports_dynamic_shapes=True)
 @dynamo_tensorrt_converter(torch.ops.aten.dot.default, supports_dynamic_shapes=True)
 @dynamo_tensorrt_converter(torch.ops.aten.mm.default, supports_dynamic_shapes=True)
 @dynamo_tensorrt_converter(torch.ops.aten.mv.default, supports_dynamic_shapes=True)
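Note on the converter change above: torch.ops.aten.matmul is an OpOverloadPacket, while Dynamo-produced FX graphs use concrete OpOverload targets such as torch.ops.aten.matmul.default, so registering on .default keys the converter to the node target that actually appears in the graph. A minimal sketch of that distinction in plain PyTorch (nothing here is Torch-TensorRT-specific):

import torch

# torch.ops.aten.matmul is an OpOverloadPacket (a bundle of overloads);
# torch.ops.aten.matmul.default is the single OpOverload that FX/Dynamo
# graph nodes use as their call target.
packet = torch.ops.aten.matmul
overload = torch.ops.aten.matmul.default

print(type(packet).__name__)    # OpOverloadPacket
print(type(overload).__name__)  # OpOverload
print(overload in {getattr(packet, name) for name in packet.overloads()})  # True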

py/torch_tensorrt/dynamo/lowering/_decomposition_groups.py

Lines changed: 1 addition & 0 deletions
@@ -171,6 +171,7 @@
     aten.upsample_bilinear2d.vec,
     aten.upsample_trilinear3d.vec,
     aten.upsample_bicubic2d.vec,
+    aten.matmul.default,
 }

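Note on the set above: judging by its neighbors (upsample ops that already have dedicated TensorRT converters), this appears to be the group of decompositions that Torch-TensorRT disables so the op reaches its converter intact; the set's name is outside this hunk, so that reading is an inference. A small diagnostic sketch, using only stock PyTorch, for checking whether aten.matmul.default would otherwise be decomposed:

import torch
from torch._decomp import core_aten_decompositions

# Whether matmul.default shows up in PyTorch's stock decomposition table varies
# by version; the point of excluding it here appears to be keeping the whole
# matmul node visible to the TensorRT matmul converter instead of lowering it
# into smaller mm/bmm/view ops. Treat the printed result as informational.
table = core_aten_decompositions()
print(torch.ops.aten.matmul.default in table)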
py/torch_tensorrt/dynamo/lowering/_decompositions.py

Lines changed: 11 additions & 8 deletions
@@ -9,7 +9,6 @@
     _get_decomp_for_cia,
 )
 from torch._ops import OpOverload
-
 from torch_tensorrt.dynamo._defaults import default_device
 from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim
 from torch_tensorrt.dynamo.utils import to_torch_device
@@ -423,8 +422,8 @@ def instance_norm_decomposition(
 
 @register_torch_trt_decomposition(
     torch.ops.aten.full_like, registry=TORCH_TRT_DECOMPOSITIONS
-) # type: ignore
-def full_like_decomposition(*args, **kwargs) -> torch.Tensor:
+)
+def full_like_decomposition(*args: Any, **kwargs: Any) -> torch.Tensor:
     input = args[0]
     shape = args[0].shape
     fill_value = args[1]
@@ -454,11 +453,13 @@ def scaled_dot_product_attention_decomposition(
 ) -> torch.Tensor:
     L, S = query.size(-2), key.size(-2)
     device = query.device
-    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=device)
+
+    if is_causal or attn_mask is not None:
+        attn_bias = torch.zeros((L, S), dtype=query.dtype, device=device)
 
     if is_causal:
         assert attn_mask is None, "attn_mask must be None when is_causal=True"
-        temp_mask = torch.ones(L, S, dtype=torch.bool, device=device).tril(diagonal=0)
+        temp_mask = torch.ones((L, S), dtype=torch.bool, device=device).tril(diagonal=0)
         attn_bias = attn_bias.masked_fill(temp_mask.logical_not(), float("-inf"))
 
     if attn_mask is not None:
@@ -471,17 +472,19 @@ def scaled_dot_product_attention_decomposition(
     key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
     value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
 
-    attn_weight = query @ key.transpose(-2, -1)
+    attn_weight = torch.matmul(query, key.transpose(-2, -1))
 
     if scale is None:
         scale = torch.sqrt(torch.scalar_tensor(query.size(-1), dtype=torch.int))
         attn_weight = attn_weight / scale
     else:
         attn_weight = attn_weight * scale
 
-    attn_weight = attn_weight + attn_bias
+    if is_causal or attn_mask is not None:
+        attn_weight = attn_weight + attn_bias
+
     attn_weight = torch.softmax(attn_weight, dim=-1)
-    return attn_weight @ value
+    return torch.matmul(attn_weight, value)
 
 
 @register_torch_trt_decomposition(
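For readability, a condensed sketch of the decomposed SDPA math after the hunks above are applied: the (L, S) bias tensor is now only materialized and added when is_causal or an attn_mask can actually change the result, which trims the extra elementwise work from the common mask-free path. The attn_mask branch below is simplified to an additive float mask and the grouped-query repeat_interleave step is omitted, since those parts are not fully shown in this diff:

import torch

def sdpa_decomposition_sketch(query, key, value, attn_mask=None, is_causal=False, scale=None):
    # Condensed from the hunks above: build and add the (L, S) bias only when a
    # causal mask or an explicit attn_mask makes it non-zero.
    L, S = query.size(-2), key.size(-2)
    device = query.device

    if is_causal or attn_mask is not None:
        attn_bias = torch.zeros((L, S), dtype=query.dtype, device=device)
        if is_causal:
            assert attn_mask is None, "attn_mask must be None when is_causal=True"
            temp_mask = torch.ones((L, S), dtype=torch.bool, device=device).tril(diagonal=0)
            attn_bias = attn_bias.masked_fill(temp_mask.logical_not(), float("-inf"))
        if attn_mask is not None:
            # Assumption: attn_mask is an additive float mask; the real
            # decomposition's mask handling is not shown in this diff.
            attn_bias = attn_bias + attn_mask

    attn_weight = torch.matmul(query, key.transpose(-2, -1))
    if scale is None:
        attn_weight = attn_weight / torch.sqrt(
            torch.scalar_tensor(query.size(-1), dtype=torch.int)
        )
    else:
        attn_weight = attn_weight * scale

    if is_causal or attn_mask is not None:
        attn_weight = attn_weight + attn_bias
    attn_weight = torch.softmax(attn_weight, dim=-1)
    return torch.matmul(attn_weight, value)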
