
Commit a56882d

cthi authored and facebook-github-bot committed
Enable USE_FBGEMM_GENAI (#4703)
Summary:
X-link: pytorch/pytorch#160676
Pull Request resolved: #4703
X-link: facebookresearch/FBGEMM#1728

In this diff we enable support for the new FBGEMM-backed FP8 `torch._scaled_grouped_mm` on ROCm. For now we only enable support for `gfx942`, as that is the architecture on which we have thoroughly tested performance and correctness.

Reviewed By: drisspg

Differential Revision: D79564024

fbshipit-source-id: bf2aa1a3eee43d0e47e9ba1e5514152e502da35f
1 parent 17ee9d0 commit a56882d
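For orientation, here is a minimal, untested sketch of the FP8 rowwise grouped GEMM path this commit enables. The call mirrors the compute() method added in quantize_ops.py below; the tensor shapes, the FP8 dtype, the offsets layout, and the quantization itself are illustrative assumptions rather than part of this diff (for example, on gfx942 the FP8 format may be float8_e4m3fnuz rather than float8_e4m3fn).

import torch

# Illustrative sizes (assumptions): G groups stacked along the M dimension.
G, M, K, N = 2, 64, 128, 256
device = "cuda"  # on ROCm this maps to a gfx942 (MI300-class) device

x = torch.randn(G * M, K, device=device)  # stacked activations
w = torch.randn(G, N, K, device=device)   # per-group weights

# Rowwise FP8 quantization sketch; dtype and scale layout are assumptions.
fp8 = torch.float8_e4m3fn
fp8_max = torch.finfo(fp8).max
x_scale = x.abs().amax(dim=1).clamp(min=1e-4) / fp8_max  # shape (G*M,)
w_scale = w.abs().amax(dim=2).clamp(min=1e-4) / fp8_max  # shape (G, N)
xq = (x / x_scale[:, None]).to(fp8)
wq = (w / w_scale[:, :, None]).to(fp8)

# Cumulative row offsets marking where each group ends in the stacked M dim.
offsets = torch.arange(1, G + 1, dtype=torch.int32, device=device) * M

# Same call pattern as ScaledGroupedMMRowwise.compute() in the diff below.
out = torch._scaled_grouped_mm(
    xq,
    wq.transpose(-2, -1),
    offs=offsets,
    out_dtype=torch.bfloat16,
    scale_a=x_scale,
    scale_b=w_scale,
    scale_result=None,
    use_fast_accum=True,
)
print(out.shape)  # expected: (G * M, N) in bfloat16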

File tree

2 files changed: +43, -1 lines

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 43 additions & 0 deletions
@@ -1263,6 +1263,7 @@ def quantize(self, x, wq, w_scale, m_sizes):
         out = torch.empty(
             (xq.shape[0], wq.shape[1]), dtype=torch.bfloat16, device=xq.device
         )
+        x_scale = x_scale.view(x_scale.shape[0])
         return xq, wq, x_scale, w_scale, offsets, out
 
     def compute(self, xq, wq, x_scale, w_scale, offsets, out):
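The one functional change in this hunk is the added x_scale = x_scale.view(x_scale.shape[0]) line: the rowwise activation scale is flattened to a 1-D vector before being returned. Presumably the FBGEMM-backed torch._scaled_grouped_mm path expects a flat per-row scale_a rather than an (M, 1) column; that reading is an inference from this change, not something stated in the commit. A hypothetical illustration:

import torch

x_scale = torch.ones(1024, 1)             # assumed (M, 1) rowwise scale from quantization
x_scale = x_scale.view(x_scale.shape[0])  # -> shape (1024,), one scale per row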
@@ -1287,6 +1288,48 @@ def cuda(self) -> bool:
         return False
 
 
+@register_quantize_op
+class ScaledGroupedMMRowwise(FP8StackedGroupedGemmTorch):
+    def __init__(self):
+        self.fast_accum = True
+        self.torch_compile = False
+
+    def compute(self, xq, wq, x_scale, w_scale, offsets, _):
+        if self.torch_compile:
+            f = torch.compile(
+                torch._scaled_grouped_mm,
+                options={
+                    "max_autotune": True,
+                    "max_autotune_gemm_backends": "TRITON,CK,CUTLASS,ATEN",
+                },
+            )
+        else:
+            f = torch._scaled_grouped_mm
+
+        return f(
+            xq,
+            wq.transpose(-2, -1),
+            offs=offsets,
+            out_dtype=torch.bfloat16,
+            scale_a=x_scale,
+            scale_b=w_scale,
+            scale_result=None,
+            use_fast_accum=self.fast_accum,
+        )
+
+    @property
+    def name(self) -> str:
+        return "scaled_grouped_mm_rowwise"
+
+    @property
+    def hip(self) -> bool:
+        return True
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
 @register_quantize_op
 class FP8StackedGroupwiseGroupedGemm(QuantizeOpBase):
     """

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip

Lines changed: 0 additions & 1 deletion
@@ -12,7 +12,6 @@
 #include <tuple>
 
 #include <ATen/core/Tensor.h>
-#include <ATen/hip/HIPContext.h>
 #include <c10/hip/HIPStream.h>
 
 #include "ck/ck.hpp"
