
Commit a78fc11

Fix Float8Tensor quantize op kernel preference dispatch
Summary:
Previously we didn't handle kernel_preference == "fbgemm" properly for the quantize op. This PR makes sure we dispatch to the fbgemm kernels when kernel_preference is fbgemm.

This has little impact on BC: serialized checkpoints use AUTO, which still dispatches to the triton op for quantize. The only change is fixing the kernel choice for the fbgemm kernel preference, which is meant to be a developer-facing API (we expect most users to just use AUTO without worrying about the details).

Test Plan:
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_kernel_preference_numerical_equivalence

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2883, branch: jerryzh168/stack/59
1 parent 3bf21d0 commit a78fc11
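
For context, a minimal sketch of the developer-facing path this commit fixes, in the spirit of the numerical-equivalence test named in the Test Plan. The import paths for Float8Tensor and KernelPreference are assumed from the file layout in this diff, and the snippet needs an SM 9.0+ GPU with fbgemm_gpu_genai installed for the FBGEMM preference:

import torch

from torchao.quantization.granularity import PerRow
# Assumed import paths, following the files touched in this PR.
from torchao.quantization.quantize_.common import KernelPreference
from torchao.quantization.quantize_.workflows.float8.float8_tensor import Float8Tensor

x = torch.randn(32, 64, dtype=torch.bfloat16, device="cuda")

# Quantize the same tensor under each kernel preference; with this fix,
# FBGEMM actually routes the quantize op to the fbgemm kernels.
results = {}
for pref in (KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM):
    qt = Float8Tensor.from_hp(
        x,
        float8_dtype=torch.float8_e4m3fn,
        granularity=PerRow(),
        kernel_preference=pref,
    )
    results[pref] = qt.dequantize()

# The three preferences should be numerically equivalent (up to fp8 rounding).
baseline = results[KernelPreference.TORCH]
for pref, deq in results.items():
    torch.testing.assert_close(deq, baseline, atol=1e-1, rtol=1e-1)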

File tree: 5 files changed, 59 additions and 15 deletions

test/quantization/quantize_/workflows/float8/test_float8_tensor.py

Lines changed: 1 addition & 1 deletion

@@ -63,7 +63,7 @@ def setUp(self):
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
     @common_utils.parametrize("mode", ["dynamic", "weight-only"])
     @common_utils.parametrize("compile", [True, False])
-    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    @common_utils.parametrize("granularity", [PerTensor()])
     @common_utils.parametrize(
         "kernel_preference",
         [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM],

test/quantization/test_qat.py

Lines changed: 1 addition & 1 deletion

@@ -1859,7 +1859,7 @@ def test_float8_fake_quantize(self, granularity: Granularity):
         torch.manual_seed(self.SEED)
         x = torch.randn(32, 64)
         out = fake_quantizer(x)
-        out_expected = Float8Tensor.to_float8(x, dtype, granularity).dequantize()
+        out_expected = Float8Tensor.from_hp(x, dtype, granularity).dequantize()
         sqnr = compute_error(out, out_expected)
         self.assertGreater(sqnr, 16)

torchao/quantization/quant_api.py

Lines changed: 2 additions & 2 deletions

@@ -1568,7 +1568,7 @@ def _float8_weight_only_quant_tensor(weight, config):
     else:
         assert config.version == 2, f"Unexpected version: {config.version}"
         weight_dtype = config.weight_dtype
-        new_weight = Float8Tensor.to_float8(
+        new_weight = Float8Tensor.from_hp(
             weight, float8_dtype=weight_dtype, granularity=PerRow()
         )
         return new_weight

@@ -1766,7 +1766,7 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
         kernel_preference=kernel_preference,
     )

-    quantized_weight = Float8Tensor.to_float8(
+    quantized_weight = Float8Tensor.from_hp(
         weight,
         float8_dtype=weight_dtype,
         granularity=weight_granularity,
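
For reference, these private helpers back the public configs, so the end-user path looks roughly like the sketch below. This assumes Float8DynamicActivationFloat8WeightConfig exposes the kernel_preference field that _float8_dynamic_activation_float8_weight_quantize_tensor reads above; most users would leave it at the AUTO default.

import torch

from torchao.quantization import quantize_
from torchao.quantization.granularity import PerRow
from torchao.quantization.quant_api import Float8DynamicActivationFloat8WeightConfig
# Assumed import path for the developer-facing kernel preference enum.
from torchao.quantization.quantize_.common import KernelPreference

model = torch.nn.Sequential(torch.nn.Linear(64, 128)).to(torch.bfloat16).cuda()

# kernel_preference is a developer-facing knob; AUTO is the default and what
# serialized checkpoints use, FBGEMM is the preference whose quantize dispatch
# this commit fixes.
quantize_(
    model,
    Float8DynamicActivationFloat8WeightConfig(
        granularity=PerRow(),
        kernel_preference=KernelPreference.FBGEMM,
    ),
)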

torchao/quantization/quantize_/common/quantize_tensor_kwargs.py

Lines changed: 2 additions & 2 deletions

@@ -22,7 +22,7 @@ class QuantizeTensorKwargs(abc.ABC):

    class Float8Tensor(...)
      @classmethod
-     def to_float8(cls, tensor, quant_kwargs: QuantizeTensorKwargs)
+     def from_hp(cls, tensor, quant_kwargs: QuantizeTensorKwargs)
        ...
    """

@@ -43,7 +43,7 @@ def _choose_quant_func_and_quantize_tensor(
     )

     if isinstance(quant_kwargs, QuantizeTensorToFloat8Kwargs):
-        return Float8Tensor.to_float8(
+        return Float8Tensor.from_hp(
             tensor,
             quant_kwargs.float8_dtype,
             quant_kwargs.granularity,

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 53 additions & 9 deletions

@@ -22,7 +22,7 @@
     preprocess_data,
     preprocess_scale,
 )
-from torchao.quantization.granularity import PerRow
+from torchao.quantization.granularity import PerRow, PerTensor
 from torchao.quantization.observer import get_block_size
 from torchao.quantization.quant_primitives import (
     _choose_scale_float8,

@@ -163,7 +163,7 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
         return _dequantize_affine_float8(qdata, scale, output_dtype)

     @classmethod
-    def to_float8(
+    def from_hp(
         cls,
         hp_tensor: torch.Tensor,
         float8_dtype: torch.dtype = torch.float8_e4m3fn,

@@ -177,18 +177,30 @@ def to_float8(
         block_size = get_block_size(hp_tensor.shape, granularity)
         block_size = list(block_size)

-        # for per row quantization and kernel_preference default setting, we'll use triton kernel for best performance
+        kernel_choice = None
         if (
             kernel_preference == KernelPreference.AUTO
             and _is_fbgemm_genai_gpu_available()
-            and (
-                tuple(block_size)
-                == (1,) * (hp_tensor.ndim - 1) + (hp_tensor.shape[-1],)
-            )
+            and is_sm_at_least_90()
+            and isinstance(granularity, PerRow)
+            and float8_dtype == torch.float8_e4m3fn
+            and hp_value_lb is None
         ):
-            assert float8_dtype == torch.float8_e4m3fn, (
-                f"Only torch.float8_e4m3fn is supported, got: {float8_dtype}"
+            # for per row quantization and kernel_preference auto setting
+            # we'll use triton quantize kernel for best performance
+            kernel_choice = "triton"
+        elif kernel_preference == KernelPreference.FBGEMM and hp_value_lb is None:
+            # we'll use fbgemm quantize kernel if it's explicitly chosen by user
+            assert _is_fbgemm_genai_gpu_available() and is_sm_at_least_90(), (
+                "Specified fbgemm but fbgemm_gpu_genai is not installed or hardware is not >= SM 9.0 (> H100)"
             )
+            kernel_choice = "fbgemm"
+        else:
+            # fallback quantize kernel for everything else will be torch
+            kernel_choice = "torch"
+
+        if kernel_choice == "triton":
+            assert hp_value_lb is None, f"{hp_value_lb=} is not supported"
         if hp_value_ub is not None:
             maybe_hp_value_ub_tensor = torch.tensor(
                 hp_value_ub, dtype=torch.float, device=hp_tensor.device

@@ -202,7 +214,39 @@ def to_float8(
             for i in range(hp_tensor.ndim):
                 scale_shape.append(hp_tensor.shape[i] // block_size[i])
             scale = scale.reshape(*scale_shape)
+        elif kernel_choice == "fbgemm":
+            assert hp_value_lb is None, f"{hp_value_lb=} is not supported"
+            if hp_value_ub is not None:
+                maybe_hp_value_ub_tensor = torch.tensor(
+                    hp_value_ub, dtype=torch.float, device=hp_tensor.device
+                )
+            else:
+                maybe_hp_value_ub_tensor = None
+            # not used
+            num_tokens = torch.empty([hp_tensor.size(0)], device=hp_tensor.device)
+            if isinstance(granularity, PerRow):
+                data, scale = torch.ops.fbgemm.quantize_fp8_per_row(
+                    hp_tensor, num_tokens, scale_ub=maybe_hp_value_ub_tensor
+                )
+            else:
+                assert isinstance(granularity, PerTensor), (
+                    f"Expected per tensor, got {granularity}"
+                )
+                # TODO: use fbgemm kernel when it works
+                # current error: torch.AcceleratorError: CUDA error: an illegal memory access was encountered
+                # data, scale = torch.ops.fbgemm.quantize_fp8_per_tensor(
+                #     hp_tensor, num_tokens, scale_ub=maybe_hp_value_ub_tensor
+                # )
+                scale = _choose_scale_float8(
+                    hp_tensor,
+                    float8_dtype=float8_dtype,
+                    block_size=block_size,
+                    hp_value_lb=hp_value_lb,
+                    hp_value_ub=hp_value_ub,
+                )
+                data = _quantize_affine_float8(hp_tensor, scale, float8_dtype)
         else:
+            assert kernel_choice == "torch", f"Expected torch, got {kernel_choice}"
             scale = _choose_scale_float8(
                 hp_tensor,
                 float8_dtype=float8_dtype,
