Skip to content

Commit bb5ddb3

Browse files
reorganize
1 parent 22e34bd commit bb5ddb3

File tree

7 files changed

+150
-151
lines changed

7 files changed

+150
-151
lines changed

torchao/prototype/scaled_grouped_mm/kernels/benchmark.py renamed to torchao/prototype/scaled_grouped_mm/benchmarks/benchmark_kernels.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
triton_fp8_col_major_jagged_colwise_scales,
1919
triton_fp8_row_major_jagged_rowwise_scales,
2020
)
21-
from torchao.prototype.scaled_grouped_mm.test.utils import (
21+
from torchao.prototype.scaled_grouped_mm.utils import (
2222
_to_2d_jagged_float8_tensor_colwise,
2323
_to_2d_jagged_float8_tensor_rowwise,
2424
)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from torchao.prototype.scaled_grouped_mm.kernels.jagged_float8_scales import (
2+
triton_fp8_col_major_jagged_colwise_scales as triton_fp8_col_major_jagged_colwise_scales,
3+
)
4+
from torchao.prototype.scaled_grouped_mm.kernels.jagged_float8_scales import (
5+
triton_fp8_row_major_jagged_rowwise_scales as triton_fp8_row_major_jagged_rowwise_scales,
6+
)

torchao/prototype/scaled_grouped_mm/scaled_grouped_mm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from torchao.float8.config import ScalingGranularity
1212
from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
13-
from torchao.prototype.scaled_grouped_mm.kernels.jagged_float8_scales import (
13+
from torchao.prototype.scaled_grouped_mm.kernels import (
1414
triton_fp8_col_major_jagged_colwise_scales,
1515
triton_fp8_row_major_jagged_rowwise_scales,
1616
)
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111
triton_fp8_col_major_jagged_colwise_scales,
1212
triton_fp8_row_major_jagged_rowwise_scales,
1313
)
14-
from torchao.prototype.scaled_grouped_mm.test.utils import (
14+
from torchao.prototype.scaled_grouped_mm.utils import (
15+
_is_column_major,
1516
_to_2d_jagged_float8_tensor_colwise,
1617
_to_2d_jagged_float8_tensor_rowwise,
1718
)
18-
from torchao.prototype.scaled_grouped_mm.utils import _is_column_major
1919

2020

2121
@pytest.mark.parametrize("round_scales_to_power_of_2", [True, False])

torchao/prototype/scaled_grouped_mm/test/utils.py

Lines changed: 0 additions & 142 deletions
This file was deleted.

torchao/prototype/scaled_grouped_mm/utils.py

Lines changed: 140 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,146 @@
1-
# Copyright (c) Meta Platforms, Inc. and affiliates.
2-
# All rights reserved.
3-
#
4-
# This source code is licensed under the BSD 3-Clause license found in the
5-
# LICENSE file in the root directory of this source tree.
1+
from typing import Tuple
62

73
import torch
84

5+
from torchao.float8.config import ScalingGranularity
6+
from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
7+
8+
9+
def _to_2d_jagged_float8_tensor_colwise(
    A_col_major: torch.Tensor,
    offs: torch.Tensor,
    target_dtype: torch.dtype = torch.float8_e4m3fn,
    round_scales_to_power_of_2: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Convert a 2D tensor to a jagged float8 tensor with per-group columnwise scales.

    Groups are row-ranges of ``A_col_major`` delimited by ``offs``. For the right
    operand of a normal scaled GEMM, rowwise scales are taken along logical
    columns (a (K, N) tensor has scales of shape (1, N)). For a 2D right operand
    of a grouped GEMM, each logical column spans multiple distinct
    groups/subtensors, so one set of column scales cannot be applied to the
    whole tensor; instead scales are computed per group, giving a flat scale
    buffer of N * num_groups entries for a (K, N) input.

    Args:
        A_col_major: 2D input tensor; rows are partitioned into groups by ``offs``.
        offs: 1D tensor of cumulative row end-offsets, one per group.
        target_dtype: float8 dtype to convert each group to.
        round_scales_to_power_of_2: forwarded to ``tensor_to_scale``.

    Returns:
        Tuple of (float8 tensor with the same shape as the input, flat float32
        scale tensor of length ``A_col_major.size(1) * offs.numel()``).
    """
    assert A_col_major.ndim == 2, "A must be 2D"

    group_count = offs.numel()
    out_fp8 = torch.empty_like(A_col_major, dtype=target_dtype)
    out_scales = torch.empty(
        out_fp8.size(1) * group_count,
        dtype=torch.float32,
        device=out_fp8.device,
    )

    row_start = 0
    scale_start = 0
    for row_end in offs.tolist():
        # Next group: a contiguous band of rows, all columns.
        group = A_col_major[row_start:row_end, :]

        # Per-group scales along dim 0, i.e. one scale per logical column of
        # this group (the right-operand convention).
        group_scales = tensor_to_scale(
            group,
            target_dtype,
            scaling_granularity=ScalingGranularity.AXISWISE,
            axiswise_dim=0,
            round_scales_to_power_of_2=round_scales_to_power_of_2,
        )

        # Scale in float32, then saturate-cast to the float8 target dtype.
        out_fp8[row_start:row_end, :] = to_fp8_saturated(
            group.to(torch.float32) * group_scales, target_dtype
        )

        # Append this group's scales into the flat scale buffer.
        n_scales = group_scales.numel()
        out_scales[scale_start : scale_start + n_scales] = group_scales.squeeze()

        row_start = row_end
        scale_start += n_scales

    return out_fp8, out_scales
76+
77+
78+
def _to_2d_jagged_float8_tensor_rowwise(
    x: torch.Tensor,
    offs: torch.Tensor,
    target_dtype: torch.dtype,
    round_scales_to_power_of_2: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Convert a 2D tensor to a jagged float8 tensor with per-group rowwise scales.

    Groups are column-ranges of ``x`` delimited by ``offs``. For a 2D *left*
    operand of a normal scaled GEMM, rowwise scales are taken along logical
    rows (an (M, K) tensor has scales of shape (M, 1)). For a 2D left operand
    of a grouped GEMM, each logical row spans multiple distinct
    groups/subtensors, so one set of row scales cannot be applied to the whole
    tensor; instead scales are computed per group, giving a flat scale buffer
    of M * num_groups entries for an (M, K) input.

    Args:
        x: 2D input tensor; columns are partitioned into groups by ``offs``.
        offs: 1D tensor of cumulative column end-offsets, one per group.
        target_dtype: float8 dtype to convert each group to.
        round_scales_to_power_of_2: forwarded to ``tensor_to_scale``.

    Returns:
        Tuple of (float8 tensor with the same shape as the input, flat float32
        scale tensor of length ``x.size(0) * offs.numel()``).
    """
    assert x.ndim == 2, "input tensor must be 2D"

    group_count = offs.numel()
    out_fp8 = torch.empty_like(x, dtype=target_dtype)
    out_scales = torch.empty(
        out_fp8.size(0) * group_count, dtype=torch.float32, device=out_fp8.device
    )

    col_start = 0
    scale_start = 0
    for col_end in offs.tolist():
        # Next group: all rows, a contiguous band of columns.
        group = x[:, col_start:col_end]

        # Per-group scales along the last dim, i.e. one scale per logical row
        # of this group (the left-operand convention).
        group_scales = tensor_to_scale(
            group,
            target_dtype,
            scaling_granularity=ScalingGranularity.AXISWISE,
            axiswise_dim=-1,
            round_scales_to_power_of_2=round_scales_to_power_of_2,
        )

        # Scale in float32, then saturate-cast to the float8 target dtype.
        out_fp8[:, col_start:col_end] = to_fp8_saturated(
            group.to(torch.float32) * group_scales, target_dtype
        )

        # Append this group's scales into the flat scale buffer.
        n_scales = group_scales.numel()
        out_scales[scale_start : scale_start + n_scales] = group_scales.squeeze()

        col_start = col_end
        scale_start += n_scales

    return out_fp8, out_scales
143+
9144

10145
def _is_column_major(x: torch.Tensor) -> bool:
11146
"""

0 commit comments

Comments
 (0)