Commit 1955a40

add triton kernels for float8 quantization with jagged rowwise scales
lint
update docstrings
add bench script
add bench script
bench against compile
comment clean up
fix masks
lint
integrate triton kernels into scaled grouped mm
lint
1 parent 9af2a45 commit 1955a40

File tree

8 files changed: +303 additions, -144 deletions
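For orientation: "jagged rowwise scales" means that a 2D tensor whose columns are split into groups by an offsets tensor gets a separate set of rowwise float8 scales per group, rather than one set of scales for the whole tensor. The snippet below is a simplified pure-PyTorch sketch written for this summary; it is not the Triton kernels added by this commit, and the helper name and the clamp epsilon are made up. The actual reference helpers are the _to_2d_jagged_float8_tensor_* functions this commit moves to test/utils.py.

import torch

def jagged_rowwise_float8_sketch(x, offs, float8_dtype=torch.float8_e4m3fn):
    # x: (M, K) high-precision tensor; offs: 1D int tensor of group end columns.
    # For each column group, compute a per-row amax and scale, then cast to float8.
    assert x.ndim == 2
    fp8_max = torch.finfo(float8_dtype).max
    x_fp8 = torch.empty_like(x, dtype=float8_dtype)
    scales = []
    start = 0
    for end in offs.tolist():
        group = x[:, start:end]                                # (M, group_size)
        amax = group.abs().amax(dim=-1, keepdim=True).float()  # rowwise amax within the group
        scale = fp8_max / torch.clamp(amax, min=1e-12)         # one scale per row, per group
        x_fp8[:, start:end] = (group.float() * scale).clamp(-fp8_max, fp8_max).to(float8_dtype)
        scales.append(scale)
        start = end
    # scales: (M * num_groups, 1), one rowwise scale block per group
    return x_fp8, torch.cat(scales, dim=0)

x = torch.randn(8, 12, dtype=torch.bfloat16)
offs = torch.tensor([4, 8, 12], dtype=torch.int32)
x_fp8, scales = jagged_rowwise_float8_sketch(x, offs)
print(x_fp8.shape, scales.shape)  # torch.Size([8, 12]) torch.Size([24, 1])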
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py

import itertools
import time
from dataclasses import dataclass
from typing import List

import torch
from tabulate import tabulate
from tqdm import tqdm

from torchao.prototype.scaled_grouped_mm import _scaled_grouped_mm

device = torch.device("cuda")

# Needed since changing args to function causes recompiles
torch._dynamo.config.cache_size_limit = 1000


@dataclass(frozen=True)
class ExperimentConfig:
    high_precision_dtype: torch.dtype
    A_shape: tuple[int]
    B_shape: tuple[int]


@dataclass(frozen=True)
class ExperimentResult:
    time_us: float


@dataclass(frozen=True)
class Experiment:
    config: ExperimentConfig
    result: ExperimentResult


def get_configs() -> List[ExperimentConfig]:
    A_shapes = [(2**8, 4096), (2**12, 4096), (2**16, 4096)]
    B_shapes = [(4, 4096, 4096), (8, 4096, 4096), (16, 4096, 4096)]
    high_precision_dtypes = [torch.bfloat16]
    configs = []
    for A_shape, B_shape, high_precision_dtype in itertools.product(
        A_shapes, B_shapes, high_precision_dtypes
    ):
        configs.append(
            ExperimentConfig(
                A_shape=A_shape,
                B_shape=B_shape,
                high_precision_dtype=high_precision_dtype,
            )
        )
    return configs


def run_experiment(config: ExperimentConfig) -> ExperimentResult:
    # define test inputs
    A = torch.randn(
        *config.A_shape,
        dtype=config.high_precision_dtype,
        device=device,
        requires_grad=True,
    )
    B_t = torch.randn(
        *config.B_shape,
        dtype=config.high_precision_dtype,
        device=device,
        requires_grad=True,
    ).transpose(-2, -1)

    # - configure input to be row-major with groups divided along the column dimension,
    #   representing the left operand of grad_weight = grad_output_t @ input
    #   that occurs in the backward pass of the differentiable scaled grouped mm.
    # - the transposed tensor in col-major format with groups along the row dimension,
    #   which represents the right operand.
    n_groups = config.B_shape[0]
    group_size = A.shape[0] // n_groups
    offs = torch.arange(
        group_size,
        group_size * n_groups + 1,
        group_size,
        device=device,
        dtype=torch.int32,
    )

    def warmup(func, *args, **kwargs):
        for _ in range(10):
            func(*args, **kwargs)

    def forward_backward(A, B_t, offs):
        out = _scaled_grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)
        out.sum().backward()

    # bench triton
    warmup(forward_backward, A, B_t, offs)
    start_time_ns = time.perf_counter_ns()
    forward_backward(A, B_t, offs)
    time_ns = time.perf_counter_ns() - start_time_ns
    time_us = time_ns / 1e3

    return ExperimentResult(time_us=time_us)


def print_results(experiments: List[Experiment]):
    headers = [
        "A_shape",
        "B_shape",
        "high_precision_dtype",
        "time_us",
    ]
    rows = []
    for experiment in experiments:
        A_shape = f"({experiment.config.A_shape[0]}, {experiment.config.A_shape[1]})"
        B_shape = f"({experiment.config.B_shape[0]}, {experiment.config.B_shape[1]}, {experiment.config.B_shape[2]})"
        rows.append(
            [
                A_shape,
                B_shape,
                experiment.config.high_precision_dtype,
                experiment.result.time_us,
            ]
        )
    print(tabulate(rows, headers=headers))


def main():
    torch.random.manual_seed(123)
    configs = get_configs()
    results = []
    for config in tqdm(configs):
        result = run_experiment(config)
        results.append(Experiment(config=config, result=result))

    # Use Tabulate to print results
    print_results(results)


if __name__ == "__main__":
    main()
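As a quick illustration of the group layout the new script constructs (this snippet is mine, not part of the commit): for the smallest config, A has shape (2**8, 4096) and B has 4 groups, so each group owns 64 consecutive rows of A and offs stores each group's end row.

import torch

n_groups, total_rows = 4, 2**8          # B_shape[0] and A_shape[0] from the smallest config
group_size = total_rows // n_groups     # 64 rows per group
offs = torch.arange(group_size, group_size * n_groups + 1, group_size, dtype=torch.int32)
print(offs.tolist())                    # [64, 128, 192, 256]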

torchao/prototype/scaled_grouped_mm/kernels/benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
     triton_fp8_col_major_jagged_colwise_scales,
     triton_fp8_row_major_jagged_rowwise_scales,
 )
-from torchao.prototype.scaled_grouped_mm.scaled_grouped_mm import (
+from torchao.prototype.scaled_grouped_mm.test.utils import (
     _to_2d_jagged_float8_tensor_colwise,
     _to_2d_jagged_float8_tensor_rowwise,
 )

torchao/prototype/scaled_grouped_mm/kernels/jagged_float8_scales.py

Lines changed: 6 additions & 2 deletions
@@ -160,7 +160,9 @@ def _triton_fp8_row_major_jagged_rowwise_scales(
         data = tl.load(input_ptr + block_offs, mask=block_mask, other=0.0).to(
             input_dtype
         )
-        amax_buffer = tl.maximum(amax_buffer, tl.max(tl.abs(data), axis=1))
+        # we need to cast back to input dtype since triton promotes bf16 to fp32:
+        # https://github.com/triton-lang/triton/blob/981e987eed9053b952f81153bc0779c99d8c642e/python/triton/language/standard.py#L173
+        amax_buffer = tl.maximum(amax_buffer, tl.max(tl.abs(data), axis=1)).to(input_dtype)
 
     # compute rowwise scales for this group. round scales to nearest power of 2.
     amax_buffer = amax_buffer.to(tl.float64)
@@ -317,7 +319,9 @@ def _triton_fp8_col_major_jagged_colwise_scales(
         data = tl.load(input_ptr + block_offs, mask=block_mask, other=0.0).to(
             input_dtype
         )
-        amax_buffer = tl.maximum(amax_buffer, tl.max(tl.abs(data), axis=0))
+        # we need to cast back to input dtype since triton promotes bf16 to fp32:
+        # https://github.com/triton-lang/triton/blob/981e987eed9053b952f81153bc0779c99d8c642e/python/triton/language/standard.py#L173
+        amax_buffer = tl.maximum(amax_buffer, tl.max(tl.abs(data), axis=0)).to(input_dtype)
 
     # compute rowwise scales for this group.
     amax_buffer = amax_buffer.to(tl.float64)
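The new comment points at Triton's reduction helpers promoting bfloat16 to float32; casting the result back keeps the loop-carried amax accumulator at a stable dtype across iterations (my reading of the change). Below is a small standalone Triton sketch, written for this summary rather than taken from the repo, that shows the same promote-then-cast-back pattern.

import torch
import triton
import triton.language as tl

@triton.jit
def amax_promotion_demo(x_ptr, out_ptr, N: tl.constexpr):
    offs = tl.arange(0, N)
    x = tl.load(x_ptr + offs)                 # bfloat16 block
    amax = tl.max(tl.abs(x), axis=0)          # the reduction result is promoted to float32
    tl.store(out_ptr, amax.to(tl.bfloat16))   # cast back to the input dtype, as the kernels now do

x = torch.randn(128, dtype=torch.bfloat16, device="cuda")
out = torch.empty(1, dtype=torch.bfloat16, device="cuda")
amax_promotion_demo[(1,)](x, out, N=128)
print(out.item(), x.abs().max().item())  # both report the same amax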

torchao/prototype/scaled_grouped_mm/kernels/test_jagged_float8_scales.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
     triton_fp8_col_major_jagged_colwise_scales,
     triton_fp8_row_major_jagged_rowwise_scales,
 )
-from torchao.prototype.scaled_grouped_mm.scaled_grouped_mm import (
+from torchao.prototype.scaled_grouped_mm.test.utils import (
     _to_2d_jagged_float8_tensor_colwise,
     _to_2d_jagged_float8_tensor_rowwise,
 )

torchao/prototype/scaled_grouped_mm/scaled_grouped_mm.py

Lines changed: 9 additions & 140 deletions
@@ -10,6 +10,10 @@
 
 from torchao.float8.config import ScalingGranularity
 from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
+from torchao.prototype.scaled_grouped_mm.kernels.jagged_float8_scales import (
+    triton_fp8_col_major_jagged_colwise_scales,
+    triton_fp8_row_major_jagged_rowwise_scales,
+)
 from torchao.prototype.scaled_grouped_mm.utils import _is_column_major
 
 
@@ -189,17 +193,18 @@ def backward(ctx, grad_output: torch.Tensor):
         # grad_B is a special case. both operands of the grouped gemm will be 2D with offsets determing the "groups."
         # Compute scales for grad_output_t and A, which are both 2D tensors with offsets which define the "jagged" groups.
         grad_output_t_fp8_row_major, grad_output_t_scales = (
-            _to_2d_jagged_float8_tensor_rowwise(
+            triton_fp8_row_major_jagged_rowwise_scales(
                 grad_output_t_row_major,
                 offs,
-                target_dtype=torch.float8_e4m3fn,
+                output_dtype=torch.float8_e4m3fn,
                 round_scales_to_power_of_2=True,
             )
         )
-        A_fp8_col_major, A_scales = _to_2d_jagged_float8_tensor_colwise(
+
+        A_fp8_col_major, A_scales = triton_fp8_col_major_jagged_colwise_scales(
             A_col_major,
             offs,
-            target_dtype=torch.float8_e4m3fn,
+            output_dtype=torch.float8_e4m3fn,
             round_scales_to_power_of_2=True,
         )
 
@@ -216,139 +221,3 @@ def backward(ctx, grad_output: torch.Tensor):
             use_fast_accum=True,
         )
         return grad_A, grad_B.transpose(-2, -1), None, None, None, None
-
-
-def _to_2d_jagged_float8_tensor_colwise(
-    A_col_major: torch.Tensor,
-    offs: torch.Tensor,
-    target_dtype: torch.dtype = torch.float8_e4m3fn,
-    round_scales_to_power_of_2: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    This function converts the 2D input tensor A to a jagged float8 tensor,
-    with scales computed along *logical columns* for each group individually,
-    where groups are determined based on the offsets.
-
-    For the right operand of a normal scaled GEMM, the rowwise scales are computed over logical columns.
-    (i.e., a tensor of (K,N) will have scales of shape (1,N).
-
-    However, for a 2D right operand of a grouped GEMM, these logical columns go through multiple distinct
-    groups/subtensors, for which we want to compute scales individually. So we cannot take one set of scales
-    along the logical columns and apply it to the entire tensor.
-
-    Instead, we need to compute scales for each subtensor individually. For a tensor of shape (K,N) this results
-    in scales of shape (1,N * num_groups).
-
-    Args:
-        A (torch.Tensor): The input tensor to be converted to a jagged float8 tensor.
-
-    Returns:
-        A tuple containing the jagged float8 tensor and the scales used for the conversion.
-    """
-    assert A_col_major.ndim == 2, "A must be 2D"
-
-    num_groups = offs.numel()
-    A_fp8_col_major = torch.empty_like(A_col_major, dtype=target_dtype)
-    A_scales = torch.empty(
-        A_fp8_col_major.size(1) * num_groups,
-        dtype=torch.float32,
-        device=A_fp8_col_major.device,
-    )
-
-    start_idx = 0
-    next_scale_idx = 0
-    for end_idx in offs.tolist():
-        # Get the subtensor of A for this group, fetching the next group of rows, with all columns for each.
-        subtensor = A_col_major[start_idx:end_idx, :]  # (local_group_size, K)
-
-        # Compute local rowwise scales for this subtensor, which are along logical columns for the right operand.
-        subtensor_scales = tensor_to_scale(
-            subtensor,
-            target_dtype,
-            scaling_granularity=ScalingGranularity.AXISWISE,
-            axiswise_dim=0,
-            round_scales_to_power_of_2=round_scales_to_power_of_2,
-        )
-
-        # Apply scales to subtensor and convert to float8.
-        tensor_scaled = subtensor.to(torch.float32) * subtensor_scales
-        float8_subtensor = to_fp8_saturated(tensor_scaled, target_dtype)
-
-        # Store this portion of the resulting float8 tensor and scales.
-        A_fp8_col_major[start_idx:end_idx, :] = float8_subtensor
-        A_scales[next_scale_idx : next_scale_idx + subtensor_scales.numel()] = (
-            subtensor_scales.squeeze()
-        )
-
-        # Update start index for next group.
-        start_idx = end_idx
-        next_scale_idx += subtensor_scales.numel()
-
-    return A_fp8_col_major, A_scales
-
-
-def _to_2d_jagged_float8_tensor_rowwise(
-    x: torch.Tensor,
-    offs: torch.Tensor,
-    target_dtype: torch.dtype,
-    round_scales_to_power_of_2: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    This function converts the 2D input tensor to a jagged float8 tensor,
-    with scales computed along *logical rows* for each group individually,
-    where groups are determined based on the offsets.
-
-    For a 2D *left* operand of a normal scaled GEMM, the rowwise scales are computed over logical rows.
-    (i.e., a tensor of (M,K) will have scales of shape (M,1).
-
-    However, for a 2D left operand of a grouped GEMM, these logical rows go through multiple distinct
-    groups/subtensors, for which we want to compute scales individually. So we cannot take one set of scales
-    along the logical rows and apply it to the entire tensor.
-
-    Instead, we need to compute scales for each subtensor individually. For a tensor of shape (M,K) this results
-    in scales of shape (M * num_groups, 1).
-
-    Args:
-        A (torch.Tensor): The input tensor to be converted to a jagged float8 tensor.
-
-    Returns:
-        A tuple containing the jagged float8 tensor and the scales used for the conversion.
-    """
-    assert x.ndim == 2, "input tensor must be 2D"
-
-    num_groups = offs.numel()
-    x_fp8 = torch.empty_like(x, dtype=target_dtype)
-    x_scales = torch.empty(
-        x_fp8.size(0) * num_groups, dtype=torch.float32, device=x_fp8.device
-    )
-
-    start_idx = 0
-    next_scale_idx = 0
-    for end_idx in offs.tolist():
-        # Get the subtensor of A for this group, fetching all rows with the next group of rows.
-        subtensor = x[:, start_idx:end_idx]  # (M, local_group_size)
-
-        # Compute local rowwise scales for this subtensor, which are along logical rows for the left operand.
-        subtensor_scales = tensor_to_scale(
-            subtensor,
-            target_dtype,
-            scaling_granularity=ScalingGranularity.AXISWISE,
-            axiswise_dim=-1,
-            round_scales_to_power_of_2=round_scales_to_power_of_2,
-        )
-
-        # Apply scales to subtensor and convert to float8.
-        tensor_scaled = subtensor.to(torch.float32) * subtensor_scales
-        float8_subtensor = to_fp8_saturated(tensor_scaled, target_dtype)
-
-        # Store this portion of the resulting float8 tensor and scales.
-        x_fp8[:, start_idx:end_idx] = float8_subtensor
-        x_scales[next_scale_idx : next_scale_idx + subtensor_scales.numel()] = (
-            subtensor_scales.squeeze()
-        )
-
-        # Update start index for next group.
-        start_idx = end_idx
-        next_scale_idx += subtensor_scales.numel()
-
-    return x_fp8, x_scales
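Since the backward pass now calls the Triton kernels with the same positional inputs but an output_dtype keyword instead of target_dtype, one way to sanity-check the swap is to compare a kernel against the reference helper this commit moves to test/utils.py. The snippet below is a hedged sketch along those lines, not the repo's actual test in test_jagged_float8_scales.py; the shapes and the use of torch.testing.assert_close with default tolerances are my assumptions, while the function names and keyword arguments come from the diffs above.

import torch
from torchao.prototype.scaled_grouped_mm.kernels.jagged_float8_scales import (
    triton_fp8_row_major_jagged_rowwise_scales,
)
from torchao.prototype.scaled_grouped_mm.test.utils import (
    _to_2d_jagged_float8_tensor_rowwise,
)

# Row-major input whose columns are split into groups by offs, as in the backward pass.
x = torch.randn(1024, 256, dtype=torch.bfloat16, device="cuda")
offs = torch.arange(64, 256 + 1, 64, device="cuda", dtype=torch.int32)

ref_fp8, ref_scales = _to_2d_jagged_float8_tensor_rowwise(
    x, offs, target_dtype=torch.float8_e4m3fn, round_scales_to_power_of_2=True
)
tri_fp8, tri_scales = triton_fp8_row_major_jagged_rowwise_scales(
    x, offs, output_dtype=torch.float8_e4m3fn, round_scales_to_power_of_2=True
)

# Compare float8 data (upcast for comparison) and the per-group rowwise scales.
torch.testing.assert_close(tri_fp8.float(), ref_fp8.float())
torch.testing.assert_close(tri_scales, ref_scales)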

torchao/prototype/scaled_grouped_mm/test/__init__.py

Whitespace-only changes.
