[moe training] use smaller block sizes for per-group scaling kernels to improve perf

danielvegamyhre · danielvegamyhre · commit 03c179b3d849 · 2025-08-02T14:38:53.000-07:00
diff --git a/torchao/prototype/moe_training/benchmarks/benchmark_kernels.py b/torchao/prototype/moe_training/benchmarks/benchmark_kernels.py
@@ -6,13 +6,13 @@
 # this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
 
 import itertools
-import time
 from dataclasses import dataclass
 from typing import List
 
 import torch
 from tabulate import tabulate
 from tqdm import tqdm
+from triton.testing import do_bench
 
 from torchao.prototype.moe_training.kernels.jagged_float8_scales import (
     triton_fp8_col_major_jagged_colwise_scales,
@@ -129,18 +129,15 @@ def run_triton(
 
     # bench torch
     compiled_run_torch = torch.compile(run_torch)
-    warmup(compiled_run_torch, input_row_major, input_col_major, offs)
-    start_time_ns = time.perf_counter_ns()
-    compiled_run_torch(input_row_major, input_col_major, offs)
-    torch_time_ns = time.perf_counter_ns() - start_time_ns
-    torch_time_us = torch_time_ns / 1e3
+    torch_time_us = benchmark_cuda_function_in_microseconds(
+        compiled_run_torch, input_row_major, input_col_major, offs
+    )
 
     # bench triton
     warmup(run_triton, input_row_major, input_col_major, offs)
-    start_time_ns = time.perf_counter_ns()
-    run_triton(input_row_major, input_col_major, offs)
-    triton_time_ns = time.perf_counter_ns() - start_time_ns
-    triton_time_us = triton_time_ns / 1e3
+    triton_time_us = benchmark_cuda_function_in_microseconds(
+        run_triton, input_row_major, input_col_major, offs
+    )
 
     return ExperimentResult(
         torch_time_us=torch_time_us,
@@ -173,6 +170,10 @@ def print_results(experiments: List[Experiment]):
     print(tabulate(rows, headers=headers))
 
 
+def benchmark_cuda_function_in_microseconds(f, *args):
+    return do_bench(lambda: f(*args), return_mode="median") * 1e3  # median of do_bench scaled by 1e3 to match the "microseconds" in the name (assumes do_bench reports ms -- confirm)
+
+
 def main():
     torch.random.manual_seed(123)
     configs = get_configs()
diff --git a/torchao/prototype/moe_training/kernels/jagged_float8_scales.py b/torchao/prototype/moe_training/kernels/jagged_float8_scales.py
@@ -33,7 +33,7 @@
     torch.float64: tl.float64,
 }
 
-block_sizes = [128, 256]
+block_sizes = [16, 32]
 kernel_configs_2D = [
     triton.Config(
         {"BLOCK_SIZE_ROWS": block_size_rows, "BLOCK_SIZE_COLS": block_size_cols}

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`torch.float64: tl.float64,`
`34`	`34`	`}`
`35`	`35`
`36`		`-block_sizes = [128, 256]`
	`36`	`+block_sizes = [16, 32]`
`37`	`37`	`kernel_configs_2D = [`
`38`	`38`	`triton.Config(`
`39`	`39`	`{"BLOCK_SIZE_ROWS": block_size_rows, "BLOCK_SIZE_COLS": block_size_cols}`