Commit df502e0

row of blocks within groups only
1 parent: 11f25ec · commit: df502e0

2 files changed (+39, -44 lines)

test/prototype/moe_training/test_kernels.py

Lines changed: 22 additions & 22 deletions
@@ -272,35 +272,41 @@ def test_mxfp8_per_group_blocked_scales_3d(


 @skip_if_rocm("ROCm enablement in progress")
-@pytest.mark.parametrize("m,total_k,n_groups", [(256, 512, 4)])#, (256, 128, 4), (512, 128, 4), (1024, 128, 4), (1024, 256, 4), (1024, 512, 4), (1024, 1024, 4), (1024, 2048, 4), (1024, 4096, 4), (1024, 8192, 4), (1024, 16384, 4)])
+@pytest.mark.parametrize(
+    "m,total_k,n_groups",
+    [
+        (256, 512, 4),
+        (256, 128, 4),
+        (512, 128, 4),
+        (1024, 128, 4),
+        (1024, 256, 4),
+        (1024, 512, 4),
+        (1024, 1024, 4),
+        (1024, 2048, 4),
+        (1024, 4096, 4),
+        (1024, 8192, 4),
+        (1024, 16384, 4),
+        (5120, 16640, 16),
+    ],
+)
 def test_mxfp8_per_group_blocked_scales_2d2d_lhs(
     m: int,
     total_k: int,
     n_groups: int,
 ):
     device = "cuda"
     block_size = 32
-
-    # Make each group of row blocks have distinct, constinent data for debugging
-    input_data = torch.cat(
-        [
-            torch.ones(m // 2, total_k, device=device),
-            torch.full((m // 2, total_k), 999, device=device),
-        ]
-    )
-    #input_data= torch.randn(m, total_k, device=device)
+    input_data = torch.randn(m, total_k, device=device)

     e8m0_scales, _ = to_mx(
         input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
     )

     # Generate group end offsets along total_K, then divide by block_size to get scale group end offsets
-    # input_group_offsets = generate_jagged_offs(
-    #     n_groups, total_k, multiple_of=block_size, device=device
-    # )
-    # input_group_offsets //= block_size
-    input_group_offsets = torch.tensor([3, 8, 12, 16], device=device, dtype=torch.int32)
-    #print(input_group_offsets)
+    input_group_offsets = generate_jagged_offs(
+        n_groups, total_k, multiple_of=block_size, device=device
+    )
+    input_group_offsets //= block_size

     # torch reference
     ref_out_scales, ref_start_cols_after_padding = torch_to_blocked_per_group_2d2d_lhs(
@@ -320,12 +326,6 @@ def test_mxfp8_per_group_blocked_scales_2d2d_lhs(
         input_group_offsets,
         output_group_offsets,
     )
-    print(ref_start_cols_after_padding)
-    with open('tmp-ref.txt', 'w') as f:
-        f.write(str(ref_out_scales.storage()))
-    with open('tmp-triton.txt', 'w') as f:
-        f.write(str(triton_out_scales.storage()))
-    breakpoint()
     assert torch.allclose(ref_out_scales, triton_out_scales, atol=0, rtol=0), (
         "blocked scales not equal"
     )
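
For intuition on the offsets the updated test feeds the kernel: to_mx produces one e8m0 scale per block_size contiguous elements along K, so group end offsets along total_k (which generate_jagged_offs makes multiples of block_size) convert to scale-column end offsets by integer division. A minimal sketch with hand-picked offsets (the numbers below are illustrative, not taken from the test):

import torch

total_k, block_size = 512, 32

# Hand-picked group end offsets along total_k, each a multiple of block_size;
# generate_jagged_offs produces a random jagged split with the same property.
input_group_offsets = torch.tensor([128, 256, 384, 512], dtype=torch.int32)

# One e8m0 scale per block_size contiguous K elements, so integer division
# converts element offsets into scale-column offsets.
scale_group_offsets = input_group_offsets // block_size
assert scale_group_offsets.tolist() == [4, 8, 12, 16]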

torchao/prototype/moe_training/kernels/mxfp8_blocked_scales.py

Lines changed: 17 additions & 22 deletions
@@ -479,8 +479,6 @@ def triton_mx_block_rearrange_per_group_2d2d_lhs(
     # Output block stride for the rearranged format
     BLOCK_ROWS, BLOCK_COLS = 128, 4
     output_stride_per_block = BLOCK_ROWS * BLOCK_COLS
-    num_row_blocks = padded_rows // BLOCK_ROWS
-    output_stride_per_col_of_blocks = output_stride_per_block * num_row_blocks

     # We parallelize per group and per row block.
     # Cols per group is variable, so we just loop through col blocks for each group.
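
The launch grid itself is not part of this hunk; the comment above implies one program per (group, row block) pair, with each program looping over its group's col blocks. A hedged sketch of that shape (an assumption based on the comment, not the file's actual launch code):

# Assumed launch shape, with made-up example values:
BLOCK_ROWS = 128
padded_rows = 256  # rows already padded to a multiple of BLOCK_ROWS
num_groups = 4
grid = (num_groups, padded_rows // BLOCK_ROWS)  # one program per (group, row block)
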
@@ -495,17 +493,17 @@
         scales_tensor.stride(1),
         rows,
         cols,
+        padded_rows,
         num_groups,
         # Original offsets (to read from)
         input_group_end_offsets,
         # Output scales tensor and group offsets after padding (to write to)
         output.view(torch.uint8),
         output_group_start_offsets,
         output_stride_per_block,
-        output_stride_per_col_of_blocks,
         BLOCK_ROWS=BLOCK_ROWS,
         BLOCK_COLS=BLOCK_COLS,
-        DEBUG=True,
+        DEBUG=False,
     )
     return output

@@ -517,12 +515,12 @@ def triton_scale_swizzle_per_group_2d2d_lhs(
     scales_stride_dim1,
     scale_rows,
     scale_cols,
+    padded_rows,
     num_groups,
     orig_offsets,  # (num_groups,)
     output_scales_ptr,
     output_scales_group_offsets,  # (num_groups,)
     output_stride_per_block,
-    output_stride_per_col_of_blocks,
     BLOCK_ROWS: tl.constexpr,
     BLOCK_COLS: tl.constexpr,
     DEBUG: tl.constexpr = False,
@@ -557,8 +555,9 @@ def triton_scale_swizzle_per_group_2d2d_lhs(

     # For this group and row block, we iterate through col blocks, reading (BLOCK_ROWS, BLOCK_COLS) from the input scales.
     # We track how many col blocks we have iterated through.
+    out_group_base_offset = output_group_start_col * padded_rows
     curr_input_start_col = input_group_start_col
-    curr_out_start_col_block = output_group_start_col // BLOCK_COLS
+    curr_out_start_col_block = 0
     while curr_input_start_col < input_group_end_col:
         # Read block of input scales
         block_row_offs = block_row_pid * BLOCK_ROWS + row_offs
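
The added out_group_base_offset line anchors the commit's change: each padded output column stores padded_rows scale bytes, so a group that starts at output column output_group_start_col begins at output_group_start_col * padded_rows in the flattened output, and curr_out_start_col_block now counts col blocks from the start of the group rather than globally. With illustrative numbers (not from the source):

padded_rows = 256            # two 128-row blocks after padding
output_group_start_col = 8   # this group's first column in the padded output
out_group_base_offset = output_group_start_col * padded_rows
assert out_group_base_offset == 2048
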
@@ -570,25 +569,21 @@ def triton_scale_swizzle_per_group_2d2d_lhs(
         input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0)
         scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS))

-        # Calculate block offset using provided output block stride
-        tgt_row_off = block_row_pid * output_stride_per_block
-        tgt_col_off = curr_out_start_col_block * output_stride_per_col_of_blocks
-
-        output_block_offsets = tgt_row_off + tgt_col_off
-        if DEBUG:
-            tl.device_print("\nblock_row_pid: ", block_row_pid)
-            tl.device_print("group_pid: ", group_pid)
-            tl.device_print("tgt_row_block", block_row_pid)
-            tl.device_print("output_group_start_col: ", output_group_start_col)
-            tl.device_print("tgt_col_block", curr_out_start_col_block)
-            tl.device_print("tgt_row_off: ", tgt_row_off)
-            tl.device_print("tgt_col_off: ", tgt_col_off)
-            tl.device_print("global_off:", tgt_row_off + tgt_col_off)
-            tl.device_print("writing: ", scales_flat)
+        # Get offset within the group to add to the group's base offset
+        num_cols_in_group = input_group_end_col - input_group_start_col
+        num_col_blocks_in_group = tl.cdiv(num_cols_in_group, BLOCK_COLS)
+        stride_per_row_of_blocks_in_group = (
+            num_col_blocks_in_group * output_stride_per_block
+        )
+        offset_in_group = (
+            block_row_pid * stride_per_row_of_blocks_in_group
+            + curr_out_start_col_block * output_stride_per_block
+        )
+        final_offset = out_group_base_offset + offset_in_group

         # Apply swizzling for write to gmem
         tl.store(
-            output_scales_ptr + output_block_offsets + dest_indices_flat,
+            output_scales_ptr + final_offset + dest_indices_flat,
             scales_flat,
         )
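
Putting the new index math together: a (BLOCK_ROWS, BLOCK_COLS) tile lands at the group's base offset, plus block_row_pid rows of blocks, where a row of blocks now spans only that group's col blocks (hence the commit title), plus the current col block. A host-side trace of the same arithmetic with made-up values (none of these numbers come from the source):

import math

# Made-up example values, used only to trace the kernel's offset math.
BLOCK_ROWS, BLOCK_COLS = 128, 4
output_stride_per_block = BLOCK_ROWS * BLOCK_COLS     # 512 scales per tile
padded_rows = 256                                     # two row blocks
output_group_start_col = 8                            # group base column
input_group_start_col, input_group_end_col = 8, 18    # 10 scale cols in group

out_group_base_offset = output_group_start_col * padded_rows       # 2048
num_col_blocks_in_group = math.ceil(
    (input_group_end_col - input_group_start_col) / BLOCK_COLS     # ceil(10/4) = 3
)
stride_per_row_of_blocks_in_group = (
    num_col_blocks_in_group * output_stride_per_block              # 1536
)

block_row_pid, curr_out_start_col_block = 1, 2  # second row block, third col block
offset_in_group = (
    block_row_pid * stride_per_row_of_blocks_in_group
    + curr_out_start_col_block * output_stride_per_block
)
final_offset = out_group_base_offset + offset_in_group
assert final_offset == 2048 + 3 * 512 + 2 * 512  # 4608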
