
Commit 60645bc

Fix EP token group padding issue (#1718)

Fixes #1651

## Summary

- Round up `max_len` of permuted token indices in the expert parallel decorator to be a multiple of the token group alignment size.

## Test plan

- Llama4 debug model with FSDP=2, EP=2: `NGPU=2 CONFIG_FILE="./torchtitan/experiments/llama4/train_configs/debug_model.toml" ./run_train.sh --parallelism.data_parallel_shard_degree=2 --parallelism.expert_parallel_degree=2 --compile.enable`
1 parent d66b72a commit 60645bc

2 files changed: +15 −1 lines changed


torchtitan/distributed/expert_parallel.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -22,6 +22,8 @@
 )
 from torch.distributed.tensor.parallel import ParallelStyle
 
+from torchtitan.distributed.utils import _round_up
+
 
 TOKEN_GROUP_ALIGN_SIZE_M = 8
 ValidTokenGroupAlignmentSize = Literal[8, 16, 32]
@@ -253,6 +255,12 @@ def wrapper(
         experts_per_ep_rank = w1.shape[0]
         num_ep_ranks = num_tokens_per_expert.shape[0] // experts_per_ep_rank
 
+        # Make sure max_len of permuted token indices is divisible by TOKEN_GROUP_ALIGN_SIZE_M,
+        # by padding it to the nearest multiple of TOKEN_GROUP_ALIGN_SIZE_M.
+        x_padded_per_expert = (
+            x.shape[0] + experts_per_ep_rank * TOKEN_GROUP_ALIGN_SIZE_M
+        )
+        padded_max_len = _round_up(x_padded_per_expert, TOKEN_GROUP_ALIGN_SIZE_M)
         with torch.no_grad():
             (
                 permuted_indices,
@@ -262,7 +270,7 @@ def wrapper(
                 num_tokens_per_expert,
                 experts_per_ep_rank,
                 num_ep_ranks,
-                x.shape[0] + experts_per_ep_rank * TOKEN_GROUP_ALIGN_SIZE_M,
+                padded_max_len,
                 TOKEN_GROUP_ALIGN_SIZE_M,
             )
 
```
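The padding arithmetic is easiest to see with concrete numbers. A minimal sketch of the before/after `max_len` computation (the shapes here are hypothetical, chosen for illustration; they are not from the commit):

```python
# Minimal sketch of the padding arithmetic (hypothetical values, not from the commit).
TOKEN_GROUP_ALIGN_SIZE_M = 8

def _round_up(x: int, y: int) -> int:
    """Round up x to the nearest multiple of y."""
    return ((x + y - 1) // y) * y

num_tokens = 100         # stand-in for x.shape[0]
experts_per_ep_rank = 3  # stand-in for w1.shape[0]

# Before the fix: max_len = 100 + 3 * 8 = 124, which is NOT a multiple of 8.
old_max_len = num_tokens + experts_per_ep_rank * TOKEN_GROUP_ALIGN_SIZE_M
assert old_max_len % TOKEN_GROUP_ALIGN_SIZE_M != 0

# After the fix: 124 is rounded up to 128, a token-group-aligned length.
padded_max_len = _round_up(old_max_len, TOKEN_GROUP_ALIGN_SIZE_M)
assert padded_max_len == 128 and padded_max_len % TOKEN_GROUP_ALIGN_SIZE_M == 0
```

Note that `experts_per_ep_rank * TOKEN_GROUP_ALIGN_SIZE_M` is itself a multiple of the alignment size, so the old expression was misaligned exactly when `x.shape[0]` was not already a multiple of `TOKEN_GROUP_ALIGN_SIZE_M`.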
torchtitan/distributed/utils.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -448,3 +448,9 @@ def _clip_grad_norm_with_ep(
     torch.nn.utils.clip_grads_with_norm_(non_ep_params, max_norm, total_norm, foreach)
 
     return total_norm
+
+
+def _round_up(x: int, y: int) -> int:
+    """Round up x to the nearest multiple of y."""
+    x_ceil_div_y = (x + y - 1) // y
+    return x_ceil_div_y * y
```
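For reference, a few illustrative checks of `_round_up` behavior (my examples, not part of the commit):

```python
assert _round_up(124, 8) == 128  # rounds up to the next multiple
assert _round_up(128, 8) == 128  # exact multiples are left unchanged
assert _round_up(0, 8) == 0      # zero stays zero
assert _round_up(1, 32) == 32
```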
