@@ -109,16 +109,17 @@
             input_layouts=Replicate(),
             output_layouts=Shard(1),
         ),
+        "norm": SequenceParallel(),
         "output": ColwiseParallel(
             input_layouts=Shard(1),
             output_layouts=Replicate()
         ),
-        "norm": SequenceParallel(),
     }
 )
 
 for layer_id, transformer_block in enumerate(model.layers):
     layer_tp_plan = {
+        "attention_norm": SequenceParallel(),
         "attention": PrepareModuleInput(
             input_layouts=(Shard(1), None),
             desired_input_layouts=(Replicate(), None),
@@ -127,15 +128,14 @@
         "attention.wk": ColwiseParallel(),
         "attention.wv": ColwiseParallel(),
         "attention.wo": RowwiseParallel(output_layouts=Shard(1)),
-        "attention_norm": SequenceParallel(),
+        "ffn_norm": SequenceParallel(),
         "feed_forward": PrepareModuleInput(
             input_layouts=(Shard(1),),
             desired_input_layouts=(Replicate(),),
         ),
         "feed_forward.w1": ColwiseParallel(),
         "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)),
         "feed_forward.w3": ColwiseParallel(),
-        "ffn_norm": SequenceParallel(),
     }
 
     # Adjust attention module to use the local number of heads
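
For context, a minimal sketch of how plans like the ones above are applied with parallelize_module. This is not the exact code from this file: the 8-GPU mesh, the `model` / `model.layers` objects, and the truncated layer_tp_plan are assumptions based on the surrounding diff, and in older PyTorch releases Shard and Replicate are imported from torch.distributed._tensor rather than torch.distributed.tensor.

from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    PrepareModuleInput,
    RowwiseParallel,
    SequenceParallel,
    parallelize_module,
)

# Assumption: one 8-way tensor-parallel mesh over the local GPUs.
tp_mesh = init_device_mesh("cuda", (8,))

# Each per-block plan is applied to its transformer block; the top-level plan
# ("tok_embeddings" / "norm" / "output") is applied to `model` as a whole,
# as in the first hunk above.
for layer_id, transformer_block in enumerate(model.layers):
    layer_tp_plan = {
        "attention_norm": SequenceParallel(),
        "attention": PrepareModuleInput(
            input_layouts=(Shard(1), None),
            desired_input_layouts=(Replicate(), None),
        ),
        "attention.wk": ColwiseParallel(),
        "attention.wo": RowwiseParallel(output_layouts=Shard(1)),
        # ... remaining entries as in the diff above ...
    }
    parallelize_module(
        module=transformer_block,
        device_mesh=tp_mesh,
        parallelize_plan=layer_tp_plan,
    )

SequenceParallel on the norm layers keeps their computation sharded along the sequence dimension, which is why the adjacent PrepareModuleInput entries redistribute the Shard(1) activations back to Replicate() before the attention and feed_forward modules.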
|