@@ -34,11 +34,22 @@ def compute_num_params(model):
 
 
 def compute_tflops(args, global_batch_size, step_time, world_size):
-    # Based on 
+    # Based on
     # https://github.com/NVIDIA/Megatron-LM/blob/ba773259dbe5735fbd91ca41e7f4ded60b335c52/megatron/training/training.py#L65
-    num_experts_routed_to = 1 if args.moe > 1 else args.num_experts_per_tok
-    if args.num_key_value_heads is None:
+    # Attention projection size.
+    kv_channels = args.hidden_width // args.num_heads
+    query_projection_size = kv_channels * args.num_heads
+    query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_width
+
+    # Group Query Attention.
+    if not args.num_key_value_heads:
         args.num_key_value_heads = args.num_heads
+
+    # MoE.
+    num_experts_routed_to = 1 if args.moe == 0 else args.num_experts_per_tok
+    gated_linear_multiplier = 3 / 2 if args.moe > 0 else 1
+
+    # Compute the number of floating point operations
     num_flops = (
         12
         * global_batch_size
@@ -47,13 +58,26 @@ def compute_tflops(args, global_batch_size, step_time, world_size):
         * args.hidden_width
         * args.hidden_width
         * (
-            1
-            + ((args.intermediate_size / args.hidden_width) * num_experts_routed_to)
-            + (args.num_key_value_heads / args.num_heads)
-            + (args.max_context_width / args.hidden_width)
+            # Attention.
+            (
+                (
+                    1
+                    + (args.num_key_value_heads / args.num_heads)
+                    + (args.max_context_width / args.hidden_width)
+                ) * query_projection_to_hidden_size_ratio
+            )
+            # MLP.
+            + (
+                (args.intermediate_size / args.hidden_width)
+                * num_experts_routed_to
+                * gated_linear_multiplier
+            )
+            # Logit.
             + (args.vocab_size / (2 * args.num_layers * args.hidden_width))
         )
     )
+
+    # Convert to TFLOPs per GPU
     tflops_per_gpu = num_flops / (
         step_time * 10**12 * world_size)
     return tflops_per_gpu
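
As a sanity check on the updated formula, here is a minimal usage sketch. The field names mirror what the function reads from `args`; the concrete values, the `SimpleNamespace` stand-in for the training script's argparse namespace, and the batch/step/world-size numbers are illustrative assumptions, not taken from the commit.

# Hypothetical example: a dense, Llama-style configuration (moe=0).
from types import SimpleNamespace

args = SimpleNamespace(
    num_layers=32,            # transformer blocks
    hidden_width=4096,        # model hidden size
    num_heads=32,             # attention heads
    num_key_value_heads=8,    # GQA KV heads; a falsy value falls back to num_heads
    max_context_width=4096,   # sequence length
    intermediate_size=14336,  # MLP hidden size
    vocab_size=32000,
    moe=0,                    # 0 = dense model, >0 = number of experts
    num_experts_per_tok=2,    # only used when moe > 0
)

# 512 sequences per step, 2.5 s per step, 64 GPUs (all assumed values).
tflops = compute_tflops(args, global_batch_size=512, step_time=2.5, world_size=64)
print(f"{tflops:.1f} TFLOP/s per GPU")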