From 25ae618ed3f6bfe01ccf0a3cb5b100f6a052a988 Mon Sep 17 00:00:00 2001 From: Victor Zhu Date: Mon, 17 Jun 2024 11:00:14 -0700 Subject: [PATCH 1/3] Update SMP v2 notebooks to use latest PT2.3.1-TSM2.4.0 release. --- .../gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb | 4 ++-- .../gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb | 8 ++++---- .../llama_v2/smp-finetuning-llama-fsdp-tp.ipynb | 4 ++-- .../llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb | 8 ++++---- .../mixtral/smp-train-mixtral-fsdp-ep.ipynb | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb index 0ebc523568..f4448787b2 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb @@ -882,8 +882,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", diff --git a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb index 28638611cd..014dc103e0 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb @@ -873,8 +873,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", @@ -955,8 +955,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", diff --git a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb index 46c5edbc42..b3cc07b7ef 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb @@ -867,8 +867,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", diff --git a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb index 0a4c705b11..a7c4189016 100644 --- 
a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb @@ -831,8 +831,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", @@ -913,8 +913,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", diff --git a/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb b/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb index c58b76c310..30aec32a6d 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb @@ -916,8 +916,8 @@ " }\n", " },\n", " },\n", - " py_version=\"py310\",\n", - " framework_version=\"2.2.0\",\n", + " py_version=\"py311\",\n", + " framework_version=\"2.3.1\",\n", " # image_uri=$IMAGE, # Either provide `framework_version` or `image_uri`\n", " output_path=s3_output_bucket,\n", " max_run=86400,\n", From 96704cf572d34410d786807be795cc3ad2f6efc1 Mon Sep 17 00:00:00 2001 From: Victor Zhu Date: Mon, 17 Jun 2024 14:07:46 -0700 Subject: [PATCH 2/3] Update SMP v2 shared_scripts --- .../shared-scripts/requirements.txt | 6 +-- .../shared-scripts/train_lib.py | 2 +- .../shared-scripts/train_utils.py | 38 +++++++++++++++---- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/requirements.txt b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/requirements.txt index 8dd5fd9937..ed71162ed8 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/requirements.txt +++ b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/requirements.txt @@ -1,9 +1,9 @@ accelerate>=0.12.0 -datasets>=2.16.1 +datasets>=2.19.1 einops evaluate expecttest -flash-attn>=2.3.6 +flash-attn>=2.3.6,<2.4 h5py humanize hypothesis @@ -14,4 +14,4 @@ protobuf scikit-learn sentencepiece!=0.1.92 tensorboard -transformers>=4.37.1 +transformers>=4.40.1 diff --git a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_lib.py b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_lib.py index b391dee3c2..188f199c1f 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_lib.py +++ b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_lib.py @@ -397,7 +397,7 @@ def main(args): len(args.num_kept_checkpoints), ) if len(set(ckpt_lens)) != 1: - raise ValueError(f"Len mismtach for checkpoint dir, freq vs num to keep: {ckpt_lens}.") + raise ValueError(f"Len mismatch for checkpoint dir, freq vs num to keep: {ckpt_lens}.") if args.distributed_backend == "smddp": import smdistributed.dataparallel.torch.torch_smddp # pylint: disable=unused-import diff --git 
a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_utils.py b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_utils.py index 99c0264120..e5b73049c1 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_utils.py +++ b/training/distributed_training/pytorch/model_parallel_v2/shared-scripts/train_utils.py @@ -34,11 +34,22 @@ def compute_num_params(model): def compute_tflops(args, global_batch_size, step_time, world_size): - # Based on + # Based on # https://github.com/NVIDIA/Megatron-LM/blob/ba773259dbe5735fbd91ca41e7f4ded60b335c52/megatron/training/training.py#L65 - num_experts_routed_to = 1 if args.moe > 1 else args.num_experts_per_tok - if args.num_key_value_heads is None: + # Attention projection size. + kv_channels = args.hidden_width // args.num_heads + query_projection_size = kv_channels * args.num_heads + query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_width + + # Group Query Attention. + if not args.num_key_value_heads: args.num_key_value_heads = args.num_heads + + # MoE. + num_experts_routed_to = 1 if args.moe == 0 else args.num_experts_per_tok + gated_linear_multiplier = 3/2 if args.moe > 0 else 1 + + # Compute the number of floating point operations num_flops = ( 12 * global_batch_size @@ -47,13 +58,26 @@ def compute_tflops(args, global_batch_size, step_time, world_size): * args.hidden_width * args.hidden_width * ( - 1 - + ((args.intermediate_size / args.hidden_width) * num_experts_routed_to) - + (args.num_key_value_heads / args.num_heads) - + (args.max_context_width / args.hidden_width) + # Attention. + ( + ( + 1 + + (args.num_key_value_heads / args.num_heads) + + (args.max_context_width / args.hidden_width) + ) * query_projection_to_hidden_size_ratio + ) + # MLP. + + ( + (args.intermediate_size / args.hidden_width) + * num_experts_routed_to + * gated_linear_multiplier + ) + # Logit. 
+ (args.vocab_size / (2 * args.num_layers * args.hidden_width)) ) ) + + # Convert to TFLOPs per GPU tflops_per_gpu = num_flops / ( step_time * 10**12 * world_size) return tflops_per_gpu From a4c576fa43d76d932c849faf2cdb42efb4b79a67 Mon Sep 17 00:00:00 2001 From: Victor Zhu Date: Tue, 18 Jun 2024 14:45:17 -0700 Subject: [PATCH 3/3] Update minimum sagemaker pysdk version to 2.224 --- .../gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb | 2 +- .../model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb | 2 +- .../llama_v2/smp-finetuning-llama-fsdp-tp.ipynb | 2 +- .../llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb | 2 +- .../model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb index f4448787b2..50fb20cf6f 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-finetuning-gpt-neox-fsdp-tp.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade \"sagemaker>=2.212\"\n", + "%pip install --upgrade \"sagemaker>=2.224\"\n", "%pip install sagemaker-experiments" ] }, diff --git a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb index 014dc103e0..b8598276c5 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/gpt-neox/smp-train-gpt-neox-fsdp-tp.ipynb @@ -74,7 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade \"sagemaker>=2.212\"\n", + "%pip install --upgrade \"sagemaker>=2.224\"\n", "%pip install sagemaker-experiments" ] }, diff --git a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb index b3cc07b7ef..c7c1b8bae1 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-finetuning-llama-fsdp-tp.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade \"sagemaker>=2.212\"\n", + "%pip install --upgrade \"sagemaker>=2.224\"\n", "%pip install sagemaker-experiments" ] }, diff --git a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb index a7c4189016..21d5c26c0d 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/llama_v2/smp-train-llama-fsdp-tp-fp8.ipynb @@ -74,7 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade \"sagemaker>=2.212\"\n", + "%pip install --upgrade \"sagemaker>=2.224\"\n", "%pip install sagemaker-experiments" ] }, diff --git a/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb 
b/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb index 30aec32a6d..d9db6d36ff 100644 --- a/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb +++ b/training/distributed_training/pytorch/model_parallel_v2/mixtral/smp-train-mixtral-fsdp-ep.ipynb @@ -74,7 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade \"sagemaker>=2.215\"\n", + "%pip install --upgrade \"sagemaker>=2.224\"\n", "%pip install sagemaker-experiments" ] },
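
For reference, the py_version/framework_version bumps in patches 1/3 and 3/3 all land in the same SageMaker PyTorch estimator call in each notebook. Below is a minimal, hypothetical sketch of that call with the new combination (SageMaker Python SDK >= 2.224, PyTorch 2.3.1, Python 3.11); the entry point, source dir, role, instance settings, SMP parameter value, and S3 path are illustrative placeholders, not values taken from the notebooks.

# Requires the SDK minimum set in patch 3/3: pip install "sagemaker>=2.224"
import sagemaker
from sagemaker.pytorch import PyTorch

session = sagemaker.Session()

estimator = PyTorch(
    entry_point="train.py",                # hypothetical entry point
    source_dir="shared-scripts",           # hypothetical source dir
    role=sagemaker.get_execution_role(),   # assumes a SageMaker execution role is available
    instance_type="ml.p4d.24xlarge",       # placeholder instance type
    instance_count=2,                      # placeholder instance count
    distribution={                         # SMP v2 distribution block; the degree is a placeholder
        "torch_distributed": {"enabled": True},
        "smdistributed": {
            "modelparallel": {
                "enabled": True,
                "parameters": {"tensor_parallel_degree": 4},
            }
        },
    },
    py_version="py311",                    # updated in this patch series
    framework_version="2.3.1",             # updated in this patch series
    # image_uri=IMAGE,                     # either provide framework_version or image_uri
    output_path=f"s3://{session.default_bucket()}/smp-v2-output",  # placeholder S3 path
    max_run=86400,
)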
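
Patch 2/3 reworks the model-FLOPs estimate in train_utils.py to account for grouped-query attention, gated MLPs, MoE routing, and the logit projection. The sketch below reassembles the updated compute_tflops as a standalone function so the formula can be checked outside the training script; the two factors elided between the hunks (sequence length and layer count) are assumed from the referenced Megatron-LM formula, and the example configuration at the bottom is hypothetical.

from types import SimpleNamespace


def compute_tflops(args, global_batch_size, step_time, world_size):
    # Based on
    # https://github.com/NVIDIA/Megatron-LM/blob/ba773259dbe5735fbd91ca41e7f4ded60b335c52/megatron/training/training.py#L65
    # Attention projection size.
    kv_channels = args.hidden_width // args.num_heads
    query_projection_size = kv_channels * args.num_heads
    query_projection_to_hidden_size_ratio = query_projection_size / args.hidden_width

    # Group Query Attention: fall back to MHA when no KV-head count is given.
    if not args.num_key_value_heads:
        args.num_key_value_heads = args.num_heads

    # MoE: a dense model (moe == 0) routes every token to a single expert.
    num_experts_routed_to = 1 if args.moe == 0 else args.num_experts_per_tok
    gated_linear_multiplier = 3 / 2 if args.moe > 0 else 1

    # Forward + backward FLOPs per optimizer step.
    num_flops = (
        12
        * global_batch_size
        * args.max_context_width  # assumed factor (elided between hunks)
        * args.num_layers         # assumed factor (elided between hunks)
        * args.hidden_width
        * args.hidden_width
        * (
            # Attention.
            (
                (
                    1
                    + (args.num_key_value_heads / args.num_heads)
                    + (args.max_context_width / args.hidden_width)
                )
                * query_projection_to_hidden_size_ratio
            )
            # MLP.
            + (
                (args.intermediate_size / args.hidden_width)
                * num_experts_routed_to
                * gated_linear_multiplier
            )
            # Logit.
            + (args.vocab_size / (2 * args.num_layers * args.hidden_width))
        )
    )

    # Convert to TFLOPs per GPU.
    return num_flops / (step_time * 10**12 * world_size)


# Hypothetical Llama-2-7B-style configuration, for illustration only.
cfg = SimpleNamespace(
    hidden_width=4096,
    num_heads=32,
    num_key_value_heads=None,
    num_layers=32,
    intermediate_size=11008,
    max_context_width=4096,
    vocab_size=32000,
    moe=0,
    num_experts_per_tok=1,
)
print(compute_tflops(cfg, global_batch_size=64, step_time=2.0, world_size=16))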