diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml
new file mode 100644
index 0000000..0ffc2fe
--- /dev/null
+++ b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml
@@ -0,0 +1,99 @@
+apiVersion: kubeflow.org/v1
+kind: PyTorchJob
+metadata:
+  name: meg-lm-llama2-70b-alan
+  namespace: default
+spec:
+  pytorchReplicaSpecs:
+    Master:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+          labels:
+            scitix.ai/topo-aware-in-node: "true"
+        spec: &job-spec
+          tolerations:
+          - key: "scitix.ai/nodecheck"
+            operator: "Exists"
+            effect: "NoSchedule"
+          containers:
+          - args:
+            - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \
+              DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \
+              BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \
+              RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \
+              ENABLE_CKPT=1 MOCK_DATA=true \
+              bash /workspace/deep_learning_examples/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh "
+            command:
+            - /usr/bin/env
+            - bash
+            - -c
+            env:
+            - name: NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: TZ
+              value: CST-8
+            image: registry-ap-southeast.scitix.ai/hpc/nemo:25.04
+            imagePullPolicy: Always
+            name: pytorch
+            resources:
+              limits:
+                cpu: "80"
+                memory: 1000Gi
+                nvidia.com/gpu: "8"
+                rdma/hca_shared_devices_all: "1"
+              requests:
+                cpu: "80"
+                memory: 1000Gi
+                nvidia.com/gpu: "8"
+                rdma/hca_shared_devices_all: "1"
+            securityContext:
+              capabilities:
+                add:
+                - IPC_LOCK
+            volumeMounts:
+            - mountPath: /dev/shm
+              name: dev-shm
+            - mountPath: /workspace/deep_learning_examples
+              name: deep-learning-examples
+            - mountPath: /data/zawang/siflowai
+              name: siflowai
+          volumes:
+          - name: deep-learning-examples
+            hostPath:
+              path: /data/wangza/deep_learning_examples
+          - name: siflowai
+            hostPath:
+              path: /data/wangza/siflowai
+          - name: dev-shm
+            hostPath:
+              path: /dev/shm
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                - matchExpressions:
+                  - key: scitix.ai/gpu-type
+                    operator: In
+                    values:
+                    - h100nvlink80
+    Worker:
+      replicas: 3
+      restartPolicy: Never
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+          labels:
+            scitix.ai/topo-aware-in-node: "true"
+        spec:
+          <<: *job-spec
diff --git a/thirdparty/Megatron-DeepSpeed b/thirdparty/Megatron-DeepSpeed
index 0d6e379..3e1da1f 160000
--- a/thirdparty/Megatron-DeepSpeed
+++ b/thirdparty/Megatron-DeepSpeed
@@ -1 +1 @@
-Subproject commit 0d6e3793a1fc06eded9764ef15ad12bcc0281101
+Subproject commit 3e1da1fbb226fd4d19aad33afcb33c2f6ed0eb26
diff --git a/thirdparty/Megatron-LM b/thirdparty/Megatron-LM
index 77b4bfe..957f348 160000
--- a/thirdparty/Megatron-LM
+++ b/thirdparty/Megatron-LM
@@ -1 +1 @@
-Subproject commit 77b4bfe00ab2634650345fd485be59a9d9c27272
+Subproject commit 957f3488efd505e5d22f5d5bc46eaa187eeb44cb
diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
index 654ca6b..8cc87a5 100755
--- a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
+++ b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
@@ -106,6 +106,7 @@ MODEL_ARGS=(
     --swiglu
     --normalization RMSNorm
     --disable-bias-linear
+    --async-save
 )
 
 TRAINING_ARGS=(
diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
index a626209..8015d01 100755
--- a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
+++ b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
@@ -109,6 +109,7 @@ MODEL_ARGS=(
     --swiglu
     --normalization RMSNorm
     --disable-bias-linear
+    --async-save
 )
 
 TRAINING_ARGS=(