sallylxl · ziang663 · Jun 5, 2025 · Jun 5, 2025 · Jun 5, 2025 · Jun 5, 2025
diff --git a/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml b/launcher_scripts/k8s/training/megatron-lm/llm/meg-lm-llama2-70b-ckpt.yaml
@@ -0,0 +1,99 @@
+apiVersion: kubeflow.org/v1
+kind: PyTorchJob  # one Master replica plus three Worker replicas, all sharing one pod spec
+metadata:
+  name: meg-lm-llama2-70b-alan
+  namespace: default
+spec:
+  pytorchReplicaSpecs:
+    Master:
+      replicas: 1
+      restartPolicy: Never
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"  # quoted: annotation values must be strings
+          labels:
+            scitix.ai/topo-aware-in-node: "true"  # quoted: label values must be strings
+        spec: &job-spec  # anchored so the Worker template below can reuse this pod spec
+          tolerations:
+          - key: "scitix.ai/nodecheck"
+            operator: "Exists"
+            effect: "NoSchedule"
+          containers:
+          - args:  # one string for `bash -c`; each trailing backslash joins the next line onto it
+            - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \
+               DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \
+               BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \
+               RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \
+               ENABLE_CKPT=1 MOCK_DATA=true \
+               bash /workspace/deep_learning_examples/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh "
+            command:  # the args string above becomes the script given to `bash -c`
+            - /usr/bin/env
+            - bash
+            - -c
+            env:
+            - name: NODE_NAME  # filled from the pod's spec.nodeName
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: POD_NAME  # filled from the pod's metadata.name
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: TZ
+              value: CST-8
+            image: registry-ap-southeast.scitix.ai/hpc/nemo:25.04
+            imagePullPolicy: Always
+            name: pytorch
+            resources:
+              limits:  # identical to requests below
+                cpu: "80"
+                memory: 1000Gi
+                nvidia.com/gpu: "8"
+                rdma/hca_shared_devices_all: "1"
+              requests:
+                cpu: "80"
+                memory: 1000Gi
+                nvidia.com/gpu: "8"
+                rdma/hca_shared_devices_all: "1"
+            securityContext:
+              capabilities:
+                add:
+                - IPC_LOCK  # presumably needed for RDMA memory pinning - TODO confirm
+            volumeMounts:
+            - mountPath: /dev/shm  # backed by the node's own /dev/shm (see dev-shm volume)
+              name: dev-shm
+            - mountPath: /workspace/deep_learning_examples
+              name: deep-learning-examples
+            - mountPath: /data/zawang/siflowai  # NOTE(review): "zawang" here vs "wangza" on the host path - confirm
+              name: siflowai
+          volumes:
+          - name: deep-learning-examples
+            hostPath:
+              path: /data/wangza/deep_learning_examples
+          - name: siflowai
+            hostPath:
+              path: /data/wangza/siflowai
+          - name: dev-shm  # NOTE(review): hostPath, so `rm -rf /dev/shm/*` in args clears the node's shm - confirm
+            hostPath:
+              path: /dev/shm
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                  - matchExpressions:
+                      - key: scitix.ai/gpu-type
+                        operator: In
+                        values:
+                          - h100nvlink80
+    Worker:
+      replicas: 3
+      restartPolicy: Never
+      template:
+        metadata:
+          annotations:
+            sidecar.istio.io/inject: "false"
+          labels:
+            scitix.ai/topo-aware-in-node: "true"
+        spec:
+          <<: *job-spec  # YAML 1.1 merge key (shallow): Worker pods get the Master pod spec unchanged
diff --git a/thirdparty/Megatron-DeepSpeed b/thirdparty/Megatron-DeepSpeed
diff --git a/thirdparty/Megatron-LM b/thirdparty/Megatron-LM
diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_13b_bf16.sh
@@ -106,6 +106,7 @@ MODEL_ARGS=(
     --swiglu
     --normalization RMSNorm 
     --disable-bias-linear
+    --async-save
 )
 
 TRAINING_ARGS=(

diff --git a/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh b/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh
@@ -109,6 +109,7 @@ MODEL_ARGS=(
     --swiglu
     --normalization RMSNorm 
     --disable-bias-linear
+    --async-save
 )
 
 TRAINING_ARGS=(