Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
---
# Kubeflow PyTorchJob that launches the Megatron-LM Llama2-70B bf16 training
# script (run_meg_lm_llama2_70b_bf16.sh) on 1 Master pod plus 3 Worker pods.
# Every pod uses the same pod spec and requests 8 GPUs and one shared RDMA
# device.
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: meg-lm-llama2-70b-alan
  namespace: default
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: Never
      template:
        metadata:
          annotations:
            # Keep the Istio sidecar out of the training pod.
            sidecar.istio.io/inject: "false"
          labels:
            scitix.ai/topo-aware-in-node: "true"
        # Anchored so the Worker replica below can reuse this exact pod spec.
        spec: &job-spec
          # Tolerate the NoSchedule taint keyed scitix.ai/nodecheck, whatever
          # its value (operator: Exists).
          tolerations:
            - key: "scitix.ai/nodecheck"
              operator: "Exists"
              effect: "NoSchedule"
          containers:
            - name: pytorch
              image: registry-ap-southeast.scitix.ai/hpc/nemo:25.04
              imagePullPolicy: Always
              command:
                - /usr/bin/env
                - bash
                - -c
              # Single shell command line handed to `bash -c`:
              #   1. copy RANK into NODE_RANK, then unset RANK
              #      (presumably RANK is injected by the training operator
              #      and the script expects NODE_RANK - TODO confirm);
              #   2. empty /dev/shm;
              #   3. run the training script with its settings passed as
              #      environment variables.
              # NOTE(review): /dev/shm is a hostPath mount of the node's own
              # /dev/shm (see volumes), so `rm -rf /dev/shm/*` removes
              # shared-memory files on the host, not just in this pod.
              # Confirm nothing else on the node relies on them.
              # The trailing backslashes are YAML escaped line breaks inside
              # a double-quoted scalar: the value is one line, and the space
              # before each backslash is kept.
              args:
                - "export NODE_RANK=$RANK && unset RANK && rm -rf /dev/shm/* && \
                  DEEP_LEARNING_EXAMPLES_DIR=/workspace/deep_learning_examples \
                  BASE_RESULTS_DIR=/workspace/deep_learning_examples/results \
                  RUN_ID=0 GBS=128 MBS=1 PP=4 TP=4 MAX_STEPS=40 \
                  ENABLE_CKPT=1 MOCK_DATA=true \
                  bash /workspace/deep_learning_examples/training/Megatron-LM/llm/llama/run_meg_lm_llama2_70b_bf16.sh "
              env:
                # Downward API: expose the node and pod names to the process.
                - name: NODE_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: spec.nodeName
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: TZ
                  value: CST-8
              # Requests are set equal to limits for every resource.
              resources:
                limits:
                  cpu: "80"
                  memory: 1000Gi
                  nvidia.com/gpu: "8"
                  rdma/hca_shared_devices_all: "1"
                requests:
                  cpu: "80"
                  memory: 1000Gi
                  nvidia.com/gpu: "8"
                  rdma/hca_shared_devices_all: "1"
              securityContext:
                capabilities:
                  add:
                    # Presumably needed to pin memory for RDMA - TODO confirm.
                    - IPC_LOCK
              volumeMounts:
                - mountPath: /dev/shm
                  name: dev-shm
                - mountPath: /workspace/deep_learning_examples
                  name: deep-learning-examples
                # NOTE(review): this mount path says "zawang" while the host
                # paths under `volumes` say "wangza" - confirm the difference
                # is intentional and not a typo.
                - mountPath: /data/zawang/siflowai
                  name: siflowai
          # All volumes are hostPath, so the directories must already exist on
          # whichever node the pod lands on.
          volumes:
            - name: deep-learning-examples
              hostPath:
                path: /data/wangza/deep_learning_examples
            - name: siflowai
              hostPath:
                path: /data/wangza/siflowai
            - name: dev-shm
              hostPath:
                path: /dev/shm
          # Hard scheduling requirement: only nodes labelled
          # scitix.ai/gpu-type=h100nvlink80.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: scitix.ai/gpu-type
                        operator: In
                        values:
                          - h100nvlink80
    Worker:
      replicas: 3
      restartPolicy: Never
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
          labels:
            scitix.ai/topo-aware-in-node: "true"
        # Same pod spec as the Master, reused through a plain alias. Nothing
        # is overridden here, so the non-core `<<` merge key is not needed.
        spec: *job-spec
2 changes: 1 addition & 1 deletion thirdparty/Megatron-DeepSpeed
2 changes: 1 addition & 1 deletion thirdparty/Megatron-LM
Submodule Megatron-LM updated 1432 files
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ MODEL_ARGS=(
--swiglu
--normalization RMSNorm
--disable-bias-linear
--async-save
)

TRAINING_ARGS=(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ MODEL_ARGS=(
--swiglu
--normalization RMSNorm
--disable-bias-linear
--async-save
)

TRAINING_ARGS=(
Expand Down