
Commit 470d777

[TRTINFRA-7280][infra] Support enroot/pyxis clusters in multi-node SLURM and enable oci-hsg GB200 in post-merge (#9117)
Signed-off-by: Matt Lefebvre <[email protected]>
1 parent df41f22 commit 470d777
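
For context: the pyxis SLURM plugin lets srun launch a job step inside a container, either directly from a registry image reference or from a squashfs file pre-imported with enroot; a minimal sketch of both forms (the image name and paths are illustrative, not the pipeline's values):

    # Form 1: pass an image reference and let pyxis import it on the fly.
    srun --container-image=nvcr.io/nvidia/pytorch:24.05-py3 nvidia-smi

    # Form 2: pre-import the image to a squashfs file with enroot, then point pyxis at it.
    enroot import -o /tmp/pytorch.sqsh -- docker://nvcr.io/nvidia/pytorch:24.05-py3
    srun --container-image=/tmp/pytorch.sqsh nvidia-smi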


jenkins/L0_Test.groovy

Lines changed: 46 additions & 3 deletions
@@ -921,8 +921,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     taskArgs = [
         *taskArgs,
     ]
+
+    def containerImageArg = container
+    def srunPrologue = ""
+    if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+        mounts = [
+            "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
+            "/home/svc_tensorrt/bloom/scripts",
+            "/home/svc_tensorrt/.cache:/root/.cache",
+        ].join(",")
+
+        def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
+        containerImageArg = enrootImagePath
+
+        srunPrologue = """
+            export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
+
+            retry_command() {
+                local cmd=\$1
+                local max_attempts=\${2:-3}
+                local delay=\${3:-60}
+                local attempt=1
+
+                until \$cmd
+                do
+                    if ((attempt >= max_attempts))
+                    then
+                        echo "Command '\$cmd' failed after \$max_attempts attempts"
+                        return 1
+                    fi
+
+                    echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
+                    sleep \$delay
+                    ((attempt++))
+                done
+            }
+
+            retry_command "enroot import -o $enrootImagePath -- docker://$container"
+        """.replaceAll("(?m)^\\s*", "")
+    }
+
     srunArgs = [
-        "--container-image=$container",
+        "--container-image=$containerImageArg",
         "--container-workdir=/home/svc_tensorrt/bloom/scripts",
         "--container-mounts=$mounts",
         "--container-env=NVIDIA_IMEX_CHANNELS"
@@ -951,6 +991,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     export NVIDIA_IMEX_CHANNELS=0
     export NVIDIA_IMEX_CHANNELS=0
     export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+
+    ${srunPrologue}
+
     chmod +x $scriptRunNode
     srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
 """.replaceAll("(?m)^\\s*", "")
@@ -2718,7 +2761,7 @@ def launchTestJobs(pipeline, testFilter)
     // Disable GB300 stages due to nodes will be offline temporarily.
     // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
     "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-    "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
+    "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
     // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
 ]
 fullSet += SBSASlurmTestConfigs.keySet()
@@ -2735,7 +2778,7 @@ def launchTestJobs(pipeline, testFilter)
 multiNodesSBSAConfigs = [:]
 def numMultiNodeTests = 3
 multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-    ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
+    ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
 }
 fullSet += multiNodesSBSAConfigs.keySet()
