@@ -921,8 +921,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     taskArgs = [
         *taskArgs,
     ]
+
+    def containerImageArg = container
+    def srunPrologue = ""
+    if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+        mounts = [
+            "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci:/scratch.trt_llm_data:ro",
+            "/home/svc_tensorrt/bloom/scripts",
+            "/home/svc_tensorrt/.cache:/root/.cache",
+        ].join(",")
+
+        def enrootImagePath = "/lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
+        containerImageArg = enrootImagePath
+
+        srunPrologue = """
+            export ENROOT_CACHE_PATH='/home/svc_tensorrt/.cache/enroot'
+
+            retry_command() {
+                local cmd=\$1
+                local max_attempts=\${2:-3}
+                local delay=\${3:-60}
+                local attempt=1
+
+                until \$cmd
+                do
+                    if ((attempt >= max_attempts))
+                    then
+                        echo "Command '\$cmd' failed after \$max_attempts attempts"
+                        return 1
+                    fi
+
+                    echo "Command '\$cmd' failed (attempt \$attempt of \$max_attempts). Retrying in \${delay}s..."
+                    sleep \$delay
+                    ((attempt++))
+                done
+            }
+
+            retry_command "enroot import -o $enrootImagePath -- docker://$container"
+        """.replaceAll("(?m)^\\s*", "")
+    }
+
     srunArgs = [
-        "--container-image=$container",
+        "--container-image=$containerImageArg",
         "--container-workdir=/home/svc_tensorrt/bloom/scripts",
         "--container-mounts=$mounts",
         "--container-env=NVIDIA_IMEX_CHANNELS"
@@ -951,6 +991,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 export NVIDIA_IMEX_CHANNELS=0
 export NVIDIA_IMEX_CHANNELS=0
 export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
+
+${srunPrologue}
+
 chmod +x $scriptRunNode
 srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
 """.replaceAll("(?m)^\\s*", "")
@@ -2718,7 +2761,7 @@ def launchTestJobs(pipeline, testFilter)
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
         "GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
@@ -2735,7 +2778,7 @@ def launchTestJobs(pipeline, testFilter)
     multiNodesSBSAConfigs = [:]
     def numMultiNodeTests = 3
     multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
+        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
     }
     fullSet += multiNodesSBSAConfigs.keySet()
27412784
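A minimal sketch of how the `retry_command` helper in the new srun prologue behaves when exercised outside the pipeline, assuming the function definition from the diff has been pasted or sourced into a bash shell. The `false`/`true` commands and the 1-second delay are illustrative stand-ins for the real `enroot import` call and the default 60-second delay; they are not part of the pipeline code.

```bash
# Illustrative only: exercises the retry_command helper from the srun prologue.
# Assumes retry_command() has already been sourced into the current bash shell.

# A command that always fails: expect 3 attempts (2 retries), then a non-zero return.
retry_command "false" 3 1 || echo "gave up after 3 attempts"

# A command that succeeds immediately: the until-loop body never runs.
retry_command "true" 3 1 && echo "succeeded on first attempt"
```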