From 35cec19e765853dbe5faad710bbada164262eb28 Mon Sep 17 00:00:00 2001
From: gnadathur
Date: Wed, 28 Feb 2024 09:35:13 -0800
Subject: [PATCH 1/2] Enable libUV for torchtrain

---
 run_llama_train.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/run_llama_train.sh b/run_llama_train.sh
index 2d15496464..3c9c1dd453 100755
--- a/run_llama_train.sh
+++ b/run_llama_train.sh
@@ -2,6 +2,7 @@
 
 set -ex
 
+export USE_LIBUV=1
 TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
 
 # use envs as local overrides for convenience

From 9066b6cdab7f94f593e04ef1dc72f5cde59a4b27 Mon Sep 17 00:00:00 2001
From: gnadathur
Date: Wed, 28 Feb 2024 11:47:27 -0800
Subject: [PATCH 2/2] add comment for the env variable

---
 run_llama_train.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/run_llama_train.sh b/run_llama_train.sh
index 3c9c1dd453..13b66aeace 100755
--- a/run_llama_train.sh
+++ b/run_llama_train.sh
@@ -2,6 +2,8 @@
 
 set -ex
 
+# libUV is a scalable backend for TCPStore which is used in processGroup
+# rendezvous. This is the recommended backend for distributed training.
 export USE_LIBUV=1
 TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
 