4 files changed, +9 -18 lines changed

@@ -6,24 +6,15 @@ TRAINER_DIR=${1:-/home/$USER/local/torchtrain}

 # use envs as local overrides for convenience
 # e.g.
-# LOG_RANK=0,1 NGPU=4 SP=2 ./run_llama_train.sh
+# LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh

-MODEL=${MODEL:-"llama"}
-MODEL_CONF=${MODEL_CONF:-"debugmodel"}
 NGPU=${NGPU:-"8"}
-PP=${PP:-"1"}
-SP=${SP:-"1"}
-DP=${DP:-"-1"}

 # by default log just rank 0 output,
 LOG_RANK=${LOG_RANK:-0}

-# Change this string to a meaningful one to enable checkpoint
-CHECKPOINT_FOLDER=${CHECKPOINT_FOLDER:-""}
-# Please adjust this to a longer interval period. The unit of measurement is in steps.
-CHECKPOINT_INTERVAL=${CHECKPOINT_INTERVAL:-5}

-CONFIG_FILE=${CONFIG_FILE:-"./torchtrain/train_configs/train_config.toml"}
+CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}

 torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \
 --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
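With the model, parallelism, and checkpoint env vars removed, only NGPU, LOG_RANK, and CONFIG_FILE remain as local overrides; everything else comes from the TOML config. A minimal launch sketch (the explicit CONFIG_FILE value below just restates the new default, for illustration):

    # run with the defaults from this diff: 8 GPUs, log rank 0 only, ./train_configs/debug_model.toml
    ./run_llama_train.sh

    # override the env knobs that remain after this change
    LOG_RANK=0,1 NGPU=4 CONFIG_FILE="./train_configs/debug_model.toml" ./run_llama_train.sh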
(Next file: the JobConfig unit test.)

@@ -1,3 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 import pytest
 from torchtrain.config_manager import JobConfig

@@ -10,9 +13,7 @@ def test_command_line_args(self):

     def test_job_config_file(self):
         config = JobConfig()
-        config.parse_args(
-            ["--job.config_file", "./torchtrain/train_configs/train_config.toml"]
-        )
+        config.parse_args(["--job.config_file", "./train_configs/debug_model.toml"])
         assert config.model.name == "llama"

     def test_job_file_does_not_exist(self):
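To check the updated test locally, selecting by test name avoids guessing the test file's path (which this diff does not show); a sketch:

    # run only the config-file test from the repository root
    pytest -q -k test_job_config_file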
(Next file: deleted; its diff is not shown.)
(Next file: the TorchTrain training config TOML.)

@@ -1,6 +1,6 @@
 # TorchTrain Config.toml
 [job]
-dump_folder = "./torchtrain/outputs"
+dump_folder = "./outputs"

 [profiling]
 run_profiler = true

@@ -26,8 +26,8 @@ lr = 8e-4
 [training]
 batch_size = 8
 seq_len = 2048
-warmup_pct = 0.20
-max_norm = 1.0
+warmup_pct = 0.20  # lr scheduler warm up
+max_norm = 1.0  # grad norm clipping
 steps = 10
 data_parallel_degree = -1
 sequence_parallel_degree = 1
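Since dump_folder now points at ./outputs instead of ./torchtrain/outputs, job artifacts land relative to the directory the launcher is run from; a quick sanity check, assuming a run from the repository root:

    # whatever the job writes under [job].dump_folder should now appear here
    CONFIG_FILE="./train_configs/debug_model.toml" ./run_llama_train.sh
    ls ./outputs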