
Commit 73482d7

use warmup steps for lr scheduler, ban steps == -1
As titled: we don't want to allow the steps == -1 case, as it would blow up the lr scheduler.
1 parent 96d1cb1 commit 73482d7


4 files changed, +10 -15 lines


torchtrain/config_manager.py

Lines changed: 5 additions & 5 deletions
@@ -151,10 +151,10 @@ def init_args_from_command_line(
         "--training.seq_len", type=int, default=2048, help="sequence length"
     )
     parser.add_argument(
-        "--training.warmup_pct",
-        type=float,
-        default=0.20,
-        help="percentage of total training steps to use for warmup",
+        "--training.warmup_steps",
+        type=int,
+        default=200,
+        help="steps for lr scheduler warmup",
     )
     parser.add_argument(
         "--training.max_norm",
@@ -163,7 +163,7 @@ def init_args_from_command_line(
         help="max norm for gradient clipping",
     )
     parser.add_argument(
-        "--training.steps", type=int, default=-1, help="how many train steps to run"
+        "--training.steps", type=int, default=10000, help="how many train steps to run"
     )
     parser.add_argument(
         "--training.data_parallel_degree",

torchtrain/lr_scheduling.py

Lines changed: 2 additions & 4 deletions
@@ -6,7 +6,7 @@
 
 # global states for scheduling
 # these are needed as LambdaLR does not support argument passing
-_warmup_steps = 2
+_warmup_steps = 200
 _decay_steps = 0
 
 
@@ -33,9 +33,7 @@ def linear_warmup_linear_decay(current_step: int) -> float:
 def get_lr_scheduler(optimizer, job_config: JobConfig):
     """Build a linear warmup and linear decay scheduler"""
     global _warmup_steps, _decay_steps
-    _warmup_steps = max(
-        int(job_config.training.steps * job_config.training.warmup_pct), 2
-    )
+    _warmup_steps = int(job_config.training.warmup_steps)
     _decay_steps = float(max(1, job_config.training.steps - _warmup_steps))
 
     warmup_scheduler = LambdaLR(optimizer, lr_lambda=linear_warmup_linear_decay)
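
The lambda itself is unchanged by this commit and not shown in the hunk. As a rough sketch of the shape those two globals feed, not necessarily the repo's exact formula, a linear-warmup-then-linear-decay multiplier could look like this:

# module-level globals, as in the diff above (placeholder values)
_warmup_steps = 200
_decay_steps = 9800.0

def linear_warmup_linear_decay(current_step: int) -> float:
    """Return an lr multiplier: ramp up over _warmup_steps, then decay
    linearly toward zero over _decay_steps. Sketch only."""
    if current_step < _warmup_steps:
        # +1 so the very first step gets a non-zero learning rate
        return float(current_step + 1) / float(max(1, _warmup_steps))
    progress = float(current_step - _warmup_steps) / _decay_steps
    return max(0.0, 1.0 - progress)

Note that with the old default of training.steps == -1, the _decay_steps line in the hunk above clamps to 1, so any multiplier of this shape drops to zero one step after warmup; banning -1 keeps the decay horizon meaningful.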

train.py

Lines changed: 2 additions & 5 deletions
@@ -187,10 +187,7 @@ def main(job_config: JobConfig):
     losses_since_last_log: List[float] = []
     nwords_since_last_log = 0
     time_last_log = timer()
-    while (
-        train_state.step < job_config.training.steps
-        or job_config.training.steps == -1
-    ):
+    while train_state.step < job_config.training.steps:
         train_state.step += 1
         # get batch
         data_load_start = timer()
@@ -220,7 +217,7 @@
 
         # clip gradients (after unscaling gradients of the optimizer's params)
         scaler.unscale_(optimizer)
-        model.clip_grad_norm_(job_config.training.max_norm)
+        # model.clip_grad_norm_(job_config.training.max_norm)
 
         # optimizer step
         # If gradients don't contain infs/NaNs, optimizer.step() is then called;
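
For orientation, here is a bare-bones sketch of how the simplified loop condition, gradient clipping, and the warmup/decay schedule fit together. The toy model, optimizer, and numbers are placeholders; the real loop also handles data loading, mixed-precision scaling, logging, and checkpointing.

import torch
from torch.optim.lr_scheduler import LambdaLR

# Placeholder model/optimizer for illustration only.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=8e-4)

warmup_steps, total_steps, max_norm = 200, 10000, 1.0
decay_steps = max(1, total_steps - warmup_steps)

def lr_lambda(step: int) -> float:
    # linear warmup then linear decay, in the spirit of get_lr_scheduler above
    if step < warmup_steps:
        return (step + 1) / warmup_steps
    return max(0.0, 1.0 - (step - warmup_steps) / decay_steps)

scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

step = 0
while step < total_steps:  # finite budget; steps == -1 is no longer allowed
    step += 1
    optimizer.zero_grad()
    loss = model(torch.randn(4, 16)).square().mean()
    loss.backward()
    # plain eager clipping for the toy model; the commit above comments out
    # the model-level clip_grad_norm_ call in the real loop
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    scheduler.step()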

train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ lr = 8e-4
 [training]
 batch_size = 8
 seq_len = 2048
-warmup_pct = 0.20 # lr scheduler warm up
+warmup_steps = 5 # lr scheduler warm up
 max_norm = 1.0 # grad norm clipping
 steps = 10
 data_parallel_degree = -1
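
The [training] keys mirror the --training.* flags shown above, so the debug config just needs warmup_steps to stay within its tiny 10-step budget. A quick sketch of reading that table with Python's stdlib tomllib (3.11+); the loading code here is hypothetical and not necessarily how the repo consumes its configs.

import tomllib  # stdlib in Python 3.11+

with open("train_configs/debug_model.toml", "rb") as f:
    cfg = tomllib.load(f)

training = cfg["training"]
# For the debug config above: warmup_steps=5, steps=10
assert training["warmup_steps"] <= training["steps"]
print(training["warmup_steps"], training["steps"])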
