pytorch · wanchaol · Feb 29, 2024 · Feb 28, 2024
@@ -9,7 +9,7 @@ class TestJobConfig:
     def test_command_line_args(self):
         config = JobConfig()
         config.parse_args([])
-        assert config.training.steps == -1
+        assert config.training.steps == 10000
 
     def test_job_config_file(self):
         config = JobConfig()

@@ -151,10 +151,10 @@ def init_args_from_command_line(
             "--training.seq_len", type=int, default=2048, help="sequence length"
         )
         parser.add_argument(
-            "--training.warmup_pct",
-            type=float,
-            default=0.20,
-            help="percentage of total training steps to use for warmup",
+            "--training.warmup_steps",
+            type=int,
+            default=200,
+            help="steps for lr scheduler warmup",
         )
         parser.add_argument(
             "--training.max_norm",
@@ -163,7 +163,10 @@ def init_args_from_command_line(
             help="max norm for gradient clipping",
         )
         parser.add_argument(
-            "--training.steps", type=int, default=-1, help="how many train steps to run"
+            "--training.steps",
+            type=int,
+            default=10000,
+            help="how many train steps to run",
         )
         parser.add_argument(
             "--training.data_parallel_degree",

@@ -1,3 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 
@@ -6,7 +9,7 @@
 
 # global states for scheduling
 # these are needed as LambdaLR does not support argument passing
-_warmup_steps = 2
+_warmup_steps = 200
 _decay_steps = 0
 
 
@@ -33,9 +36,7 @@ def linear_warmup_linear_decay(current_step: int) -> float:
 def get_lr_scheduler(optimizer, job_config: JobConfig):
     """Build a linear warmup and linear decay scheduler"""
     global _warmup_steps, _decay_steps
-    _warmup_steps = max(
-        int(job_config.training.steps * job_config.training.warmup_pct), 2
-    )
+    _warmup_steps = int(job_config.training.warmup_steps)
     _decay_steps = float(max(1, job_config.training.steps - _warmup_steps))
 
     warmup_scheduler = LambdaLR(optimizer, lr_lambda=linear_warmup_linear_decay)

@@ -187,10 +187,7 @@ def main(job_config: JobConfig):
         losses_since_last_log: List[float] = []
         nwords_since_last_log = 0
         time_last_log = timer()
-        while (
-            train_state.step < job_config.training.steps
-            or job_config.training.steps == -1
-        ):
+        while train_state.step < job_config.training.steps:
             train_state.step += 1
             # get batch
             data_load_start = timer()

@@ -26,7 +26,7 @@ lr = 8e-4
 [training]
 batch_size = 8
 seq_len = 2048
-warmup_pct = 0.20  # lr scheduler warm up
+warmup_steps = 2  # number of lr scheduler warmup steps, normally 20% of the train steps
 max_norm = 1.0  # grad norm clipping
 steps = 10
 data_parallel_degree = -1