pytorch · lessw2020 · Feb 22, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024
@@ -24,6 +24,9 @@ class ModelArgs:
 
     max_batch_size: int = 32
     max_seq_len: int = 32768
+    depth_init: bool = (
+        True  # True: weight-init std scales with each layer's depth (layer_id + 1); False: with the total layer count (n_layers)
+    )
 
 
 class RMSNorm(torch.nn.Module):
@@ -392,7 +395,11 @@ def __init__(self, layer_id: int, model_args: ModelArgs):
         self.num_layers = model_args.n_layers
         self.attention_norm = RMSNorm(model_args.dim, eps=model_args.norm_eps)
         self.ffn_norm = RMSNorm(model_args.dim, eps=model_args.norm_eps)
-        self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5
+
+        if model_args.depth_init:
+            self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
+        else:
+            self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5
 
     def forward(
         self,

@@ -207,12 +207,14 @@ def main(args):
 
             # log metrics
             if (train_state.step - 1) % args.log_freq == 0:
-                avg_loss, max_loss = np.mean(losses_since_last_log), np.max(
-                    losses_since_last_log
+                avg_loss, max_loss = (
+                    np.mean(losses_since_last_log),
+                    np.max(losses_since_last_log),
+                )
+                global_avg_loss, global_max_loss = (
+                    dist_mean(avg_loss, world_mesh),
+                    dist_max(max_loss, world_mesh),
                 )
-                global_avg_loss, global_max_loss = dist_mean(
-                    avg_loss, world_mesh
-                ), dist_max(max_loss, world_mesh)
 
                 time_delta = timer() - time_last_log
                 wps = nwords_since_last_log / (
@@ -239,7 +241,8 @@ def main(args):
                 time_last_log = timer()
 
             rank0_log(
-                f"step: {train_state.step}, current loss: {train_state.current_loss}, lr: {scheduler.get_last_lr()}"
+                f"step: {train_state.step},  current loss: {round(train_state.current_loss,4)},"
+                f"  lr: {round(float(scheduler.get_last_lr()[0]), 8)}"
             )
             scheduler.step()