@@ -11,6 +11,7 @@

 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.fx import GraphModule

 from torchtitan import utils
 from torchtitan.checkpoint import CheckpointManager, TrainState
@@ -152,25 +153,23 @@ def loss_fn(pred, labels):
         for m in model_parts:
             # apply SPMD-style PT-D techniques
             models_parallelize_fns[model_name](m, world_mesh, parallel_dims, job_config)
-
-            # In PP, we cannot call init_weights directly because some layers are missing.
-            # In the future, we may make init_weights handle missing layers, but also have
-            # to consider RNG seed propagation. For now, we rely on a seed checkpoint to
-            # initialize the model.
             m.to_empty(device="cuda")
-            m.train()
     else:
         # apply PT-D Tensor Parallel, activation checkpointing, torch.compile, Data Parallel
         models_parallelize_fns[model_name](model, world_mesh, parallel_dims, job_config)

         # move sharded model to CPU/GPU and initialize weights via DTensor
         init_device = "cpu" if job_config.checkpoint.create_seed_checkpoint else "cuda"
         model.to_empty(device=init_device)
-        model.init_weights()
-        model.train()
-
         model_parts = [model]

+    for mod in model_parts:
+        # skip traced modules since we do not define init_weights in the traced module
+        if isinstance(mod, GraphModule):
+            continue
+        mod.init_weights()
+        mod.train()
+
     gpu_mem_stats = gpu_memory_monitor.get_peak_stats()
     logger.info(
         f"GPU memory usage for model: "
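The new loop above calls init_weights() and train() on every model part but skips torch.fx.GraphModule instances: when pipeline stages come from tracing, the resulting GraphModule only carries the traced forward(), not custom methods defined on the original model class. A minimal sketch (not part of this commit; TinyModel is a made-up module) illustrating why the isinstance check is needed:

```python
import torch.nn as nn
from torch.fx import GraphModule, symbolic_trace


class TinyModel(nn.Module):
    """Stand-in for a model that defines its own weight-init method."""

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)

    def init_weights(self):
        nn.init.zeros_(self.proj.weight)

    def forward(self, x):
        return self.proj(x)


model = TinyModel()
traced = symbolic_trace(model)  # returns a GraphModule wrapping the traced forward()

print(isinstance(traced, GraphModule))  # True
print(hasattr(model, "init_weights"))   # True
print(hasattr(traced, "init_weights"))  # False: custom methods are not carried over
```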
@@ -205,9 +204,10 @@ def loss_fn(pred, labels):
     checkpoint_loaded = checkpoint.load()

     if parallel_dims.pp_enabled and not checkpoint_loaded:
-        raise RuntimeError(
-            "Pipeline Parallelism requires meta-initialization and loading seed checkpoint. "
-            "Please run `./create_seed_checkpoint.sh` and rerun training with `--checkpoint.enable_checkpoint`"
+        # TODO: fix this by allowing each rank to set their own seed
+        logger.warning(
+            "Pipeline Parallelism is being used without a seed checkpoint. "
+            "All the substages will be initialized with random weights with the same RNG state, which can affect convergence."
         )

     metric_logger = build_metric_logger(job_config, parallel_dims)
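The TODO in the hunk above points at the follow-up: instead of every pipeline stage sharing the default RNG state at init time, each stage could fold its pipeline rank into the seed before init_weights() runs. A hedged sketch of that idea, assuming the device mesh has a dimension named "pp" and that DeviceMesh.get_local_rank accepts a dimension name (the function and parameter names here are illustrative, not this commit's API):

```python
import torch


def seed_per_pp_rank(base_seed: int, world_mesh, pp_enabled: bool) -> None:
    # Offset the seed by the pipeline rank so each stage draws different random
    # weights; ranks within a stage keep the same seed so that DTensor-based
    # initialization stays consistent across the DP/TP dimensions.
    pp_rank = world_mesh.get_local_rank("pp") if pp_enabled else 0
    torch.manual_seed(base_seed + pp_rank)
    torch.cuda.manual_seed_all(base_seed + pp_rank)
```

With something along these lines applied before the init_weights() loop, the warning path could eventually be removed.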