Commit 3216f3e

Manual PP
runs PP+DP and PP+TP without issue; runs PP+TP+DP with decreasing loss, but fails DCP save

TODOs:
- clean up manual stage creation
- config options for configuring the stage split
- a way to switch between tracer/manual

ghstack-source-id: 952f364
Pull Request resolved: #308
1 parent 9778897 commit 3216f3e

3 files changed: 180 additions & 18 deletions

3 files changed

+180
-18
lines changed

torchtitan/models/llama/model.py

Lines changed: 2 additions & 1 deletion
@@ -76,7 +76,7 @@ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
     """
     ndim = x.ndim
     assert 0 <= 1 < ndim
-    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1]), (freqs_cis.shape, x.shape)
     shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
     return freqs_cis.view(*shape)

@@ -182,6 +182,7 @@ def forward(
             torch.Tensor: Output tensor after attention.

         """
+        print(f"transformer layer got input shape {x.shape}")
         bs, seqlen, _ = x.shape
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
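
The only functional change above is debug-oriented: the shape assert in reshape_for_broadcast now reports both shapes when it fails, which matters once PP/TP can change the sequence length a stage actually sees. A minimal, self-contained illustration of the failure mode (toy shapes and a hypothetical check_freqs_cis helper, not torchtitan code):

import torch


def check_freqs_cis(freqs_cis: torch.Tensor, x: torch.Tensor) -> None:
    # Hypothetical helper mirroring the check in reshape_for_broadcast:
    # freqs_cis must match (seqlen, head_dim) of the activations x.
    assert freqs_cis.shape == (x.shape[1], x.shape[-1]), (freqs_cis.shape, x.shape)


x = torch.randn(2, 1024, 8, 64)    # (bs, seqlen, n_heads, head_dim)
freqs_cis = torch.randn(2048, 64)  # precomputed for a full 2048-token context

try:
    check_freqs_cis(freqs_cis, x)
except AssertionError as err:
    # With the new assert message you see the offending shapes instead of a bare error:
    # (torch.Size([2048, 64]), torch.Size([2, 1024, 8, 64]))
    print(err)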

torchtitan/parallelisms/parallelize_llama.py

Lines changed: 112 additions & 2 deletions
@@ -8,7 +8,7 @@
 # llama model, i.e. activation checkpointing, etc.

 from collections import defaultdict
-from typing import Tuple
+from typing import List, Tuple

 import torch

@@ -138,7 +138,112 @@ def get_tp_parallel_strategy(
     return RowwiseParallel, ColwiseParallel


-def apply_pipeline_parallelism(model, world_mesh, parallel_dims, job_config: JobConfig):
+class DummyTransformerLayer(torch.nn.Module):
+    def forward(self, input, freqs_cis):
+        return input
+
+
+class TransformerChunk(torch.nn.Module):
+    def __init__(
+        self,
+        orig_model,  # : Transformer,
+        this_stage_layer_names: List[str],
+        device,
+        input_seqlen: int,
+    ):
+        super().__init__()
+        self.tok_embeddings = None
+
+        # inferring seqlen from forward(input) only works on stage0, because on later stages
+        # the hidden-state input may have reduced seqlen due to TP. We need to use the
+        # original (full) seqlen for freqs_cis to be correct.
+        self.input_seqlen = input_seqlen
+
+        if "tok_embeddings" in this_stage_layer_names:
+            self.tok_embeddings = orig_model.tok_embeddings
+
+        with torch.device(device):
+            self.freqs_cis = orig_model._precompute_freqs_cis()
+
+        # preserve FQNs of the original model by preserving its structure
+        # (including each layer's position in the layers[] list) - use a dummy module
+        self.layers = orig_model.layers
+        for i in range(len(self.layers)):
+            if f"layers.{i}" not in this_stage_layer_names:
+                self.layers[i] = DummyTransformerLayer()
+        self.norm = None
+        if "norm" in this_stage_layer_names:
+            self.norm = orig_model.norm
+        self.output = None
+        if "output" in this_stage_layer_names:
+            self.output = orig_model.output
+
+    def forward(self, input):
+        """
+        Copy-paste of the original Transformer.forward, with conditionals added
+        so that we handle the cases where this rank doesn't have the embedding
+        or the output layers.
+        """
+        if self.tok_embeddings:
+            h = self.tok_embeddings(input)
+        else:
+            h = input
+
+        freqs_cis = self.freqs_cis[0 : self.input_seqlen]
+
+        for layer in self.layers:
+            h = layer(h, freqs_cis)
+        output = h
+
+        if self.norm:
+            h = self.norm(h)
+            output = h
+
+        if self.output:
+            output = self.output(h).float()
+        return output
+
+
+def apply_pipeline_parallelism_manual(
+    model, world_mesh, parallel_dims, job_config: JobConfig, device
+):
+    """
+    This API returns individual torch.nn.Module objects for each pipeline stage (including virtual stages).
+
+    The SPMD parallelisms should be applied separately to each returned stage module.
+    """
+    pp_mesh = world_mesh["pp"]
+    pp_rank = pp_mesh.get_local_rank()
+    pp_size = pp_mesh.size()
+    stage_idx = pp_rank  # TODO support virtual stages
+    layers_per_rank = len(model.layers) // parallel_dims.pp
+    layer_offset = layers_per_rank * pp_rank
+    this_stage_layer_names = [
+        f"layers.{i + layer_offset}" for i in range(layers_per_rank)
+    ]
+    if pp_rank == 0:
+        this_stage_layer_names.insert(0, "tok_embeddings")
+        assert "layers.0" in this_stage_layer_names
+    elif pp_rank == pp_size - 1:
+        this_stage_layer_names.append("norm")
+        this_stage_layer_names.append("output")
+        assert "layers.1" in this_stage_layer_names
+
+    input_seqlen = 2048  # TODO hack
+
+    # Create a pipeline stage representation from the model
+    stage_model = TransformerChunk(model, this_stage_layer_names, device, input_seqlen)
+
+    # note for the PiPPy API:
+    # it would be nice if we could get fx.graph out of PipeInfo and make it possible to manually
+    # construct PipeInfo, so the same _PipelineStage ctor could be used in both tracer and manual cases.
+
+    return (stage_model,)
+
+
+def apply_pipeline_parallelism_tracer(
+    model, world_mesh, parallel_dims, job_config: JobConfig
+):
     assert (
         parallel_dims.pp_enabled
     ), "can't apply pipeline parallelism if it is not enabled"
@@ -212,6 +317,8 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):

         # Apply tensor + sequence parallelism to every transformer block
         for layer_id, transformer_block in enumerate(model.layers):
+            if isinstance(transformer_block, DummyTransformerLayer):
+                continue
             layer_plan = {
                 "attention": PrepareModuleInput(
                     input_layouts=(Shard(1), None),
@@ -259,6 +366,8 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
         ac_mode = job_config.activation_checkpoint.mode
         fsdp_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
         for layer_name, transformer_block in model.layers.named_children():
+            if isinstance(transformer_block, DummyTransformerLayer):
+                continue
             if job_config.activation_checkpoint.mode in ("full", "selective"):
                 transformer_block = checkpoint_wrapper(
                     transformer_block, job_config.activation_checkpoint
@@ -275,6 +384,7 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
             )
             model.layers.add_module(layer_name, transformer_block)

+        # TODO(whc) do we need a reshard_after_forward setting here too?
         model = fully_shard(model, **fsdp_config)
         if ac_mode in ("full", "selective"):
            logger.info(f"Applied {ac_mode} activation checkpointing to the model")

train.py

Lines changed: 66 additions & 15 deletions
@@ -22,15 +22,17 @@

 # TODO(whc) this can be removed after pippy migration into pytorch core is complete.
 try:
-    from pippy import ScheduleGPipe
-    from pippy.PipelineStage import _PipelineStage
+    from pippy import ManualPipelineStage, ScheduleGPipe
+
+    # from pippy.PipelineStage import _PipelineStage
 except ImportError as exc:
     raise ImportError(
         "pippy is not installed. Please install it to use pipeline parallelism. "
         "`pip install git+https://github.com/pytorch/pippy`"
     ) from exc

 from torch.distributed import destroy_process_group
+from torch.distributed._composable.fsdp.fully_shard import FSDPModule
 from torch.distributed.checkpoint.stateful import Stateful
 from torch.distributed.elastic.multiprocessing.errors import record
 from torch.distributed.tensor.parallel import loss_parallel
@@ -224,28 +226,70 @@ def loss_fn(pred, labels):

     if parallel_dims.pp_enabled:
         # TODO(whc) now i need to figure out how to align this with the `models_parallelize_fns[model_name]` pattern
-        from torchtitan.parallelisms.parallelize_llama import apply_pipeline_parallelism
+        from torchtitan.parallelisms.parallelize_llama import (
+            apply_pipeline_parallelism_manual,
+        )

-        model, pipe_info = apply_pipeline_parallelism(
-            model, world_mesh, parallel_dims, job_config
+        stage_models = apply_pipeline_parallelism_manual(
+            model, world_mesh, parallel_dims, job_config, device
         )
+        stage_models = [
+            models_parallelize_fns[model_name](
+                model, world_mesh, parallel_dims, job_config
+            )
+            for model in stage_models
+        ]
+        # TODO virtual stages NYI
+        model = stage_models[0]

-    # apply PT-D DP/TP parallelisms and activation checkpointing
-    model = models_parallelize_fns[model_name](
-        model, world_mesh, parallel_dims, job_config
-    )
+    else:
+        # apply PT-D DP/TP parallelisms and activation checkpointing
+        model = models_parallelize_fns[model_name](
+            model, world_mesh, parallel_dims, job_config
+        )

     model.to_empty(device="cuda")

     # TODO(whc) everything below needs to become a function that can be applied to each 'virtual stage' of PP,
     # if there are virtual stages
     if parallel_dims.pp_enabled:
-        stage = _PipelineStage(
-            stage_module=model,
-            stage_index=pp_rank,
-            pipe_info=pipe_info,
-            device=device,
-            group=pp_mesh.get_group(),
+        # stage = _PipelineStage(
+        #     stage_module=model,
+        #     stage_index=pp_rank,
+        #     pipe_info=pipe_info,
+        #     device=device,
+        #     group=pp_mesh.get_group(),
+        # )
+        assert len(stage_models) == 1, "virtual stages NYI"
+        chunks = parallel_dims.pp
+        pp_mesh = world_mesh["pp"]
+        pp_rank = pp_mesh.get_local_rank()
+        pp_size = pp_mesh.size()
+        stage_idx = pp_rank  # TODO support virtual stages
+        # Get an example input for stage shape inference
+        if pp_rank == 0:
+            input_shape = (job_config.training.batch_size, job_config.training.seq_len)
+            input_ids = torch.randint(
+                model_config.vocab_size, input_shape, dtype=torch.int64, device="meta"
+            )
+        else:
+            # TODO(whc) can we rely on shape inference so that the user doesn't have to compute the TP impact on seq_len?
+            input_shape = (
+                job_config.training.batch_size,
+                int(job_config.training.seq_len // parallel_dims.tp),
+                model_config.dim,
+            )
+            input_ids = torch.randint(
+                model_config.vocab_size, input_shape, dtype=torch.float32, device="meta"
+            )
+        stage = ManualPipelineStage(
+            model,
+            pp_rank,
+            pp_size,
+            device,
+            chunks,
+            input_args=input_ids.chunk(chunks)[0],
+            group=pp_mesh.get_group("pp"),
         )
         pp_schedule = ScheduleGPipe(
             stage,
@@ -259,6 +303,9 @@ def loss_fn(pred, labels):
     # because it can't find the "embedding" layer, for example.

     # allocate sharded model on GPU and initialize weights via DTensor
+
+    # if we were to rewrite init_weights to work on the pp-model, we could call it unconditionally here,
+    # but that would not free us from needing seed-checkpoint init
     model.init_weights()

     gpu_mem_stats = gpu_memory_monitor.get_peak_stats()
@@ -268,6 +315,10 @@ def loss_fn(pred, labels):
         f"({gpu_mem_stats.max_reserved_pct:.2f}%)"
     )

+    if isinstance(model, FSDPModule) and parallel_dims.pp_enabled:
+        # reshard now to counteract an issue where FSDP's states got advanced during PP stage shape inference
+        model.reshard()
+
     # build optimizer after applying parallelisms to the model
     optimizer = build_optimizer(model, job_config)
     scheduler = get_lr_scheduler(optimizer, job_config)
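
ManualPipelineStage above is given a meta-device example input for shape inference, and its shape depends on the stage: stage 0 takes token ids at the full (batch, seq_len), while later stages take hidden states whose sequence dimension has already been divided by the TP degree (hence the TODO about computing the TP impact on seq_len). A short sketch of that shape arithmetic with made-up config values (illustrative numbers, not torchtitan defaults):

import torch

# Illustrative values standing in for job_config.training, parallel_dims and model_config.
batch_size, seq_len = 8, 2048
tp, pp = 2, 2                  # tensor-parallel and pipeline-parallel degrees
dim, vocab_size = 4096, 32000
chunks = pp                    # number of pipeline microbatches, as in the diff above

# Stage 0 consumes token ids at the full sequence length.
stage0_input = torch.randint(
    vocab_size, (batch_size, seq_len), dtype=torch.int64, device="meta"
)

# Later stages consume hidden states; with sequence parallelism the sequence
# dimension between blocks is already sharded across the TP group.
later_input = torch.empty(
    batch_size, seq_len // tp, dim, dtype=torch.float32, device="meta"
)

# The stage is handed one microbatch worth of input for shape inference,
# so the batch dimension is divided by the number of chunks.
print(stage0_input.chunk(chunks)[0].shape)  # torch.Size([4, 2048])
print(later_input.chunk(chunks)[0].shape)   # torch.Size([4, 1024, 4096])

Because chunks equals the pp degree in this commit, each microbatch carries batch_size / pp samples.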
