This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit e33e681

[WIP][Distributed] Add lanes to KV cache

1 parent 8d01d9b

File tree: 2 files changed (+47 -36 lines)

dist_run.py

Lines changed: 35 additions & 26 deletions
@@ -273,13 +273,11 @@ def main(args):
     pp_rank = pp_mesh.get_local_rank()
     tp_group = tp_mesh.get_group()
     pp_group = pp_mesh.get_group()
-    pp_group_size = pp_group.size()
-    tp_group_size = tp_group.size()
-    logger.info(f"{pp_group_size=}, {tp_group_size=}")
+    logger.info(f"{pp_degree=}, {tp_degree=}")

     # Convenience variables
     first_pp_rank = 0
-    last_pp_rank = pp_group_size - 1
+    last_pp_rank = pp_degree - 1

     # Assuming same number of GPUs per node
     device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
@@ -292,23 +290,28 @@ def main(args):
     # TODO: we should create model instead of Transformer
     model = Transformer(config)

-    # Distribute model on TP mesh
-    model.distribute(tp_mesh)
     if rank == 0:
         logger.info(f"Model: {model}")

-    mbs = 1  # number of micro-batches
-    mb_size = 4  # micro-batch size
-    batch_size = mbs * mb_size  # total batch size
+    # Distribute model on TP mesh
+    model.distribute(tp_mesh)

+    # Batch size. Since we push batches dynamically through the pipeline rather
+    # than chunking them, this is effectively the micro-batch size in the
+    # pipeline sense. Thus it is interchangeable with the micro-batch size below.
+    batch_size = 4
     seqlen_prefill = 1024  # sequence length
     dim = 4096  # embedding dimension

     # Setup KV caches (after model distribution)
-    # TODO: the setting below only works for 1 micro-batch case. To support
-    # multiple micro-batches, we need the KV cache in the model to be aware of
-    # the number of micro-batches and the current micro-batch index.
-    model.setup_caches(mb_size, seqlen_prefill)
+    # The number of cache lanes is the same as the maximum number of
+    # micro-batches that can be "in flight" in parallel -- imagine each
+    # micro-batch takes one "pipeline lane"; they need distinct KV cache spaces.
+    # When decoding is done for certain micro-batches, we can reuse the KV cache
+    # lanes.
+    # TODO: bump up the lane count
+    cache_lanes = 1
+    model.setup_caches(batch_size, seqlen_prefill, cache_lanes=cache_lanes)

     # Load weights
     logger.info(f"Loading weights for {pp_rank=} on {device=}")
@@ -317,7 +320,7 @@ def main(args):
     model.to(device)

     logger.info(
-        f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for stage {rank}{color.reset}"
+        f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
     )

     # info on stage size and params
@@ -335,12 +338,12 @@

     # Helper function to get example inputs and outputs for the stages.
     def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
-        mb_ids = torch.randint(0, config.vocab_size, (mb_size, seqlen), device=device)
+        mb_ids = torch.randint(0, config.vocab_size, (batch_size, seqlen), device=device)
         activation = torch.rand(
-            mb_size, seqlen, dim, device=device, dtype=model_dtype
+            batch_size, seqlen, dim, device=device, dtype=model_dtype
         )
         logits = torch.rand(
-            mb_size, seqlen, config.vocab_size, device=device, dtype=model_dtype
+            batch_size, seqlen, config.vocab_size, device=device, dtype=model_dtype
         )
         example_inputs = (mb_ids if pp_rank == first_pp_rank else activation,)
         example_outputs = (logits if pp_rank == last_pp_rank else activation,)
@@ -358,8 +361,13 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         output_args=example_outputs,
         group=pp_group,
     )
+    # Number of micro-batches for the schedule is 1, because each step() call
+    # only pushes 1 micro-batch into the pipeline. But we can continuously push
+    # new micro-batches into the pipeline as they arrive, achieving the same
+    # pipelining effect.
+    mbs = 1
     # create schedule
-    prefill_schedule = ScheduleGPipe(prefill_stage, mbs)
+    prefiller = ScheduleGPipe(prefill_stage, mbs)

     prompt = [
         "What is a computer?",
@@ -401,14 +409,15 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
     num_tokens = 40

     # Prefill phase
-    # Run context input through pipeline, in 1 step
+    # Run context input through pipeline
+    # TODO: we need to pass `input_pos` and `cache_lane` to each stage.
     with torch.no_grad():
         if pp_rank == first_pp_rank:
-            output = prefill_schedule.step(padded_sequence)
+            output = prefiller.step(padded_sequence)
         elif pp_rank == last_pp_rank:
-            output = prefill_schedule.step()
+            output = prefiller.step()
         else:  # middle pp ranks
-            prefill_schedule.step()
+            prefiller.step()

     # Decode the output -- first generated token
     if pp_rank == last_pp_rank:
@@ -445,7 +454,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         group=pp_group,
     )
     # create schedule
-    decode_schedule = ScheduleGPipe(decode_stage, mbs)
+    decoder = ScheduleGPipe(decode_stage, mbs)

     # Decoding
     with torch.no_grad():
@@ -467,11 +476,11 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:

         # Run data through pipeline
         if pp_rank == first_pp_rank:
-            output = decode_schedule.step(new_token)
+            output = decoder.step(new_token)
         elif pp_rank == last_pp_rank:
-            output = decode_schedule.step()
+            output = decoder.step()
         else:  # middle pp ranks
-            decode_schedule.step()
+            decoder.step()

         # Decode the output
         if pp_rank == last_pp_rank:

torchchat/model.py

Lines changed: 12 additions & 10 deletions
@@ -606,7 +606,7 @@ def __init__(self, config: TransformerArgs) -> None:
         self.max_batch_size = -1
         self.max_seq_length = -1

-    def setup_caches(self, max_batch_size, max_seq_length):
+    def setup_caches(self, max_batch_size, max_seq_length, cache_lanes: int = 1):
         if (
             self.max_seq_length >= max_seq_length
             and self.max_batch_size >= max_batch_size
@@ -620,7 +620,7 @@ def setup_caches(self, max_batch_size, max_seq_length):
             # parallelism may have been applied there and the `n_local_heads`
             # value being adjusted.
             b.attention.setup_cache(
-                max_batch_size, max_seq_length,
+                max_batch_size, max_seq_length, cache_lanes=cache_lanes
             )

         freqs_cis = precompute_freqs_cis(
@@ -658,7 +658,7 @@ def distribute(self, device_mesh: DeviceMesh):
     def setup_input_pos(self, input_pos: Tensor) -> None:
         self._input_pos = input_pos

-    def forward(self, x: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+    def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int = 0) -> Tensor:
         assert self.freqs_cis is not None, "Caches must be initialized first"
         # TODO: find a better way to pass input_pos to non-0 pipeline stages
         input_pos = input_pos if input_pos is not None else self._input_pos
@@ -668,7 +668,7 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
             x = self.tok_embeddings(x)

         for _, layer in self.layers.items():
-            x = layer(x, input_pos, freqs_cis, mask)
+            x = layer(x, input_pos, freqs_cis, mask, cache_lane=cache_lane)

         if self.norm:
             x = self.norm(x)
@@ -691,7 +691,7 @@ def distribute(self, device_mesh: DeviceMesh):
         self.feed_forward.distribute(device_mesh)

     def forward(
-        self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor
+        self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor, cache_lane: int = 0
     ) -> Tensor:
         h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
         out = h + self.feed_forward(self.ffn_norm(h))
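The three hunks above only thread the new cache_lane keyword through the forward signatures; the attention call inside TransformerBlock.forward still does not pass it on, consistent with the TODO in dist_run.py. The toy modules below (stand-ins, not the torchchat classes) sketch what the fully wired call chain would look like, with the block-to-attention hop added here as an assumption.

import torch
from torch import nn

class ToyAttention(nn.Module):
    def forward(self, x, cache_lane: int = 0):
        print(f"attention uses KV-cache lane {cache_lane}")
        return x

class ToyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attention = ToyAttention()

    def forward(self, x, cache_lane: int = 0):
        # Hypothetical hop: forward the lane index to attention.
        return self.attention(x, cache_lane=cache_lane)

class ToyTransformer(nn.Module):
    def __init__(self, n_layers: int = 2):
        super().__init__()
        self.layers = nn.ModuleList(ToyBlock() for _ in range(n_layers))

    def forward(self, x, cache_lane: int = 0):
        for layer in self.layers:
            x = layer(x, cache_lane=cache_lane)
        return x

ToyTransformer()(torch.zeros(1, 4), cache_lane=1)  # every layer hits lane 1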
@@ -723,15 +723,16 @@ def __init__(self, config: TransformerArgs):
         self.dim = config.dim
         self._register_load_state_dict_pre_hook(self.load_hook)

-    def setup_cache(self, max_batch_size, max_seq_length):
+    def setup_cache(self, max_batch_size, max_seq_length, cache_lanes: int = 1):
         n_local_heads = self.n_local_heads
         # If TP is enabled, the heads would be divided and assigned to different ranks
         if hasattr(self, "tp_degree"):
             n_local_heads = self.n_local_heads // self.tp_degree

-        self.kv_cache = KVCache(
-            max_batch_size, max_seq_length, n_local_heads, self.head_dim
-        )
+        self.kv_cache = nn.ModuleList([
+            KVCache(max_batch_size, max_seq_length, n_local_heads, self.head_dim)
+            for _ in range(cache_lanes)
+        ])

     def load_hook(self, state_dict, prefix, *args):
         # if prefix + "wq.weight" in state_dict:
@@ -784,6 +785,7 @@ def forward(
         freqs_cis: Tensor,
         mask: Tensor,
         input_pos: Optional[Tensor] = None,
+        cache_lane: int = 0,
     ) -> Tensor:
         bsz, seqlen, _ = x.shape

@@ -809,7 +811,7 @@ def forward(
         q, k, v = (x.transpose(1, 2) for x in (q, k, v))

         if self.kv_cache is not None:
-            k, v = self.kv_cache.update(input_pos, k, v)
+            k, v = self.kv_cache[cache_lane].update(input_pos, k, v)

         k = k.repeat_interleave(self.n_heads // self.n_local_heads, dim=1)
         v = v.repeat_interleave(self.n_heads // self.n_local_heads, dim=1)
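Putting the Attention changes together: setup_cache now allocates one KVCache per lane and forward indexes into whichever lane it is told to use, so concurrent micro-batches keep separate K/V state (at the cost of cache memory scaling linearly with cache_lanes). The self-contained toy below demonstrates that isolation; ToyKVCache is a simplified stand-in with the same constructor and update shape as shown in the diff, not torchchat's actual KVCache.

import torch
from torch import nn

# Simplified stand-in for torchchat's KVCache: same constructor/update shape
# as in the diff above, minus dtype handling and other details.
class ToyKVCache(nn.Module):
    def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim):
        super().__init__()
        shape = (max_batch_size, n_heads, max_seq_length, head_dim)
        self.register_buffer("k_cache", torch.zeros(shape))
        self.register_buffer("v_cache", torch.zeros(shape))

    def update(self, input_pos, k_val, v_val):
        # Write the new entries at input_pos and return the full caches.
        self.k_cache[:, :, input_pos] = k_val
        self.v_cache[:, :, input_pos] = v_val
        return self.k_cache, self.v_cache

# One cache per lane, mirroring the new setup_cache above.
cache_lanes, bsz, heads, seqlen, hdim = 2, 1, 2, 16, 4
kv_cache = nn.ModuleList(
    ToyKVCache(bsz, seqlen, heads, hdim) for _ in range(cache_lanes)
)

# Two in-flight micro-batches at different decode positions write to their own
# lanes without clobbering each other.
k = v = torch.ones(bsz, heads, 1, hdim)
kv_cache[0].update(torch.tensor([3]), k, v)          # micro-batch A, lane 0, pos 3
kv_cache[1].update(torch.tensor([7]), 2 * k, 2 * v)  # micro-batch B, lane 1, pos 7
assert kv_cache[0].k_cache[0, 0, 7].sum() == 0       # lane 0 untouched at pos 7
assert kv_cache[1].k_cache[0, 0, 3].sum() == 0       # lane 1 untouched at pos 3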
