This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 05129a2

Revert "[Distributed] Add lanes to KV cache (#1174)"
This reverts commit 2cf4016.
1 parent 2cf4016 commit 05129a2

3 files changed, 51 insertions(+), 61 deletions(-)


dist_run.py

Lines changed: 32 additions & 47 deletions
@@ -273,11 +273,13 @@ def main(args):
     pp_rank = pp_mesh.get_local_rank()
     tp_group = tp_mesh.get_group()
     pp_group = pp_mesh.get_group()
-    logger.info(f"{pp_degree=}, {tp_degree=}")
+    pp_group_size = pp_group.size()
+    tp_group_size = tp_group.size()
+    logger.info(f"{pp_group_size=}, {tp_group_size=}")

     # Convenience variables
     first_pp_rank = 0
-    last_pp_rank = pp_degree - 1
+    last_pp_rank = pp_group_size - 1

     # Assuming same number of GPUs per node
     device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
@@ -295,22 +297,18 @@ def main(args):
     if rank == 0:
         logger.info(f"Model: {model}")

-    # Batch size. Since we push batches dynamically through the pipeline rather
-    # than chunking them, this is effectively micro-batch size in pipeline
-    # sense. Thus it is interchangeable with micro-batch size below.
-    batch_size = 4
+    mbs = 1  # number of micro-batches
+    mb_size = 4  # micro-batch size
+    batch_size = mbs * mb_size  # total batch size
+
     seqlen_prefill = 1024  # sequence length
     dim = 4096  # embedding dimension

     # Setup KV caches (after model distribution)
-    # The number of cache lanes is the same as the maximum number of
-    # micro-batches that can be "in flight" in parallel -- imagine each
-    # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces.
-    # When decoding is done for certain micro-batches, we can reuse the KV cache
-    # lanes.
-    # TODO: bump up the lane count
-    pipeline_lanes = 1
-    model.setup_caches(batch_size, seqlen_prefill, cache_lanes=pipeline_lanes)
+    # TODO: the setting below only works for 1 micro-batch case. To support
+    # multiple micro-batches, we need the KV cache in the model to be aware of
+    # the number of micro-batches and the current micro-batch index.
+    model.setup_caches(mb_size, seqlen_prefill)

     # Load weights
     logger.info(f"Loading weights for {pp_rank=} on {device=}")
@@ -319,7 +317,7 @@ def main(args):
     model.to(device)

     logger.info(
-        f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
+        f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for stage {rank}{color.reset}"
     )

     # info on stage size and params
@@ -332,16 +330,17 @@ def main(args):

     # Setup input position (input_pos) for prefill: a list of increasing integers from 0 to seqlen
     input_pos = torch.arange(seqlen_prefill, device=device)
+    model.setup_input_pos(input_pos)
     model.eval()

     # Helper function to get example inputs and outputs for the stages.
     def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
-        mb_ids = torch.randint(0, config.vocab_size, (batch_size, seqlen), device=device)
+        mb_ids = torch.randint(0, config.vocab_size, (mb_size, seqlen), device=device)
         activation = torch.rand(
-            batch_size, seqlen, dim, device=device, dtype=model_dtype
+            mb_size, seqlen, dim, device=device, dtype=model_dtype
         )
         logits = torch.rand(
-            batch_size, seqlen, config.vocab_size, device=device, dtype=model_dtype
+            mb_size, seqlen, config.vocab_size, device=device, dtype=model_dtype
         )
         example_inputs = (mb_ids if pp_rank == first_pp_rank else activation,)
         example_outputs = (logits if pp_rank == last_pp_rank else activation,)
@@ -359,13 +358,8 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         output_args=example_outputs,
         group=pp_group,
     )
-
-    # Create schedule
-    # Number of micro-batches for the schedule is 1, because each step() call we
-    # only push 1 micro-batch into the pipeline. But we can continuously push
-    # new micro-batches into the pipeline as they arrive, achieving same
-    # pipelining effect.
-    prefiller = ScheduleGPipe(prefill_stage, 1)
+    # create schedule
+    prefill_schedule = ScheduleGPipe(prefill_stage, mbs)

     prompt = [
         "What is a computer?",
@@ -394,6 +388,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
     s = set(prompt_lengths)
     assert len(s) == 1, f"prompt_lengths should be the same, got {s}"

+    # with CUDATrackTime() as timer:
     # Need these global ids due to the API definition of dist.send and recv
     first_pp_rank_global_id = dist.get_global_rank(pp_group, first_pp_rank)
     last_pp_rank_global_id = dist.get_global_rank(pp_group, last_pp_rank)
@@ -406,21 +401,14 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
     num_tokens = 40

     # Prefill phase
-    # Run context input through pipeline
-    # TODO: we need to pass `input_pos` and `cache_lane` to each stage.
-    lane = 0
-    kwargs = {"input_pos": input_pos, "cache_lane": lane}
-    with torch.no_grad(), CUDATrackTime() as timer:
+    # Run context input through pipeline, in 1 step
+    with torch.no_grad():
         if pp_rank == first_pp_rank:
-            output = prefiller.step(padded_sequence, **kwargs)
+            output = prefill_schedule.step(padded_sequence)
         elif pp_rank == last_pp_rank:
-            output = prefiller.step(**kwargs)
+            output = prefill_schedule.step()
         else:  # middle pp ranks
-            prefiller.step(**kwargs)
-
-    logger.info(
-        f"{color.green}Prefilling time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
-    )
+            prefill_schedule.step()

     # Decode the output -- first generated token
     if pp_rank == last_pp_rank:
@@ -442,6 +430,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
     # seqlen = 1 now
     seqlen_decode = 1
     input_pos = torch.tensor([prompt_lengths[0]], device=device)
+    model.setup_input_pos(input_pos)

     # Create decode stage
     logger.info(f"Creating pipeline stage for decode {pp_rank=}, {pp_degree=}")
@@ -456,12 +445,11 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         group=pp_group,
     )
     # create schedule
-    decorder = ScheduleGPipe(decode_stage, 1)
+    decode_schedule = ScheduleGPipe(decode_stage, mbs)

     # Decoding
-    with torch.no_grad(), CUDATrackTime() as timer:
+    with torch.no_grad():
         for step in range(num_tokens - 1):
-            kwargs = {"input_pos": input_pos, "cache_lane": lane}
             # sendrecv between last and first ranks, only if:
             # first_pp_rank != last_pp_rank.
             if pp_rank == last_pp_rank and pp_rank != first_pp_rank:
@@ -479,11 +467,11 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:

             # Run data through pipeline
             if pp_rank == first_pp_rank:
-                output = decorder.step(new_token, **kwargs)
+                output = decode_schedule.step(new_token)
             elif pp_rank == last_pp_rank:
-                output = decorder.step(**kwargs)
+                output = decode_schedule.step()
             else:  # middle pp ranks
-                decorder.step(**kwargs)
+                decode_schedule.step()

             # Decode the output
             if pp_rank == last_pp_rank:
@@ -503,10 +491,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
             )  # decode_results[i][0]

             input_pos += 1
-
-    logger.info(
-        f"{color.green}Decoding time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
-    )
+            model.setup_input_pos(input_pos)

     # Display the decoding results
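Both the prefill and decode loops in this file follow the same rank-dependent dispatch: the first pipeline stage feeds real inputs into step(), the last stage collects the output, and middle stages simply advance the pipeline. A minimal single-process sketch of that pattern; run_step and DummySchedule are illustrative stand-ins, not torchchat code (the real objects are the ScheduleGPipe instances created above):

from typing import Any, Optional

def run_step(schedule: Any, pp_rank: int, first_pp_rank: int, last_pp_rank: int,
             inputs: Optional[Any] = None) -> Optional[Any]:
    # First stage: push the (micro-)batch into the pipeline.
    if pp_rank == first_pp_rank:
        return schedule.step(inputs)
    # Last stage: nothing to feed, but it produces the logits.
    if pp_rank == last_pp_rank:
        return schedule.step()
    # Middle stages: just participate in the send/recv choreography.
    schedule.step()
    return None

class DummySchedule:
    # Stand-in with the same call shape as a pipeline schedule, for local testing.
    def step(self, *args):
        return args[0] if args else "logits"

print(run_step(DummySchedule(), pp_rank=0, first_pp_rank=0, last_pp_rank=1, inputs="tokens"))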

torchchat/export.py

Lines changed: 2 additions & 2 deletions
@@ -152,9 +152,9 @@ def __init__(self, attention: Attention):
         self.wo = attention.wo

         max_batch_size, n_heads, max_seq_length, head_dim = (
-            attention.kv_cache[0].k_cache.shape
+            attention.kv_cache.k_cache.shape
         )
-        cache_dtype = attention.kv_cache[0].k_cache.dtype
+        cache_dtype = attention.kv_cache.k_cache.dtype
         self.kv_cache = CustomKVCache(
             max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype
         )
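Only the cache geometry matters to the export path: with lanes, the shape and dtype were read from lane 0 of an nn.ModuleList; after the revert they come straight off the single cache module. A tiny sketch, with ToyCache as a hypothetical stand-in for the real KVCache:

import torch
import torch.nn as nn

class ToyCache(nn.Module):
    # Stand-in for the real KVCache; only the k_cache buffer matters here.
    def __init__(self):
        super().__init__()
        self.register_buffer("k_cache", torch.zeros(4, 8, 1024, 64, dtype=torch.bfloat16))

single = ToyCache()                  # layout after the revert
lanes = nn.ModuleList([ToyCache()])  # lane layout being reverted

max_batch_size, n_heads, max_seq_length, head_dim = single.k_cache.shape
cache_dtype = single.k_cache.dtype
assert lanes[0].k_cache.shape == single.k_cache.shape  # same geometry, different indexing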

torchchat/model.py

Lines changed: 17 additions & 12 deletions
@@ -606,7 +606,7 @@ def __init__(self, config: TransformerArgs) -> None:
         self.max_batch_size = -1
         self.max_seq_length = -1

-    def setup_caches(self, max_batch_size, max_seq_length, cache_lanes: int = 1):
+    def setup_caches(self, max_batch_size, max_seq_length):
         if (
             self.max_seq_length >= max_seq_length
             and self.max_batch_size >= max_batch_size
@@ -620,7 +620,7 @@ def setup_caches(self, max_batch_size, max_seq_length, cache_lanes: int = 1):
             # parallelism may have been applied there and the `n_local_heads``
             # value being adjusted.
             b.attention.setup_cache(
-                max_batch_size, max_seq_length, cache_lanes=cache_lanes
+                max_batch_size, max_seq_length,
             )

         freqs_cis = precompute_freqs_cis(
@@ -653,15 +653,22 @@ def distribute(self, device_mesh: DeviceMesh):
             ColwiseParallel(output_layouts=Replicate()),
         )

-    def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int = 1) -> Tensor:
+    # This is a temporary solution to pass input_pos to non-0 pipeline stages
+    # TODO: make `step()` function of dist.pipelining accept args for non-0 stages
+    def setup_input_pos(self, input_pos: Tensor) -> None:
+        self._input_pos = input_pos
+
+    def forward(self, x: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         assert self.freqs_cis is not None, "Caches must be initialized first"
+        # TODO: find a better way to pass input_pos to non-0 pipeline stages
+        input_pos = input_pos if input_pos is not None else self._input_pos
         mask = self.causal_mask[None, None, input_pos]
         freqs_cis = self.freqs_cis[input_pos]
         if self.tok_embeddings:
             x = self.tok_embeddings(x)

         for _, layer in self.layers.items():
-            x = layer(x, input_pos, freqs_cis, mask, cache_lane=cache_lane)
+            x = layer(x, input_pos, freqs_cis, mask)

         if self.norm:
             x = self.norm(x)
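The setup_input_pos workaround exists because the pipeline schedule's step() only accepts arguments for the first stage, so later stages would otherwise never see input_pos. A minimal sketch of the stash-and-fallback pattern on a toy module (ToyStage is illustrative, not the real Transformer):

import torch
import torch.nn as nn
from torch import Tensor
from typing import Optional

class ToyStage(nn.Module):
    # Mimics the diff: stash input_pos on the module so stages that never see
    # step() arguments can still index their caches / position tables.
    def __init__(self):
        super().__init__()
        self._input_pos: Optional[Tensor] = None

    def setup_input_pos(self, input_pos: Tensor) -> None:
        self._input_pos = input_pos

    def forward(self, x: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
        input_pos = input_pos if input_pos is not None else self._input_pos
        # Toy "positional" use of input_pos: gather those positions from x.
        return x[:, input_pos]

stage = ToyStage()
stage.setup_input_pos(torch.tensor([5]))
out = stage(torch.arange(32).reshape(2, 16))  # forward called without input_pos
print(out)  # tensor([[ 5], [21]])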
@@ -684,7 +691,7 @@ def distribute(self, device_mesh: DeviceMesh):
         self.feed_forward.distribute(device_mesh)

     def forward(
-        self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor, cache_lane: int = 0
+        self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor
     ) -> Tensor:
         h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos)
         out = h + self.feed_forward(self.ffn_norm(h))
@@ -716,16 +723,15 @@ def __init__(self, config: TransformerArgs):
         self.dim = config.dim
         self._register_load_state_dict_pre_hook(self.load_hook)

-    def setup_cache(self, max_batch_size, max_seq_length, cache_lanes: int = 1):
+    def setup_cache(self, max_batch_size, max_seq_length):
         n_local_heads = self.n_local_heads
         # If TP is enabled, the heads would be divided and assigned to different ranks
         if hasattr(self, "tp_degree"):
             n_local_heads = self.n_local_heads // self.tp_degree

-        self.kv_cache = nn.ModuleList([
-            KVCache(max_batch_size, max_seq_length, n_local_heads, self.head_dim)
-            for _ in range(cache_lanes)
-        ])
+        self.kv_cache = KVCache(
+            max_batch_size, max_seq_length, n_local_heads, self.head_dim
+        )

     def load_hook(self, state_dict, prefix, *args):
         # if prefix + "wq.weight" in state_dict:
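For context on what setup_cache switches between: the KV cache holds fixed-size key/value buffers and writes new entries at input_pos, and the lane variant being reverted wraps several such caches in an nn.ModuleList indexed by cache_lane. A condensed sketch under those assumptions (KVCacheSketch simplifies away dtype and device handling):

import torch
import torch.nn as nn
from torch import Tensor
from typing import Tuple

class KVCacheSketch(nn.Module):
    # Simplified decoder KV cache: preallocated buffers updated in place at
    # the positions given by input_pos.
    def __init__(self, max_batch_size: int, max_seq_length: int, n_heads: int, head_dim: int):
        super().__init__()
        shape = (max_batch_size, n_heads, max_seq_length, head_dim)
        self.register_buffer("k_cache", torch.zeros(shape))
        self.register_buffer("v_cache", torch.zeros(shape))

    def update(self, input_pos: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]:
        # k, v: (batch, n_heads, seqlen, head_dim); write them at input_pos.
        k_out = self.k_cache.index_copy_(2, input_pos, k)
        v_out = self.v_cache.index_copy_(2, input_pos, v)
        return k_out, v_out

# Lane layout removed by this revert: one cache per in-flight micro-batch,
# selected with kv_cache[cache_lane].update(...).
cache_lanes = 2
kv_lanes = nn.ModuleList(
    [KVCacheSketch(max_batch_size=4, max_seq_length=64, n_heads=8, head_dim=16)
     for _ in range(cache_lanes)]
)

# Single-cache layout after the revert: kv_cache.update(...).
kv_cache = KVCacheSketch(max_batch_size=4, max_seq_length=64, n_heads=8, head_dim=16)

k = torch.randn(4, 8, 1, 16)
v = torch.randn(4, 8, 1, 16)
pos = torch.tensor([0])
kv_lanes[1].update(pos, k, v)  # lane-addressed update (pre-revert style)
kv_cache.update(pos, k, v)     # direct update (post-revert style)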
@@ -778,7 +784,6 @@ def forward(
         freqs_cis: Tensor,
         mask: Tensor,
         input_pos: Optional[Tensor] = None,
-        cache_lane: int = 0,
     ) -> Tensor:
         bsz, seqlen, _ = x.shape

@@ -804,7 +809,7 @@ def forward(
         q, k, v = (x.transpose(1, 2) for x in (q, k, v))

         if self.kv_cache is not None:
-            k, v = self.kv_cache[cache_lane].update(input_pos, k, v)
+            k, v = self.kv_cache.update(input_pos, k, v)

         k = k.repeat_interleave(self.n_heads // self.n_local_heads, dim=1)
         v = v.repeat_interleave(self.n_heads // self.n_local_heads, dim=1)
