feat: add batch forward type.

RobbieLeung · yq33victor · commit 6c367fa067e6 · 2025-11-18T16:30:53.000+08:00
diff --git a/xllm/core/framework/batch/batch_input_builder.cpp b/xllm/core/framework/batch/batch_input_builder.cpp
@@ -90,7 +90,7 @@ ForwardInput BatchInputBuilder::build_forward_input(
     uint32_t min_decoding_batch_size) {
   process_sequences(0, static_cast<uint32_t>(num_sequences_));
   padding_decode_batch_size(num_decoding_tokens, min_decoding_batch_size);
-
+  process_batch_forward_type();
   return state_to_forward_input();
 }
 
@@ -102,6 +102,7 @@ RawForwardInput BatchInputBuilder::build_raw_forward_input(uint32_t start_idx,
   } else {
     process_sequences_multithreaded(start_idx, end_idx);
   }
+  process_batch_forward_type();
   return state_to_raw_forward_input();
 }
 
@@ -548,6 +549,7 @@ ForwardInput BatchInputBuilder::state_to_forward_input() {
 
   auto& input_params = forward_input.input_params;
   input_params.empty_kv_cache = state_.empty_kv_cache;
+  input_params.batch_forward_type = state_.batch_forward_type;
   input_params.num_sequences = state_.block_tables_vec.size();
   input_params.kv_max_seq_len = state_.max_seq_len;
   input_params.q_max_seq_len = state_.q_max_seq_len;
@@ -633,7 +635,7 @@ RawForwardInput BatchInputBuilder::state_to_raw_forward_input() {
   raw_forward_input.unique_token_lens_vec =
       std::move(state_.unique_token_lens_vec);
   raw_forward_input.empty_kv_cache = state_.empty_kv_cache;
-  // raw_forward_input.global_empty_kv_cache = ;
+  raw_forward_input.batch_forward_type = state_.batch_forward_type;
   raw_forward_input.max_seq_len = state_.max_seq_len;
   raw_forward_input.q_max_seq_len = state_.q_max_seq_len;
   raw_forward_input.seq_lens = std::move(state_.seq_lens);
@@ -723,4 +725,69 @@ void BatchInputBuilder::process_swap_block_infos(
                                          swap_cache_block_infos_->end());
   }
 }
+
+// Classifies the batch accumulated in state_ and stores the result in
+// state_.batch_forward_type. The outcome is decided in this order:
+//   1. DECODE          if state_.q_max_seq_len is 1;
+//   2. EMPTY           if the length vectors describe no sequence;
+//   3. PREFILL         if no sequence has kv_len > q_len (nothing cached);
+//   4. CHUNKED_PREFILL if no sequence has a query length of exactly 1;
+//   5. DECODE          if no sequence has a query length above 1;
+//   6. MIXED           otherwise.
+// NOTE(review): when neither USE_NPU nor USE_MLU is defined, no per-sequence
+// scan runs, the three flags keep their initial value and every batch that
+// misses the q_max_seq_len shortcut is reported as PREFILL.
+void BatchInputBuilder::process_batch_forward_type() {
+  // Both vectors are indexed in lockstep below, so they must be the same size.
+  CHECK_EQ(state_.seq_lens.size(), state_.q_seq_lens.size())
+      << "seq_lens size must be equal to q_seq_lens size";
+
+  // Shortcut: a maximum query length of 1 is treated as a pure decode batch
+  // without inspecting the individual sequences.
+  if (state_.q_max_seq_len == 1) {
+    state_.batch_forward_type = BatchForwardType::DECODE;
+    return;
+  }
+
+  // Flags start optimistic and are cleared by the per-sequence scan.
+  bool empty_kv_cache = true;  // no sequence has kv_len > q_len
+  bool all_decode = true;      // no sequence has q_len > 1
+  bool all_prefill = true;     // no sequence has q_len == 1
+
+#if defined(USE_NPU)
+  // NPU: each vector entry is read directly as one sequence's length.
+  if (state_.seq_lens.size() == 0) {
+    state_.batch_forward_type = BatchForwardType::EMPTY;
+    return;
+  }
+  for (size_t i = 0; i < state_.seq_lens.size(); ++i) {
+    auto q_len = state_.q_seq_lens[i];
+    auto kv_len = state_.seq_lens[i];
+    // Tokens in the kv length that are not part of this step's query.
+    auto cache_len = kv_len - q_len;
+    if (cache_len > 0) {
+      empty_kv_cache = false;
+    }
+    if (q_len > 1) {
+      all_decode = false;
+    }
+    if (q_len == 1) {
+      all_prefill = false;
+    }
+  }
+#elif defined(USE_MLU)
+  // MLU: a sequence's length is the difference between consecutive entries,
+  // so the vectors are presumably cumulative lengths with one leading entry
+  // -- TODO confirm against BuilderState. A single entry therefore means the
+  // batch holds no sequence.
+  if (state_.seq_lens.size() == 1) {
+    state_.batch_forward_type = BatchForwardType::EMPTY;
+    return;
+  }
+  for (size_t i = 1; i < state_.seq_lens.size(); ++i) {
+    auto q_len = state_.q_seq_lens[i] - state_.q_seq_lens[i - 1];
+    auto kv_len = state_.seq_lens[i] - state_.seq_lens[i - 1];
+    // Tokens in the kv length that are not part of this step's query.
+    auto cache_len = kv_len - q_len;
+    if (cache_len > 0) {
+      empty_kv_cache = false;
+    }
+    if (q_len > 1) {
+      all_decode = false;
+    }
+    if (q_len == 1) {
+      all_prefill = false;
+    }
+  }
+#endif
+  // Map the flags to a forward type; all_prefill is tested before all_decode,
+  // so a batch satisfying both is reported as CHUNKED_PREFILL.
+  if (empty_kv_cache) {
+    state_.batch_forward_type = BatchForwardType::PREFILL;
+  } else {
+    if (all_prefill) {
+      state_.batch_forward_type = BatchForwardType::CHUNKED_PREFILL;
+    } else if (all_decode) {
+      state_.batch_forward_type = BatchForwardType::DECODE;
+    } else {
+      state_.batch_forward_type = BatchForwardType::MIXED;
+    }
+  }
+}
 }  // namespace xllm
diff --git a/xllm/core/framework/batch/batch_input_builder.h b/xllm/core/framework/batch/batch_input_builder.h
@@ -62,6 +62,8 @@ class BatchInputBuilder {
 
   void process_swap_block_infos(RawForwardInput& raw_forward_input);
 
+  void process_batch_forward_type();
+
   // State management
   struct BuilderState {
     // Token and position data
@@ -81,6 +83,7 @@ class BatchInputBuilder {
 
     // Sequence metadata
     bool empty_kv_cache = true;
+    BatchForwardType batch_forward_type;
     uint32_t max_seq_len = 0;
     uint32_t q_max_seq_len = 0;
 #if defined(USE_NPU)
diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
@@ -34,6 +34,7 @@ cc_library(
     embedding_lm.h
     model_args.h
     npu_dp_ep_padding.h
+    batch_forward_type.h
     model_input_params.h
   SRCS
     npu_dp_ep_padding.cpp
diff --git a/xllm/core/framework/model/batch_forward_type.h b/xllm/core/framework/model/batch_forward_type.h
@@ -0,0 +1,84 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+Copyright 2024 The ScaleLLM Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+namespace xllm {
+
+// Small value type wrapping an int32_t-backed enum that names the forward
+// type of a batch. It converts implicitly from both the enum and a raw
+// int32_t, and exposes predicate helpers plus a printable name.
+class BatchForwardType {
+ public:
+  enum Value : int32_t {
+    // Prefill without using kv cache.
+    PREFILL = 0,
+    // Chunked prefill using kv cache.
+    // No decode sequence in this type.
+    CHUNKED_PREFILL = 1,
+    // Decode one token.
+    // No prefill sequence in this type.
+    DECODE = 2,
+    // Mixed prefill and decode in one batch when doing chunked prefill.
+    MIXED = 3,
+    // No sequence to forward.
+    EMPTY = 4,
+  };
+
+  // A default-constructed object holds EMPTY.
+  BatchForwardType() : value_(EMPTY) {}
+
+  // Builds from a raw integer. The value is cast without range checking, so
+  // an integer outside the enumerators is stored as-is and prints as
+  // "UNKNOWN" in to_string().
+  BatchForwardType(int32_t v) : value_(static_cast<Value>(v)) {}
+
+  // Builds from an enumerator; usable in constant expressions.
+  constexpr BatchForwardType(Value v) : value_(v) {}
+
+  // Assigns an enumerator directly.
+  BatchForwardType& operator=(Value v) {
+    value_ = v;
+    return *this;
+  }
+
+  // Returns the stored enumerator as a plain int32_t.
+  int32_t value() const { return value_; }
+
+  // True only for PREFILL (CHUNKED_PREFILL is not included).
+  bool is_prefill() const { return (value_ == PREFILL); }
+
+  // True only for CHUNKED_PREFILL.
+  bool is_chunked_prefill() const { return (value_ == CHUNKED_PREFILL); }
+
+  // True for DECODE and MIXED.
+  bool has_decode() const { return (value_ == DECODE || value_ == MIXED); }
+
+  // True for PREFILL and CHUNKED_PREFILL. Not the negation of has_decode():
+  // EMPTY returns false from both.
+  bool no_decode() const {
+    return (value_ == PREFILL || value_ == CHUNKED_PREFILL);
+  }
+
+  // True only for DECODE (MIXED is not included).
+  bool is_decode() const { return (value_ == DECODE); }
+
+  // True only for MIXED.
+  bool is_mixed() const { return (value_ == MIXED); }
+
+  // True only for EMPTY.
+  bool is_empty() const { return (value_ == EMPTY); }
+
+  // Returns the enumerator's name as a string literal, or "UNKNOWN" when the
+  // stored value matches no enumerator.
+  const char* to_string() const {
+    switch (value_) {
+      case PREFILL:
+        return "PREFILL";
+      case CHUNKED_PREFILL:
+        return "CHUNKED_PREFILL";
+      case DECODE:
+        return "DECODE";
+      case MIXED:
+        return "MIXED";
+      case EMPTY:
+        return "EMPTY";
+      default:
+        return "UNKNOWN";
+    }
+  }
+
+ private:
+  // Currently held forward type.
+  Value value_;
+};
+}  // namespace xllm
diff --git a/xllm/core/framework/model/model_input_params.h b/xllm/core/framework/model/model_input_params.h
@@ -21,6 +21,7 @@ limitations under the License.
 #if defined(USE_NPU)
 #include "platform/npu/npu_layer_synchronizer.h"
 #endif
+#include "framework/model/batch_forward_type.h"
 #include "framework/request/mm_data.h"
 #include "npu_dp_ep_padding.h"
 #include "util/tensor_helper.h"
@@ -52,6 +53,7 @@ struct ModelInputParams {
     ModelInputParams params;
     params.empty_kv_cache = empty_kv_cache;
     params.global_empty_kv_cache = global_empty_kv_cache;
+    params.batch_forward_type = batch_forward_type;
     params.num_sequences = num_sequences;
     params.kv_max_seq_len = kv_max_seq_len;
     params.q_max_seq_len = q_max_seq_len;
@@ -103,6 +105,7 @@ struct ModelInputParams {
   void print() const {
     LOG(INFO) << "ModelInputParams: empty_kv_cache is " << empty_kv_cache
               << " , global_empty_kv_cache is " << global_empty_kv_cache
+              << " , batch_forward_type is " << batch_forward_type.to_string()
               << " , num_sequences is " << num_sequences
               << " , kv_max_seq_len is " << kv_max_seq_len
               << " , q_max_seq_len is " << q_max_seq_len
@@ -120,6 +123,9 @@ struct ModelInputParams {
   // whether the kv-cache is empty for all sequences.
   bool empty_kv_cache = true;
 
+  // forward type of the batch, used by worker/kernel.
+  BatchForwardType batch_forward_type;
+
   // total number of sequences in the batch
   int32_t num_sequences = 0;
 
diff --git a/xllm/core/runtime/forward_params.h b/xllm/core/runtime/forward_params.h
@@ -148,6 +148,7 @@ struct RawForwardInput {
   std::vector<int32_t> unique_token_lens_vec;
   bool empty_kv_cache = true;
   bool global_empty_kv_cache = true;
+  BatchForwardType batch_forward_type;
   uint32_t max_seq_len;
   uint32_t q_max_seq_len;
   std::vector<int32_t> seq_lens;
diff --git a/xllm/core/runtime/forward_shared_memory_manager.cpp b/xllm/core/runtime/forward_shared_memory_manager.cpp
@@ -153,7 +153,8 @@ INLINE size_t calculate_raw_forward_input_size(const RawForwardInput& input) {
   total += type_size<uint64_t> * 4 +
            cache_block_size * cache_block_info_fixed_size();
 
-  total += type_size<bool> * 2  // empty_kv_cache + global_empty_kv_cache
+  total += type_size<bool> * 2   // empty_kv_cache + global_empty_kv_cache
+           + type_size<int32_t>  // batch_forward_type
            + type_size<uint32_t> *
                  3  // max_seq_len + q_max_seq_len + prefill_seq_len
            + type_size<int32_t>  // num_sequences
@@ -599,6 +600,9 @@ INLINE void deserialize_raw_forward_input(
 
   read_data(buffer, input.empty_kv_cache);
   read_data(buffer, input.global_empty_kv_cache);
+  int32_t batch_forward_type;
+  read_data(buffer, batch_forward_type);
+  input.batch_forward_type = BatchForwardType(batch_forward_type);
   read_data(buffer, input.max_seq_len);
   read_data(buffer, input.q_max_seq_len);
   read_data(buffer, input.num_sequences);
@@ -653,6 +657,7 @@ INLINE void serialize_raw_forward_input(const RawForwardInput& input,
 
   write_data(buffer, input.empty_kv_cache);
   write_data(buffer, input.global_empty_kv_cache);
+  write_data(buffer, input.batch_forward_type.value());
   write_data(buffer, input.max_seq_len);
   write_data(buffer, input.q_max_seq_len);
   write_data(buffer, input.num_sequences);
@@ -855,6 +860,7 @@ void convert_raw_forward_input_to_forward_input(RawForwardInput& raw_input,
   auto& input_params = forward_input.input_params;
   input_params.empty_kv_cache = raw_input.empty_kv_cache;
   input_params.global_empty_kv_cache = raw_input.global_empty_kv_cache;
+  input_params.batch_forward_type = raw_input.batch_forward_type;
   input_params.num_sequences = raw_input.num_sequences;
   input_params.kv_max_seq_len = raw_input.max_seq_len;
   input_params.q_max_seq_len = raw_input.q_max_seq_len;
diff --git a/xllm/core/runtime/llm_engine.cpp b/xllm/core/runtime/llm_engine.cpp
@@ -822,6 +822,8 @@ std::vector<std::vector<RawForwardInput>> LLMEngine::prepare_inputs(
   dp_global_token_nums.resize(micro_batches_num,
                               std::vector<int32_t>(dp_size_));
   bool global_empty_kv_cache = true;
+  // Forward type handed to empty batches: that of the first non-empty batch
+  // that is not PREFILL, or PREFILL when every non-empty batch is PREFILL.
+  BatchForwardType batch_forward_type;
 
   // eplb related
   EplbInfo eplb_info;
@@ -841,6 +843,12 @@ std::vector<std::vector<RawForwardInput>> LLMEngine::prepare_inputs(
           batched_inputs[dp_rank][i].flatten_tokens_vec.size();
       global_empty_kv_cache =
           batched_inputs[dp_rank][i].empty_kv_cache && global_empty_kv_cache;
+      if (batched_inputs[dp_rank][i].batch_forward_type.is_empty()) {
+        continue;
+      }
+      if (batch_forward_type.is_empty() || batch_forward_type.is_prefill()) {
+        batch_forward_type = batched_inputs[dp_rank][i].batch_forward_type;
+      }
     }
   }
 
@@ -853,6 +861,9 @@ std::vector<std::vector<RawForwardInput>> LLMEngine::prepare_inputs(
     for (auto i = 0; i < micro_batches_num; ++i) {
       batched_inputs[dp_rank][i].dp_global_token_nums = dp_global_token_nums[i];
       batched_inputs[dp_rank][i].global_empty_kv_cache = global_empty_kv_cache;
+      if (batched_inputs[dp_rank][i].batch_forward_type.is_empty()) {
+        batched_inputs[dp_rank][i].batch_forward_type = batch_forward_type;
+      }
       if (FLAGS_enable_eplb) {
         batched_inputs[dp_rank][i].eplb_info = eplb_info;
       }
diff --git a/xllm/core/runtime/llm_worker_impl.cpp b/xllm/core/runtime/llm_worker_impl.cpp
@@ -182,14 +182,11 @@ std::optional<ForwardOutput> LLMWorkerImpl::step(
   // should be in same prefill stage, so, to judge empty_kv_cache,
   // just use micro batch 0 here
   if (options_.enable_speculative_decode() && !is_spec_draft_) {
-    if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
+    if (!inputs.micro_inputs[0].input_params.batch_forward_type.is_decode()) {
       output.sample_output.embeddings = hidden_states;
-    } else if (concated_sampling_params.sample_idxes.defined()) {
-      // auto sample_idxes =
-      //     concated_sampling_params.selected_token_idxes.index_select(
-      //         /*dim=*/0, concated_sampling_params.sample_idxes);
+    } else if (concated_sampling_params.selected_token_idxes.defined()) {
       auto embeddings = hidden_states.index_select(
-          /*dim=*/0, concated_sampling_params.sample_idxes);
+          /*dim=*/0, concated_sampling_params.selected_token_idxes);
       output.sample_output.embeddings = embeddings;
     }
   }
diff --git a/xllm/core/runtime/speculative_worker_impl.cpp b/xllm/core/runtime/speculative_worker_impl.cpp
@@ -173,7 +173,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step(
   }
 
   // TODO: support data parallel case
-  if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
+  if (!inputs.micro_inputs[0].input_params.batch_forward_type.is_decode()) {
     return step_prefill(inputs);
   } else {
     return step_decode(inputs);
@@ -182,7 +182,7 @@ std::optional<ForwardOutput> SpeculativeWorkerImpl::step(
 
 std::optional<ForwardOutput> SpeculativeWorkerImpl::step_empty(
     const BatchedForwardInputs& inputs) {
-  if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
+  if (!inputs.micro_inputs[0].input_params.batch_forward_type.is_decode()) {
     auto output = impl_->step(inputs);
     auto draft_output = draft_impl_->step(inputs);
     return output;
@@ -833,7 +833,7 @@ void SpeculativeWorkerImpl::update_sampling_params(
 void SpeculativeWorkerImpl::prepare_work_before_execute(
     const BatchedForwardInputs& inputs,
     BatchedForwardInputs& processed_inputs) {
-  if (check_is_prefill(inputs.micro_inputs[0].input_params.q_seq_lens_vec)) {
+  if (!inputs.micro_inputs[0].input_params.batch_forward_type.is_decode()) {
     WorkerImpl::prepare_work_before_execute(inputs, processed_inputs);
   } else {
     if (enable_schedule_overlap()) {
diff --git a/xllm/core/runtime/worker_impl.cpp b/xllm/core/runtime/worker_impl.cpp
diff --git a/xllm/core/runtime/worker_impl.h b/xllm/core/runtime/worker_impl.h