jd-opensource
diff --git a/‎CMakeLists.txt‎
Lines changed: 3 additions & 3 deletions b/‎CMakeLists.txt‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎xllm/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎xllm/CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎xllm/core/common/global_flags.cpp‎
Lines changed: 8 additions & 2 deletions b/‎xllm/core/common/global_flags.cpp‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎xllm/core/common/global_flags.h‎
Lines changed: 2 additions & 0 deletions b/‎xllm/core/common/global_flags.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎xllm/core/framework/model/model_input_params.h‎
100755 → 100644
Lines changed: 11 additions & 2 deletions b/‎xllm/core/framework/model/model_input_params.h‎
100755 → 100644
Lines changed: 11 additions & 2 deletions
diff --git a/‎xllm/core/framework/request/mm_data.h‎
Lines changed: 16 additions & 0 deletions b/‎xllm/core/framework/request/mm_data.h‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎xllm/core/kernels/npu/impl/CMakeLists.txt‎
Lines changed: 5 additions & 0 deletions b/‎xllm/core/kernels/npu/impl/CMakeLists.txt‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp‎
Lines changed: 17 additions & 4 deletions b/‎xllm/core/layers/npu/npu_glm4_moe_decoder_layer.cpp‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎xllm/core/layers/npu/npu_glm4_moe_decoder_layer.h‎
Lines changed: 0 additions & 1 deletion b/‎xllm/core/layers/npu/npu_glm4_moe_decoder_layer.h‎
Lines changed: 0 additions & 1 deletion
@@ -28,20 +28,20 @@ if(USE_NPU)
     if(DEVICE_TYPE STREQUAL "USE_A3")
         message("downloading a3 arm xllm kernels")
         file(DOWNLOAD 
-            "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.2-Linux.a3.arm.rpm"
+            "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a3.arm.rpm"
             "${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
         )
     else()  
       if(DEVICE_ARCH STREQUAL "ARM")
           message("downloading a2 arm xllm_kernels")
           file(DOWNLOAD 
-              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.2-Linux.a2.arm.rpm"
+              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.arm.rpm"
               "${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
           )
       else()
           message("downloading a2 x86 xllm_kernels")
           file(DOWNLOAD 
-              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.2-Linux.a2.x86.rpm"
+              "https://9n-das-tools.s3.cn-north-1.jdcloud-oss.com/xllm-ai/xllm_kernels/0.7.0/xllm_kernels-1.3.3-Linux.a2.x86.rpm"
               "${CMAKE_BINARY_DIR}/xllm_kernels.rpm"
           )
       endif()
 
@@ -34,7 +34,7 @@ target_link_libraries(xllm PRIVATE glog::glog brpc leveldb::leveldb ZLIB::ZLIB p
 add_dependencies(xllm brpc-static)
 
 if(USE_NPU)
-  set(COMMON_LIBS Python::Python ascendcl hccl c_sec nnopbase ms_tools_ext)
+  set(COMMON_LIBS Python::Python ascendcl atb_customize hccl c_sec nnopbase ms_tools_ext)
 elseif(USE_MLU)
   set(COMMON_LIBS Python::Python)
 endif()
 
@@ -86,9 +86,15 @@ DEFINE_bool(enable_acl_graph,
             "Whether to enable ACL graph execution for decode phase.");
 
 DEFINE_int32(max_seq_len_for_graph_mode,
-             20480,
-             "Maximum number of tokens per sequence for ACL graph execution.");
+             0,
+             "Maximum number of tokens per sequence for ACL graph execution. "
+             "If 0, use model max_position_embeddings.");
 
+DEFINE_bool(enable_acl_graph_no_padding,
+            false,
+            "Whether to enable ACL graph execution for decode phase without "
+            "padding. If true, graph will be captured with every actual num "
+            "tokens, as stride is 1.");
 // --- vlm config ---
 
 DEFINE_int32(limit_image_per_prompt,
 
@@ -87,6 +87,8 @@ DECLARE_bool(enable_acl_graph);
 
 DECLARE_int32(max_seq_len_for_graph_mode);
 
+DECLARE_bool(enable_acl_graph_no_padding);
+
 DECLARE_bool(enable_chunked_prefill);
 
 DECLARE_string(master_node_addr);
 
@@ -17,6 +17,7 @@ cc_binary(
     ascendcl
     nnopbase
     atb
+    atb_customize
     c_sec
     spdlog::spdlog
 )
 
@@ -97,7 +97,11 @@ struct ModelInputParams {
     params.kv_cache_start_offsets = safe_to(kv_cache_start_offsets, device);
 
     // Copy graph_buffer to device
-    params.graph_buffer = safe_to(graph_buffer, device, true);
+    // params.graph_buffer = safe_to(graph_buffer, device, true);
+    params.graph_buffer.attn_mask =
+        safe_to(graph_buffer.attn_mask, device, true);
+    params.graph_buffer.tiling_data =
+        safe_to(graph_buffer.tiling_data, device, true);
 
     return params;
   }
@@ -206,7 +210,12 @@ struct ModelInputParams {
   torch::Tensor kv_cache_start_offsets;
   // Graph execution buffer for temporary tensor storage
   // Used by ACL Graph Executor to avoid repeated memory allocation
-  torch::Tensor graph_buffer;
+
+  struct GraphBuffer {
+    torch::Tensor attn_mask;    // moved to the target device via safe_to
+    torch::Tensor tiling_data;  // moved to the target device via safe_to
+  };
+  GraphBuffer graph_buffer;  // per-field device copy; see GraphBuffer members
 };
 
 }  // namespace xllm
@@ -81,6 +81,22 @@ struct MMData {
     return true;
   }
 
+  // Stores `value` under `key`: overwrites the entry when the key is already
+  // present, inserts a new one otherwise. `type` is OR-ed into ty_ either way.
+  // Always returns true.
+  template <typename T>
+  bool update(uint32_t type, const MMKey& key, const T& value) {
+    // One lookup only: reuse the iterator instead of searching again.
+    auto itor = data_.find(key);
+    if (itor != data_.end()) {
+      itor->second = value;
+    } else {
+      data_.insert({key, value});
+    }
+    ty_ |= type;
+    return true;
+  }
+
   template <typename T>
   std::optional<T> get(const MMKey& key) const {
     if (!valid()) return std::nullopt;
 
@@ -40,6 +40,7 @@ cc_test(
     xllm_kernels
     c_sec
     atb
+    opapi
     spdlog::spdlog
 )
 
@@ -55,6 +56,7 @@ cc_test(
     xllm_kernels
     c_sec
     atb
+    opapi
     spdlog::spdlog
 )
 
@@ -70,6 +72,7 @@ cc_test(
     xllm_kernels
     c_sec
     atb
+    opapi
     spdlog::spdlog
 )
 
@@ -85,6 +88,7 @@ cc_test(
     xllm_kernels
     c_sec
     atb
+    opapi
     spdlog::spdlog
 )
 
@@ -100,5 +104,6 @@ cc_test(
     xllm_kernels
     c_sec
     atb
+    opapi
     spdlog::spdlog
 )
@@ -382,7 +382,12 @@ void Glm4MoeDecoderImpl::initialize_basic_parameters(
 
   param.mlpLinearTransposeType = {1, -1, 1, -1};
 
-  param.enableSplitFuse = (FLAGS_enable_chunked_prefill || FLAGS_enable_prefix_cache) && is_prefill;
+  param.enableSplitFuse =
+      (FLAGS_enable_chunked_prefill || FLAGS_enable_prefix_cache) && is_prefill;
+
+  // not support MTP model yet
+  param.enableAclGraph =
+      FLAGS_enable_acl_graph && !is_prefill && args.n_layers() > 1;
 
   param.moeLinearTransposeType = (layer_id_ < args.first_k_dense_replace())
                                      ? std::vector<int>{-1, -1, -1, -1}
@@ -406,7 +411,7 @@ void Glm4MoeDecoderImpl::initialize_basic_parameters(
   param.enableSwiGLUQuantForSharedExperts = false;  // TODO
 
   param.useQKNorm = args.use_qk_norm();
-  if(args.use_qk_norm()){
+  if (args.use_qk_norm()) {
     WEIGHT_COUNT_PER_LAYER = 70;
     WEIGHT_MAPPING_W8A8["self_attn.q_norm.weight"] = Q_NORM_WEIGHT;
     WEIGHT_MAPPING_W8A8["self_attn.k_norm.weight"] = K_NORM_WEIGHT;
@@ -1086,8 +1091,9 @@ torch::Tensor Glm4MoeDecoderImpl::forward(
     std::vector<std::atomic<bool>*> event_flag,
     int node_id) {
   atb::Status st;
-  if (input_params.decode_seq_range.second !=
-      input_params.q_seq_lens.size(0) - 1) {
+  bool is_prefill = input_params.decode_seq_range.second !=
+                    input_params.q_seq_lens.size(0) - 1;
+  if (is_prefill) {
     build_node_variant_pack(prefill_node_,
                             x,
                             cos_pos,
@@ -1200,6 +1206,13 @@ void Glm4MoeDecoderImpl::build_node_variant_pack(
   node.variantPack.inTensors.at(input_idx++) =
       atb_speed::Utils::AtTensor2Tensor(tensor_placeholder_);
 
+  if (FLAGS_enable_acl_graph && !is_prefill &&
+      input_params.graph_buffer.tiling_data.defined()) {
+    node.variantPack.inTensors.at(input_idx++) =
+        atb_speed::Utils::AtTensor2Tensor(
+            input_params.graph_buffer.tiling_data);
+  }
+
   for (size_t i = 0; i < WEIGHT_COUNT_PER_LAYER; ++i) {
     CHECK_THROW(node.inTensors.at(i) == nullptr,
                 model_name_ << " inTensor " << i << " is NULL");
 
@@ -170,7 +170,6 @@ class Glm4MoeDecoderImpl : public NpuBaseLayer {
                                const ModelInputParams& input_params,
                                torch::Tensor& expert_array,
                                bool is_prefill);
-
   std::string model_name_;
 
   int32_t device_id_;
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@ cc_binary(`
`17`	`17`	`ascendcl`
`18`	`18`	`nnopbase`
`19`	`19`	`atb`
	`20`	`+ atb_customize`
`20`	`21`	`c_sec`
`21`	`22`	`spdlog::spdlog`
`22`	`23`	`)`