
Commit 742b1bf

feat: add process group for cuda device. (#282)
1 parent fcec9c9 commit 742b1bf

12 files changed (+194 -54 lines)

xllm/core/distributed_runtime/worker_server.cpp

Lines changed: 2 additions & 2 deletions
@@ -95,8 +95,8 @@ void WorkerServer::create_server(const runtime::Options& options,

   CollectiveCommunicator comm(worker_global_rank, world_size, dp_size, ep_size);
   const ParallelArgs* parallel_args = comm.parallel_args();
-#if defined(USE_MLU)
-  comm.create_process_groups_cncl(master_node_addr, device);
+#if defined(USE_MLU) || defined(USE_CUDA)
+  comm.create_process_groups(master_node_addr, device);
 #endif

   WorkerType worker_type =

xllm/core/framework/parallel_state/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -11,12 +11,14 @@ cc_library(
     process_group.h
     $<$<BOOL:${USE_NPU}>:npu_process_group.h>
     $<$<BOOL:${USE_MLU}>:mlu_process_group.h>
+    $<$<BOOL:${USE_CUDA}>:cuda_process_group.h>
     collective_communicator.h
   SRCS
     mapping_npu.cpp
     parallel_state.cpp
     $<$<BOOL:${USE_NPU}>:npu_process_group.cpp>
     $<$<BOOL:${USE_MLU}>:mlu_process_group.cpp>
+    $<$<BOOL:${USE_CUDA}>:cuda_process_group.cpp>
     collective_communicator.cpp
   DEPS
     :common

xllm/core/framework/parallel_state/collective_communicator.cpp

Lines changed: 38 additions & 13 deletions
@@ -25,11 +25,35 @@ limitations under the License.
 #include <torch_mlu/csrc/framework/distributed/process_group_cncl.hpp>

 #include "mlu_process_group.h"
+#elif defined(USE_CUDA)
+#include "cuda_process_group.h"
 #endif
 #include "common/global_flags.h"
 #include "parallel_args.h"
 #include "util/net.h"

+namespace {
+std::unique_ptr<xllm::ProcessGroup> create_process_group(
+    int rank,
+    int world_size,
+    int rank_size,
+    int port,
+    const std::string& host,
+    const std::string& group_name,
+    const torch::Device& device) {
+#if defined(USE_MLU)
+  return std::make_unique<xllm::ProcessGroupCncl>(
+      rank, world_size, rank_size, port, host, group_name, device);
+#elif defined(USE_CUDA)
+  return std::make_unique<xllm::ProcessGroupNccl>(
+      rank, world_size, rank_size, port, host, group_name, device);
+#else
+  LOG(FATAL) << "Unsupported device type";
+  return nullptr;
+#endif
+}
+}  // namespace
+
 namespace xllm {

 CollectiveCommunicator::CollectiveCommunicator(int global_rank,
@@ -90,40 +114,41 @@ CollectiveCommunicator::CollectiveCommunicator(int global_rank,
       mapping,
       dispatchAndCombinecommDomain,
       dispatchAndCombineHcclComm);
-#elif defined(USE_MLU)
+#else
   parallel_args_ = std::make_unique<ParallelArgs>(
       global_rank, world_size, dp_size, nullptr, ep_size);
 #endif
 }

-#if defined(USE_MLU)
-void CollectiveCommunicator::create_process_groups_cncl(
+void CollectiveCommunicator::create_process_groups(
     const std::string& master_addr,
     const torch::Device& device) {
   std::string host;
   int port;
   net::parse_host_port_from_addr(master_addr, host, port);

-  std::vector<std::unique_ptr<ProcessGroup>> process_groups;
   int global_rank = parallel_args_->rank();
   int world_size = parallel_args_->world_size();
   int dp_size = parallel_args_->dp_size();
-  process_group_ = std::make_unique<ProcessGroupCncl>(
+
+  process_group_ = create_process_group(
       global_rank, world_size, world_size, ++port, host, "world_group", device);
+
   int tp_size = world_size / dp_size;
   CHECK_EQ(tp_size * dp_size, world_size);
   int port_offset = global_rank / tp_size + 1;
-  tp_group_ = std::make_unique<ProcessGroupCncl>(global_rank,
-                                                 world_size,
-                                                 tp_size,
-                                                 port + port_offset,
-                                                 host,
-                                                 "tp_group",
-                                                 device);
+
+  tp_group_ = create_process_group(global_rank,
+                                   world_size,
+                                   tp_size,
+                                   port + port_offset,
+                                   host,
+                                   "tp_group",
+                                   device);
+
   parallel_args_->process_group_ = process_group_.get();
   parallel_args_->tp_group_ = tp_group_.get();
 }
-#endif

 const ParallelArgs* CollectiveCommunicator::parallel_args() {
   // TODO: init communicator
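
Note: a minimal caller-side sketch of the now device-agnostic flow (not part of the commit). The rank/size values, master address, device index, and include path are illustrative assumptions; the real call site is WorkerServer::create_server shown above.

#include <torch/torch.h>

#include "framework/parallel_state/collective_communicator.h"  // assumed include path

void init_worker_comm() {
  const int global_rank = 0;  // illustrative values
  const int world_size = 4;
  const int dp_size = 1;
  const int ep_size = 1;

  xllm::CollectiveCommunicator comm(global_rank, world_size, dp_size, ep_size);
  const xllm::ParallelArgs* parallel_args = comm.parallel_args();

#if defined(USE_CUDA)
  // Unified entry point; create_process_group() above selects the NCCL
  // implementation at compile time for CUDA builds.
  comm.create_process_groups("10.0.0.1:29500", torch::Device(torch::kCUDA, 0));
#endif
  (void)parallel_args;  // used by the worker to build model-parallel state
}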

xllm/core/framework/parallel_state/collective_communicator.h

Lines changed: 2 additions & 6 deletions
@@ -31,10 +31,8 @@ class CollectiveCommunicator {
                          int ep_size);
   ~CollectiveCommunicator() = default;

-#if defined(USE_MLU)
-  void create_process_groups_cncl(const std::string& master_addr,
-                                  const torch::Device& device);
-#endif
+  void create_process_groups(const std::string& master_addr,
+                             const torch::Device& device);

   // init communicator and return parallel args.
   const ParallelArgs* parallel_args();
@@ -43,9 +41,7 @@
   std::unique_ptr<ParallelArgs> parallel_args_;
   std::unique_ptr<ProcessGroup> process_group_;
   std::unique_ptr<ProcessGroup> dp_local_process_group_;
-#if defined(USE_MLU)
   std::unique_ptr<ProcessGroup> tp_group_;
-#endif
 };

 }  // namespace xllm
xllm/core/framework/parallel_state/cuda_process_group.cpp

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "cuda_process_group.h"
+
+#include <torch/csrc/distributed/c10d/TCPStore.hpp>
+
+#include "parallel_state.h"
+
+namespace xllm {
+
+ProcessGroupNccl::ProcessGroupNccl(int rank,
+                                   int world_size,
+                                   int rank_size,
+                                   int port,
+                                   const std::string& host,
+                                   const std::string& group_name,
+                                   const torch::Device& device)
+    : ProcessGroup(rank, rank_size, device),
+      world_size_(rank_size),
+      rank_(rank) {
+  c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> nccl_pg_options =
+      c10d::ProcessGroupNCCL::Options::create();
+  nccl_pg_options->is_high_priority_stream = false;
+
+  if (world_size != rank_size) {
+    auto [local_rank, group_ranks] =
+        parallel_state::get_group_rank(world_size, rank, rank_size);
+    nccl_pg_options->global_ranks_in_group = group_ranks;
+    rank_ = local_rank;
+  }
+
+  c10d::TCPStoreOptions tcp_options;
+  tcp_options.isServer = (rank_ == 0);
+  tcp_options.port = port;
+
+  c10::intrusive_ptr<c10d::Store> store =
+      c10::make_intrusive<c10d::TCPStore>(host, tcp_options);
+  nccl_pg_ = std::make_unique<c10d::ProcessGroupNCCL>(
+      store, rank_, rank_size, nccl_pg_options);
+}
+
+ProcessGroupNccl::~ProcessGroupNccl() { nccl_pg_->shutdown(); }
+
+void ProcessGroupNccl::allreduce(torch::Tensor& input) {
+  std::vector<torch::Tensor> input_tensors = {input};
+  nccl_pg_->allreduce(input_tensors)->wait();
+}
+
+void ProcessGroupNccl::allgather(torch::Tensor input,
+                                 std::vector<torch::Tensor>& outputs) {
+  std::vector<torch::Tensor> input_tensors = {input};
+  std::vector<std::vector<torch::Tensor>> output_tensors = {outputs};
+  nccl_pg_->allgather(output_tensors, input_tensors)->wait();
+}
+
+}  // namespace xllm
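
Note: a minimal usage sketch for the new ProcessGroupNccl (not part of the commit). Each rank would run this in its own process; the host, port, tensor shape, and include path are illustrative assumptions.

#include <vector>

#include <torch/torch.h>

#include "framework/parallel_state/cuda_process_group.h"  // assumed include path

void run_rank(int rank) {
  const int world_size = 2;
  torch::Device device(torch::kCUDA, rank);

  // world_size == rank_size here, so no sub-group splitting via get_group_rank();
  // rank 0 hosts the TCPStore (isServer is set when rank_ == 0), rank 1 connects.
  xllm::ProcessGroupNccl pg(rank,
                            world_size,
                            /*rank_size=*/world_size,
                            /*port=*/29501,
                            /*host=*/"127.0.0.1",
                            "world_group",
                            device);

  // Each rank contributes ones; after allreduce every rank holds a tensor of 2s.
  torch::Tensor t = torch::ones({4}, torch::TensorOptions().device(device));
  pg.allreduce(t);

  // allgather collects one tensor per rank into pre-allocated outputs.
  std::vector<torch::Tensor> outputs;
  for (int i = 0; i < world_size; ++i) {
    outputs.push_back(torch::empty_like(t));
  }
  pg.allgather(t, outputs);
}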

xllm/core/framework/parallel_state/cuda_process_group.h

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
+
+#include "process_group.h"
+
+namespace xllm {
+
+class ProcessGroupNccl : public ProcessGroup {
+ public:
+  ProcessGroupNccl(int rank,
+                   int world_size,
+                   int rank_size,
+                   int port,
+                   const std::string& host,
+                   const std::string& group_name,
+                   const torch::Device& device);
+
+  ~ProcessGroupNccl() override;
+
+  void allreduce(torch::Tensor& input) override;
+
+  void allgather(torch::Tensor input,
+                 std::vector<torch::Tensor>& outputs) override;
+
+ private:
+  // rank of current process
+  int rank_ = 0;
+
+  // number of processes
+  int world_size_ = 0;
+
+  // nccl process group
+  std::unique_ptr<c10d::ProcessGroupNCCL> nccl_pg_;
+};
+
+}  // namespace xllm

xllm/core/framework/parallel_state/mlu_process_group.cpp

Lines changed: 2 additions & 19 deletions
@@ -17,23 +17,7 @@ limitations under the License.

 #include <torch/csrc/distributed/c10d/TCPStore.hpp>

-namespace {
-
-std::pair<int, std::vector<uint64_t>> get_group_rank(int world_size,
-                                                     int global_rank,
-                                                     int split_size) {
-  int target_group_index = global_rank / split_size;
-  uint64_t start_rank = target_group_index * split_size;
-  uint64_t end_rank = start_rank + split_size;
-  std::vector<uint64_t> group_rank;
-  int index = global_rank - start_rank;
-  for (uint64_t rank = start_rank; rank < end_rank; rank++) {
-    group_rank.push_back(rank);
-  }
-  return {index, group_rank};
-}
-
-}  // namespace
+#include "parallel_state.h"

 namespace xllm {

@@ -52,7 +36,7 @@ ProcessGroupCncl::ProcessGroupCncl(int rank,
   cncl_pg_options->group_name = group_name;
   if (world_size != rank_size) {
     auto [local_rank, group_ranks] =
-        get_group_rank(world_size, rank, rank_size);
+        parallel_state::get_group_rank(world_size, rank, rank_size);
     cncl_pg_options->global_ranks_in_group = group_ranks;
     rank_ = local_rank;
   }
@@ -67,7 +51,6 @@ ProcessGroupCncl::ProcessGroupCncl(int rank,
       store, rank, world_size, cncl_pg_options);
 }

-// Destructor.
 ProcessGroupCncl::~ProcessGroupCncl() { cncl_pg_->shutdown(); }

 void ProcessGroupCncl::allreduce(torch::Tensor& input) {

xllm/core/framework/parallel_state/mlu_process_group.h

Lines changed: 5 additions & 9 deletions
@@ -23,7 +23,6 @@ namespace xllm {

 class ProcessGroupCncl : public ProcessGroup {
  public:
-  // Constructor.
   ProcessGroupCncl(int rank,
                    int world_size,
                    int rank_size,
@@ -32,11 +31,6 @@ class ProcessGroupCncl : public ProcessGroup {
                    const std::string& group_name,
                    const torch::Device& device);

-  int rank() override { return rank_; }
-
-  int world_size() override { return world_size_; }
-
-  // Destructor.
   ~ProcessGroupCncl() override;

   void allreduce(torch::Tensor& input) override;
@@ -45,12 +39,14 @@ class ProcessGroupCncl : public ProcessGroup {
                  std::vector<torch::Tensor>& outputs) override;

  private:
-  std::shared_ptr<torch_mlu::ProcessGroupCNCL> cncl_pg_ = nullptr;
-  // rank of current process.
+  // rank of current process
   int rank_ = 0;

-  // number of processes.
+  // number of processes
   int world_size_ = 0;
+
+  // cncl process group
+  std::unique_ptr<torch_mlu::ProcessGroupCNCL> cncl_pg_;
 };

 }  // namespace xllm

xllm/core/framework/parallel_state/parallel_args.h

Lines changed: 1 addition & 2 deletions
@@ -111,11 +111,10 @@ struct ParallelArgs {

   // atb hccl dispatchAndCombineHcclComm
   PROPERTY(HcclComm, dispatchAndCombineHcclComm);
-#elif defined(USE_MLU)
+#endif
   ProcessGroup* tp_group_ = nullptr;
   ProcessGroup* moe_ep_group_ = nullptr;
   ProcessGroup* moe_tp_group_ = nullptr;
-#endif
 };

 }  // namespace xllm

xllm/core/framework/parallel_state/parallel_state.cpp

Lines changed: 14 additions & 0 deletions
@@ -125,5 +125,19 @@ std::vector<std::unique_ptr<ProcessGroup>> create_npu_process_groups(
 #endif
 }

+std::pair<int, std::vector<uint64_t>> get_group_rank(int world_size,
+                                                     int global_rank,
+                                                     int split_size) {
+  int target_group_index = global_rank / split_size;
+  uint64_t start_rank = target_group_index * split_size;
+  uint64_t end_rank = start_rank + split_size;
+  std::vector<uint64_t> group_rank;
+  int index = global_rank - start_rank;
+  for (uint64_t rank = start_rank; rank < end_rank; rank++) {
+    group_rank.push_back(rank);
+  }
+  return {index, group_rank};
+}
+
 }  // namespace parallel_state
 }  // namespace xllm
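
Note: get_group_rank was previously an anonymous-namespace helper in mlu_process_group.cpp and is now shared by the MLU and CUDA process groups. A small standalone check of its behavior (the function body is copied verbatim from the diff; the main() harness is illustrative):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Maps a global rank to its local rank and to the list of global ranks in its
// group of `split_size` consecutive members.
std::pair<int, std::vector<uint64_t>> get_group_rank(int world_size,
                                                     int global_rank,
                                                     int split_size) {
  int target_group_index = global_rank / split_size;
  uint64_t start_rank = target_group_index * split_size;
  uint64_t end_rank = start_rank + split_size;
  std::vector<uint64_t> group_rank;
  int index = global_rank - start_rank;
  for (uint64_t rank = start_rank; rank < end_rank; rank++) {
    group_rank.push_back(rank);
  }
  return {index, group_rank};
}

int main() {
  // world_size = 8 split into groups of 4: global rank 5 is local rank 1
  // inside the group {4, 5, 6, 7}.
  auto [local_rank, group_ranks] = get_group_rank(8, 5, 4);
  assert(local_rank == 1);
  assert((group_ranks == std::vector<uint64_t>{4, 5, 6, 7}));
  return 0;
}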
