 using Tensor = at::Tensor;
 
 namespace fbgemm_gpu {
+namespace {
+
+#ifdef USE_ROCM
+constexpr int kGroupIndexWarpSize = kWarpSize;
+#else
+constexpr int kGroupIndexWarpSize = kWarpSize;
+#endif
 
-// TODO: Update UNROLL_FACTOR
 constexpr int GROUP_INDEX_SELECT_UNROLL_FACTOR = 1;
 constexpr int GROUP_INDEX_SELECT_COLS_PER_WARP =
-    GROUP_INDEX_SELECT_UNROLL_FACTOR * kWarpSize;
-
-// GROUP_INDEX_SELECT_COLS_PER_WARP must be power of two
+    GROUP_INDEX_SELECT_UNROLL_FACTOR * kGroupIndexWarpSize;
 constexpr int GROUP_INDEX_SELECT_LOG_COLS_PER_WARP =
     log2_calc<GROUP_INDEX_SELECT_COLS_PER_WARP>::value;
 
-int get_group_index_select_cols_per_warp() {
-  return GROUP_INDEX_SELECT_COLS_PER_WARP;
-}
+#ifdef USE_ROCM
 
 template <
     typename index_t,
@@ -40,17 +42,16 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
     const int64_t* indices_ptrs,
     const int64_t* warp_offsets_group,
     const int32_t* num_cols_group,
-    const int64_t num_work_rows, // number of rows to work on per member
+    const int64_t num_work_rows,
     const int64_t group_size) {
   const auto total_num_warps = warp_offsets_group[group_size];
-  // USE_INDEX_SELECT is a template argument; the compiler prunes the unused branch.
   if (USE_INDEX_SELECT) {
     for (int64_t warp_id = threadIdx.y * gridDim.x + blockIdx.x;
          warp_id < total_num_warps;
          warp_id += gridDim.x * blockDim.y) {
       int32_t member_id, member_warp_id, num_cols, warps_per_row;
       if (USE_VAR_COLS) {
-        __shared__ int member_ids[kMaxThreads / kWarpSize];
+        __shared__ int member_ids[kMaxThreads / kGroupIndexWarpSize];
         if (threadIdx.x == 0) {
           binary_search_range(
               &member_ids[threadIdx.y],
@@ -64,7 +65,6 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
         warps_per_row = (num_cols + COLS_PER_WARP - 1) >> LOG_COLS_PER_WARP;
         member_warp_id = warp_id - warp_offsets_group[member_id];
       } else {
-        // All columns are the same
         num_cols = num_cols_group[0];
         warps_per_row = (num_cols + COLS_PER_WARP - 1) >> LOG_COLS_PER_WARP;
         member_id = warp_id / (warps_per_row * num_work_rows);
@@ -78,7 +78,6 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
           reinterpret_cast<scalar_t*>(input_ptrs[member_id]) + col_offset;
       scalar_t* output =
           reinterpret_cast<scalar_t*>(output_ptrs[member_id]) + col_offset;
-
       index_t* indices = reinterpret_cast<index_t*>(indices_ptrs[member_id]);
       const index_t idx = indices[row];
 #pragma unroll
@@ -87,8 +86,6 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
       }
     }
   } else {
-    // Cache a handful of scatter destinations per warp so we can merge
-    // consecutive updates that hit the same index before touching global memory.
     constexpr int kCacheSlots = 2;
     index_t cached_idx[kCacheSlots];
     scalar_t cached_vals[kCacheSlots][UNROLL_FACTOR];
@@ -135,7 +132,7 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
          warp_id += gridDim.x * blockDim.y) {
       int32_t member_id, member_warp_id, num_cols, warps_per_row;
       if (USE_VAR_COLS) {
-        __shared__ int member_ids[kMaxThreads / kWarpSize];
+        __shared__ int member_ids[kMaxThreads / kGroupIndexWarpSize];
         if (threadIdx.x == 0) {
           binary_search_range(
               &member_ids[threadIdx.y],
@@ -149,7 +146,6 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
         warps_per_row = (num_cols + COLS_PER_WARP - 1) >> LOG_COLS_PER_WARP;
         member_warp_id = warp_id - warp_offsets_group[member_id];
       } else {
-        // All columns are the same
         num_cols = num_cols_group[0];
         warps_per_row = (num_cols + COLS_PER_WARP - 1) >> LOG_COLS_PER_WARP;
         member_id = warp_id / (warps_per_row * num_work_rows);
@@ -258,6 +254,88 @@ __launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
   }
 }
 
+#else // !USE_ROCM
+
+template <
+    typename index_t,
+    typename scalar_t,
+    bool USE_INDEX_SELECT,
+    bool USE_VAR_COLS,
+    int UNROLL_FACTOR,
+    int COLS_PER_WARP,
+    int LOG_COLS_PER_WARP>
+__global__
+__launch_bounds__(kMaxThreads) void group_index_select_or_add_2d_kernel(
+    const int64_t* input_ptrs,
+    const int64_t* output_ptrs,
+    const int64_t* indices_ptrs,
+    const int64_t* warp_offsets_group,
+    const int32_t* num_cols_group,
+    const int64_t num_work_rows,
+    const int64_t group_size) {
+  const auto total_num_warps = warp_offsets_group[group_size];
+  int32_t num_cols = 0;
+  int32_t warps_per_row = 0;
+
+  if constexpr (!USE_VAR_COLS) {
+    num_cols = num_cols_group[0];
+    warps_per_row = (num_cols + COLS_PER_WARP - 1) >> LOG_COLS_PER_WARP;
+  }
+
+  for (int64_t warp_id = threadIdx.y * gridDim.x + blockIdx.x;
+       warp_id < total_num_warps;
+       warp_id += gridDim.x * blockDim.y) {
+    int32_t member_id = 0;
+    int32_t member_warp_id = 0;
+    if constexpr (USE_VAR_COLS) {
+      __shared__ int member_ids[kMaxThreads / kGroupIndexWarpSize];
+      if (threadIdx.x == 0) {
+        binary_search_range(
+            &member_ids[threadIdx.y],
+            warp_offsets_group + 1,
+            warp_id,
+            group_size);
+      }
+      syncwarp();
+      member_id = member_ids[threadIdx.y];
+      num_cols = num_cols_group[member_id];
+      warps_per_row = (num_cols + COLS_PER_WARP - 1) >> LOG_COLS_PER_WARP;
+      member_warp_id = warp_id - warp_offsets_group[member_id];
+    } else {
+      member_id = warp_id / (warps_per_row * num_work_rows);
+      member_warp_id = warp_id - (member_id * warps_per_row * num_work_rows);
+    }
+    const auto row = member_warp_id / warps_per_row;
+    const auto col_offset =
+        ((member_warp_id % warps_per_row) << LOG_COLS_PER_WARP) +
+        (threadIdx.x * UNROLL_FACTOR);
+    scalar_t* input =
+        reinterpret_cast<scalar_t*>(input_ptrs[member_id]) + col_offset;
+    scalar_t* output =
+        reinterpret_cast<scalar_t*>(output_ptrs[member_id]) + col_offset;
+
+    index_t* indices = reinterpret_cast<index_t*>(indices_ptrs[member_id]);
+    const index_t idx = indices[row];
+#pragma unroll
+    for (int i = 0; i < UNROLL_FACTOR && col_offset + i < num_cols; i++) {
+      if constexpr (USE_INDEX_SELECT) {
+        output[row * num_cols + i] = LDG(&input[idx * num_cols + i]);
+      } else {
+        gpuAtomicAddNoReturn(
+            &output[idx * num_cols + i], input[row * num_cols + i]);
+      }
+    }
+  }
+}
+
+#endif // USE_ROCM
+
+} // namespace
+
+int get_group_index_select_cols_per_warp() {
+  return GROUP_INDEX_SELECT_COLS_PER_WARP;
+}
+
 DLL_PUBLIC void group_index_select_or_add_cuda(
     const int64_t* input_ptrs,
     const int64_t* output_ptrs,
@@ -278,36 +356,15 @@ DLL_PUBLIC void group_index_select_or_add_cuda(
 
   at::cuda::OptionalCUDAGuard device_guard(device);
 
-  // Partition work based on num_work_rows
-  uint32_t num_warps_per_threadblock = kMaxThreads / kWarpSize;
+  uint32_t num_warps_per_threadblock = kMaxThreads / kGroupIndexWarpSize;
   uint32_t max_grid_size =
       at::cuda::getCurrentDeviceProperties()->multiProcessorCount * 8;
   uint32_t grid_size = std::min(
       cuda_calc_xblock_count(total_num_warps, num_warps_per_threadblock),
       max_grid_size);
-  dim3 block_size(kWarpSize, num_warps_per_threadblock, 1);
-
-#define INVOKE_GROUP_INDEX_SELECT_OR_ADD(USE_INDEX_SELECT, USE_VAR_COLS) \
-  FBGEMM_LAUNCH_KERNEL(                                                  \
-      (group_index_select_or_add_2d_kernel<                              \
-          index_t,                                                       \
-          scalar_t,                                                      \
-          USE_INDEX_SELECT,                                              \
-          USE_VAR_COLS,                                                  \
-          GROUP_INDEX_SELECT_UNROLL_FACTOR,                              \
-          GROUP_INDEX_SELECT_COLS_PER_WARP,                              \
-          GROUP_INDEX_SELECT_LOG_COLS_PER_WARP>),                        \
-      grid_size,                                                         \
-      block_size,                                                        \
-      0,                                                                 \
-      at::cuda::getCurrentCUDAStream(),                                  \
-      input_ptrs,                                                        \
-      output_ptrs,                                                       \
-      indices_ptrs,                                                      \
-      warp_offsets_group,                                                \
-      num_cols_group,                                                    \
-      num_work_rows,                                                     \
-      group_size)
+  dim3 block_size(kGroupIndexWarpSize, num_warps_per_threadblock, 1);
+
+#define INVOKE_GROUP_INDEX_SELECT_OR_ADD(USE_INDEX_SELECT_FLAG, USE_VAR_COLS_FLAG) FBGEMM_LAUNCH_KERNEL((group_index_select_or_add_2d_kernel<index_t, scalar_t, USE_INDEX_SELECT_FLAG, USE_VAR_COLS_FLAG, GROUP_INDEX_SELECT_UNROLL_FACTOR, GROUP_INDEX_SELECT_COLS_PER_WARP, GROUP_INDEX_SELECT_LOG_COLS_PER_WARP>), grid_size, block_size, 0, at::cuda::getCurrentCUDAStream(), input_ptrs, output_ptrs, indices_ptrs, warp_offsets_group, num_cols_group, num_work_rows, group_size)
 
   AT_DISPATCH_INDEX_TYPES(
       indices_scalar_type, "group_index_select_2d_wrapper_1", [&] {
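
For readers following the kernel's indexing math, here is a minimal host-side sketch (not part of the patch) of how a flattened warp_id is decomposed into (member, row, column) coordinates in the fixed-column (!USE_VAR_COLS) path. The concrete values (warp size 32, num_cols = 70, num_work_rows = 4, group_size = 3) and the standalone main() are assumptions chosen only for the walkthrough; the real kernel derives everything from its arguments and from GROUP_INDEX_SELECT_COLS_PER_WARP. The shift by the log constant is valid because COLS_PER_WARP is UNROLL_FACTOR (= 1) times the warp size, i.e. a power of two.

// Illustrative sketch only: mirrors, on the host, the warp_id ->
// (member_id, row, col_offset) arithmetic used by the kernel when all
// group members have the same number of columns.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example values (CUDA warp size 32).
  constexpr int kColsPerWarp = 32;   // GROUP_INDEX_SELECT_COLS_PER_WARP
  constexpr int kLogColsPerWarp = 5; // log2(kColsPerWarp)
  const int32_t num_cols = 70;       // columns per member (same for all)
  const int64_t num_work_rows = 4;   // rows each member works on
  const int64_t group_size = 3;

  // warps_per_row = ceil(num_cols / kColsPerWarp), done with a shift as in
  // the kernel.
  const int32_t warps_per_row =
      (num_cols + kColsPerWarp - 1) >> kLogColsPerWarp;

  const int64_t total_num_warps = group_size * warps_per_row * num_work_rows;
  for (int64_t warp_id = 0; warp_id < total_num_warps; ++warp_id) {
    // Which group member this warp belongs to, and its warp index within
    // that member's slice of the work.
    const int64_t member_id = warp_id / (warps_per_row * num_work_rows);
    const int64_t member_warp_id =
        warp_id - member_id * warps_per_row * num_work_rows;
    // Row handled by this warp and the first column handled by lane 0;
    // inside the kernel, lane x starts at col_offset + x * UNROLL_FACTOR.
    const int64_t row = member_warp_id / warps_per_row;
    const int64_t col_offset =
        (member_warp_id % warps_per_row) << kLogColsPerWarp;
    const int64_t col_end =
        std::min<int64_t>(col_offset + kColsPerWarp, num_cols);
    std::printf(
        "warp %2lld -> member %lld, row %lld, cols [%lld, %lld)\n",
        (long long)warp_id, (long long)member_id, (long long)row,
        (long long)col_offset, (long long)col_end);
  }
  return 0;
}

The variable-column path computes the same quantities, but first binary-searches warp_offsets_group to find which member a warp belongs to, since warps_per_row then differs per member.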