6 changes: 4 additions & 2 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -304,8 +304,10 @@ __configure_fbgemm_gpu_build_cuda () {
local arch_list="7.5;8.0"
fi

elif [[ $cuda_version_nvcc == *"V13.0"* ]] ||
[[ $cuda_version_nvcc == *"V12.9"* ]] ||
elif [[ $cuda_version_nvcc == *"V13.0"* ]]; then
local arch_list="8.0;9.0a;10.0a;12.0a"

elif [[ $cuda_version_nvcc == *"V12.9"* ]] ||
[[ $cuda_version_nvcc == *"V12.8"* ]]; then
local arch_list="7.5;8.0;9.0a;10.0a;12.0a"

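For context, the arch list chosen by this branch is the set of CUDA compute capabilities the FBGEMM GPU build targets, and the `a` suffix (as in `9.0a`) opts into architecture-specific features of that generation; with nvcc 13.0 the list here starts at 8.0. The sketch below is hypothetical and not part of this PR: it only illustrates how device code typically branches on __CUDA_ARCH__ so that each entry in such a list gets its own compiled path.

    // Hypothetical illustration (not from this PR): code compiled for an arch
    // list like "8.0;9.0a;10.0a;12.0a" can branch on __CUDA_ARCH__ so that a
    // per-architecture path only exists in the binaries built for that entry.
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void arch_report_kernel() {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
      // Only present when compiling for the 9.0a / 10.0a / 12.0a entries.
      if (threadIdx.x == 0) {
        printf("running an sm_90-or-newer build of this kernel\n");
      }
    #else
      // Fallback path for the older entries such as 8.0.
      if (threadIdx.x == 0) {
        printf("running a pre-sm_90 build of this kernel\n");
      }
    #endif
    }

    int main() {
      arch_report_kernel<<<1, 32>>>();
      cudaDeviceSynchronize();
      return 0;
    }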
2 changes: 1 addition & 1 deletion .github/scripts/generate_ci_matrix.py
@@ -307,7 +307,7 @@ def cuda_versions(self) -> List[str]:
return ["12.6.3", "12.8.1", "13.0.2"]
else:
# GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
return ["12.6.3", "12.8.1"]
return ["12.6.3", "12.8.1", "13.0.2"]

def rocm_versions(self) -> List[str]:
if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
27 changes: 22 additions & 5 deletions cmake/modules/GpuCppLibrary.cmake
@@ -87,15 +87,32 @@ function(prepare_target_sources)
list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
endif()

-# Set source properties
-set_source_files_properties(${${args_PREFIX}_sources_cu}
-PROPERTIES COMPILE_OPTIONS
-"${args_NVCC_FLAGS}")

# Set include directories
set_source_files_properties(${${args_PREFIX}_sources_cu}
PROPERTIES INCLUDE_DIRECTORIES
"${args_INCLUDE_DIRS}")

+# Starting with CUDA 13.0, nvcc changed the default visibility of
+# __global__ functions to `hidden`, which causes symbol lookup errors
+# during linking. This can be worked around by setting -cudart=shared
+# and --device-entity-has-hidden-visibility=false.
+#
+# https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/
+if( (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND
+(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") )
+set(_nvcc_flags ${args_NVCC_FLAGS}
+-cudart=shared
+-static-global-template-stub=false
+--device-entity-has-hidden-visibility=false)
+else()
+set(_nvcc_flags ${args_NVCC_FLAGS})
+endif()
+
+# Set compilation flags
+set_source_files_properties(${${args_PREFIX}_sources_cu}
+PROPERTIES COMPILE_OPTIONS
+"${_nvcc_flags}")

# Append to the full sources list
list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu})
endif()
@@ -57,7 +57,7 @@ template <
typename index_t,
size_t kThreadGroupSize
>
-__launch_bounds__(kForwardMaxThreads) __global__ void
+__launch_bounds__(kForwardMaxThreads) __global__ __attribute__((visibility("default"))) void
{%- if is_index_select %}
batch_index_select_dim0_codegen_forward_small_kernel(
{%- else %}
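This annotation and the nvcc flags added in GpuCppLibrary.cmake address the same CUDA 13.0 change from two sides: __global__ functions now default to hidden ELF visibility, so kernels that must be resolvable across shared-library boundaries need either the compiler-level opt-out or an explicit visibility attribute. Below is a minimal sketch assuming a GCC/Clang-style toolchain; the kernel and macro names are hypothetical, not FBGEMM code.

    // Hypothetical example: declare a kernel with explicit default visibility,
    // mirroring the annotation added to the generated FBGEMM kernel above.
    #include <cuda_runtime.h>

    #if defined(__GNUC__) && !defined(_WIN32)
    #define EXAMPLE_KERNEL_VISIBLE __attribute__((visibility("default")))
    #else
    #define EXAMPLE_KERNEL_VISIBLE
    #endif

    __launch_bounds__(256) __global__ EXAMPLE_KERNEL_VISIBLE void scale_kernel(
        float* data, float alpha, int n) {
      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < n) {
        data[idx] *= alpha;
      }
    }

    int main() {
      constexpr int kN = 1 << 16;
      float* data = nullptr;
      cudaMallocManaged(&data, kN * sizeof(float));
      for (int i = 0; i < kN; ++i) data[i] = 1.0f;
      // Launching from the same translation unit works either way; visibility
      // only matters when the kernel symbol is resolved across libraries.
      scale_kernel<<<(kN + 255) / 256, 256>>>(data, 2.0f, kN);
      cudaDeviceSynchronize();
      cudaFree(data);
      return 0;
    }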
40 changes: 36 additions & 4 deletions fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -144,6 +144,13 @@ std::tuple<void*, size_t> adjust_to_page_boundaries(void* ptr, size_t size) {
return std::make_tuple((void*)raw_ptr_adjusted, (size_t)size_adjusted);
}

+cudaMemLocation new_mem_location_from_device(const int device_id) {
+cudaMemLocation deviceLoc;
+deviceLoc.type = cudaMemLocationTypeDevice;
+deviceLoc.id = device_id;
+return deviceLoc;
+}

} // namespace

Tensor new_managed_tensor(
@@ -158,11 +165,25 @@ Tensor new_managed_tensor(

// Set preferred memory location to host memory
AT_CUDA_CHECK(cudaMemAdvise(
-ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
+ptr, size_bytes, cudaMemAdviseSetPreferredLocation,
+#if CUDA_VERSION >= 13000
+new_mem_location_from_device(cudaCpuDeviceId)
+#else
+cudaCpuDeviceId
+#endif
+));

// User hints with "accessed by": GPU will establish direct mapping of data
// in CPU memory, no page faults will be generated
AT_CUDA_CHECK(cudaMemAdvise(
-ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device()));
+ptr, size_bytes, cudaMemAdviseSetAccessedBy,
+#if CUDA_VERSION >= 13000
+new_mem_location_from_device(at::cuda::current_device())
+#else
+at::cuda::current_device()
+#endif
+));

C10_CUDA_KERNEL_LAUNCH_CHECK();

// Work around fork issue - see uvm_mem_advice_dont_fork for details
@@ -353,7 +374,12 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
ptr,
size_bytes,
static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
-hint_device));
+#if CUDA_VERSION >= 13000
+new_mem_location_from_device(hint_device)
+#else
+hint_device
+#endif
+));
return;
}

@@ -379,7 +405,13 @@ void uvm_cuda_mem_prefetch_async(

auto stream = at::cuda::getCurrentCUDAStream();

-AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream));
+AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes,
+#if CUDA_VERSION >= 13000
+new_mem_location_from_device(prefetch_device), 0,
+#else
+prefetch_device,
+#endif
+stream));

return;
}
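The version gates above exist because CUDA 13.0 changes cudaMemAdvise and cudaMemPrefetchAsync to take a cudaMemLocation (and, for the prefetch, an extra flags argument) in place of a bare device id. The following is a self-contained sketch of the same pattern on a plain managed allocation; it assumes only the calls that appear in this diff, and the helper simply mirrors new_mem_location_from_device().

    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <cstdio>

    // Illustrative helper mirroring new_mem_location_from_device() from the
    // diff: wrap a device id (or cudaCpuDeviceId) in the cudaMemLocation
    // struct that the CUDA 13 advise/prefetch APIs expect.
    static cudaMemLocation mem_location_from_device(const int device_id) {
      cudaMemLocation loc;
      loc.type = cudaMemLocationTypeDevice;
      loc.id = device_id;
      return loc;
    }

    int main() {
      constexpr size_t kBytes = 1 << 20;
      void* ptr = nullptr;
      cudaStream_t stream = nullptr;  // default stream
      cudaMallocManaged(&ptr, kBytes);

      // Prefer keeping the pages resident on the host, as new_managed_tensor()
      // does in the diff above.
    #if CUDA_VERSION >= 13000
      cudaMemAdvise(ptr, kBytes, cudaMemAdviseSetPreferredLocation,
                    mem_location_from_device(cudaCpuDeviceId));
    #else
      cudaMemAdvise(ptr, kBytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
    #endif

      // Prefetch to device 0; the CUDA 13 overload also takes a flags argument.
    #if CUDA_VERSION >= 13000
      cudaMemPrefetchAsync(ptr, kBytes, mem_location_from_device(0), 0, stream);
    #else
      cudaMemPrefetchAsync(ptr, kBytes, /*dstDevice=*/0, stream);
    #endif

      cudaDeviceSynchronize();
      cudaFree(ptr);
      std::printf("done\n");
      return 0;
    }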
6 changes: 5 additions & 1 deletion fbgemm_gpu/src/sparse_ops/sparse_async_batched_cumsum.cu
@@ -72,7 +72,11 @@ __global__ __launch_bounds__(kMaxThreads) void _batched_complete_cumsum_kernel(
data = (val_t)values[blockIdx.x][i];
}
BlockScan(temp_storage).InclusiveSum(data, data, prefix_op);
-cub::CTA_SYNC();
+#if CUDA_VERSION >= 13000
+__syncthreads();
+#else
+cub::CTA_SYNC();
+#endif
if (i < len) {
out[blockIdx.x][i + 1] = data;
}
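The guard above is needed because the CUB bundled with CUDA 13.0 no longer provides cub::CTA_SYNC(), which was effectively a wrapper around __syncthreads(); the barrier itself is still required so that every thread finishes with the shared BlockScan temp_storage before the next tile reuses it. Below is a simplified, hypothetical sketch of the same batched-cumsum structure, carrying the running total by hand instead of using a prefix callback.

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    constexpr int kThreads = 256;

    // Hypothetical, simplified analogue of the batched cumsum kernel: one block
    // scans `len` elements in tiles of kThreads, carrying the running total.
    __global__ void block_inclusive_cumsum(const int* in, int* out, int len) {
      using BlockScan = cub::BlockScan<int, kThreads>;
      __shared__ typename BlockScan::TempStorage temp_storage;

      int running_total = 0;
      for (int base = 0; base < len; base += kThreads) {
        const int i = base + threadIdx.x;
        int data = (i < len) ? in[i] : 0;
        int tile_sum = 0;
        BlockScan(temp_storage).InclusiveSum(data, data, tile_sum);
        data += running_total;
        // Block-wide barrier before the next tile reuses temp_storage; this is
        // the role cub::CTA_SYNC() played, and __syncthreads() fills on CUDA 13.
        __syncthreads();
        running_total += tile_sum;
        if (i < len) {
          out[i] = data;
        }
      }
    }

    int main() {
      constexpr int kLen = 1000;
      int *in = nullptr, *out = nullptr;
      cudaMallocManaged(&in, kLen * sizeof(int));
      cudaMallocManaged(&out, kLen * sizeof(int));
      for (int i = 0; i < kLen; ++i) in[i] = 1;
      block_inclusive_cumsum<<<1, kThreads>>>(in, out, kLen);
      cudaDeviceSynchronize();
      // With all-ones input, out[i] == i + 1.
      cudaFree(in);
      cudaFree(out);
      return 0;
    }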