1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -283,6 +283,7 @@ __configure_fbgemm_gpu_build_cuda () {
   # See:
   #
   # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  # https://kaixih.github.io/nvcc-options/
   # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list
   # https://github.com/NVIDIA/nvbench/discussions/129
   # https://github.com/vllm-project/vllm/blob/main/CMakeLists.txt#L187
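The references above describe how `-gencode arch=compute_XY,code=sm_XY` pairs select the SM architectures nvcc emits code for. As a quick illustration (a standalone sketch with hypothetical file and kernel names, not part of this PR), device code can branch on the virtual architecture each `-gencode` pair compiles against via the `__CUDA_ARCH__` macro:

// arch_probe.cu -- a minimal sketch (hypothetical names).
// Compile with, e.g.: nvcc -gencode arch=compute_80,code=sm_80 arch_probe.cu
// __CUDA_ARCH__ is defined only during device compilation passes and holds
// the virtual architecture being compiled (e.g. 800 for compute_80).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void arch_probe(int* out) {
#if __CUDA_ARCH__ >= 900
  *out = 900;  // Hopper (sm_90) and newer
#elif __CUDA_ARCH__ >= 800
  *out = 800;  // Ampere (sm_80) and newer
#else
  *out = 0;    // older architectures
#endif
}

int main() {
  int* d_out = nullptr;
  int h_out = -1;
  cudaMalloc(&d_out, sizeof(int));
  arch_probe<<<1, 1>>>(d_out);
  cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("device code path: %d\n", h_out);
  cudaFree(d_out);
  return 0;
}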
4 changes: 2 additions & 2 deletions .github/scripts/generate_ci_matrix.py
@@ -304,10 +304,10 @@ def cuda_versions(self) -> List[str]:
             # FBGEMM HSTU is expensive, so conserve CI resources
             return ["12.8.1"]
         elif self.target == TARGET_GENAI:
-            return ["12.6.3", "12.8.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
 
     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
13 changes: 3 additions & 10 deletions .github/scripts/nova_dir.bash
@@ -50,16 +50,9 @@ elif [[ "$CU_VERSION" == "cu"* ]]; then
echo "################################################################################"


elif [[ "$CU_VERSION" == "rocm7.0"* ]]; then
export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201,gfx950"
echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"

elif [[ "$CU_VERSION" == "rocm6.4"* ]] ||
[[ "$CU_VERSION" == "rocm6.3"* ]]; then
export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"

elif [[ "$CU_VERSION" == "rocm6.4"* ]] ||
elif [[ "$CU_VERSION" == "rocm7.1"* ]] ||
[[ "$CU_VERSION" == "rocm7.0"* ]] ||
[[ "$CU_VERSION" == "rocm6.4"* ]] ||
[[ "$CU_VERSION" == "rocm6.3"* ]]; then
export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
2 changes: 1 addition & 1 deletion .github/scripts/utils_triton.bash
@@ -67,7 +67,7 @@ install_triton_pip () {
     return 1
   else
     echo "################################################################################"
-    echo "# Install PyTorch (PyTorch PIP)"
+    echo "# Install PyTorch-Triton (PyTorch PIP)"
     echo "#"
     echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
     echo "################################################################################"
24 changes: 22 additions & 2 deletions cmake/modules/GpuCppLibrary.cmake
@@ -87,14 +87,34 @@ function(prepare_target_sources)
         list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
     endif()
 
+    set_source_files_properties(${${args_PREFIX}_sources_cu}
+        PROPERTIES INCLUDE_DIRECTORIES
+        "${args_INCLUDE_DIRS}")
+
     # Set source properties
     set_source_files_properties(${${args_PREFIX}_sources_cu}
         PROPERTIES COMPILE_OPTIONS
         "${args_NVCC_FLAGS}")
+
+    # Starting with CUDA 13.0, nvcc changed the default visibility of
+    # __global__ functions to `hidden`, which causes symbol lookup errors
+    # during linking.  This can be worked around by setting
+    # -static-global-template-stub=false and
+    # --device-entity-has-hidden-visibility=false.
+    #
+    # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/
+    if( (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND
+        (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") )
+        set(_nvcc_flags ${args_NVCC_FLAGS}
+            -static-global-template-stub=false
+            --device-entity-has-hidden-visibility=false)
+    else()
+        set(_nvcc_flags ${args_NVCC_FLAGS})
+    endif()
+
     # Set compilation flags
     set_source_files_properties(${${args_PREFIX}_sources_cu}
-        PROPERTIES INCLUDE_DIRECTORIES
-        "${args_INCLUDE_DIRS}")
+        PROPERTIES COMPILE_OPTIONS
+        "${_nvcc_flags}")
 
     # Append to the full sources list
     list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu})
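To make the visibility issue above concrete, here is a minimal standalone sketch (hypothetical file and kernel names, not part of this PR) of the failure mode: under nvcc 13 defaults, the host-side stub of a __global__ template instantiation built into one shared library gets hidden ELF visibility, so launching it from another shared object fails with a symbol lookup error unless the flags above are passed.

// libkernels.cu -- a minimal sketch, built into libkernels.so, e.g.:
//   nvcc -shared -Xcompiler -fPIC libkernels.cu -o libkernels.so
// With CUDA 13 defaults, the host stub for scale_kernel<float> is emitted
// with hidden visibility; a second shared object that launches
// scale_kernel<float><<<...>>>(...) then fails to resolve the symbol.
// Building with -static-global-template-stub=false and
// --device-entity-has-hidden-visibility=false restores the CUDA 12 behavior.
#include <cuda_runtime.h>

template <typename T>
__global__ void scale_kernel(T* data, T factor, int n) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    data[idx] *= factor;
  }
}

// Explicit instantiation that another shared object would launch.
template __global__ void scale_kernel<float>(float*, float, int);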
1 change: 1 addition & 0 deletions fbgemm_gpu/cmake/TbeTraining.cmake
@@ -259,6 +259,7 @@ gpu_cpp_library(
         ${TORCH_CUDA_OPTIONS}
     DEPS
         fbgemm_gpu_tbe_training_backward
+        fbgemm_gpu_config
     DESTINATION
         fbgemm_gpu)
 
46 changes: 38 additions & 8 deletions fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -150,13 +150,43 @@ using gpuMemLocation = hipMemLocation;
 using gpuMemLocation = cudaMemLocation;
 #endif
 
+#if ROCM_VERSION >= 70100
+// In contrast to CUDA 13, which changes the API signature of cudaMemAdvise
+// in place, ROCm 7.0 adds hipMemAdvise_v2 to maintain backwards
+// compatibility.
+//
+// See:
+// https://rocm.docs.amd.com/projects/HIP/en/develop/doxygen/html/group___memory_m.html
+#define gpuMemAdvise hipMemAdvise_v2
+#else
+#define gpuMemAdvise cudaMemAdvise
+#endif
+
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
+
 inline gpuMemLocation new_mem_location_from_device(const int device_id) {
   gpuMemLocation deviceLoc;
+#ifdef USE_ROCM
+  deviceLoc.type = hipMemLocationTypeDevice;
+#else
   deviceLoc.type = cudaMemLocationTypeDevice;
+#endif
   deviceLoc.id = device_id;
   return deviceLoc;
 }
 
+inline gpuMemLocation new_mem_location_cpu() {
+  gpuMemLocation deviceLoc;
+#ifdef USE_ROCM
+  deviceLoc.type = hipMemLocationTypeHost;
+#else
+  deviceLoc.type = cudaMemLocationTypeHost;
+#endif
+  deviceLoc.id = cudaCpuDeviceId;
+  return deviceLoc;
+}
+
+#endif
 
 } // namespace
 
 Tensor new_managed_tensor(
@@ -170,26 +200,26 @@ Tensor new_managed_tensor(
   size_t size_bytes = t.storage().nbytes();
 
   // Set preferred memory location to host memory
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       cudaMemAdviseSetPreferredLocation,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       // Starting with CUDA 13, the deviceId arg (int) is replaced with
       // cudaMemLocation (struct)
-      new_mem_location_from_device(cudaCpuDeviceId)
+      new_mem_location_cpu()
 #else
       cudaCpuDeviceId
 #endif
       ));
 
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
      ptr,
      size_bytes,
      cudaMemAdviseSetAccessedBy,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
      new_mem_location_from_device(at::cuda::current_device())
 #else
      at::cuda::current_device()
@@ -382,11 +412,11 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
   device_guard.set_index(cuda_device_index);
 
   // FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       new_mem_location_from_device(hint_device)
 #else
       hint_device
@@ -420,7 +450,7 @@ void uvm_cuda_mem_prefetch_async(
   AT_CUDA_CHECK(cudaMemPrefetchAsync(
       ptr,
       size_bytes,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000
       new_mem_location_from_device(prefetch_device),
       // Flags argument needs to be set to zero for now, see:
       // https://docs.nvidia.com/cuda/archive/13.0.0/cuda-runtime-api/group__CUDART__MEMORY.html
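Taken together, the changes above route every advise call through a single gpuMemAdvise macro and pass the struct-based location on new toolkits. As a minimal standalone sketch of the CUDA 13 call shapes (hypothetical buffer and sizes, assuming CUDA 13 headers, not part of this PR):

// advise_demo.cu -- a minimal sketch, assuming nvcc 13+ and its headers.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  float* buf = nullptr;
  const size_t bytes = 1 << 20;
  cudaMallocManaged(&buf, bytes);

  // CUDA 13: cudaMemAdvise takes a cudaMemLocation struct instead of an
  // int device id (mirrored by hipMemAdvise_v2 on ROCm).
  cudaMemLocation host_loc{};
  host_loc.type = cudaMemLocationTypeHost;
  host_loc.id = cudaCpuDeviceId;
  cudaMemAdvise(buf, bytes, cudaMemAdviseSetPreferredLocation, host_loc);

  cudaMemLocation dev_loc{};
  dev_loc.type = cudaMemLocationTypeDevice;
  dev_loc.id = 0;
  cudaMemAdvise(buf, bytes, cudaMemAdviseSetAccessedBy, dev_loc);

  // CUDA 13: cudaMemPrefetchAsync also takes a location struct, plus a
  // flags argument that must currently be zero.
  cudaMemPrefetchAsync(buf, bytes, dev_loc, /*flags=*/0, /*stream=*/0);

  cudaDeviceSynchronize();
  cudaFree(buf);
  return 0;
}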