diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 96305c056a..900e973c8e 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -283,6 +283,7 @@ __configure_fbgemm_gpu_build_cuda () {
   # See:
   #
   #   https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  #   https://kaixih.github.io/nvcc-options/
   #   https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list
   #   https://github.com/NVIDIA/nvbench/discussions/129
   #   https://github.com/vllm-project/vllm/blob/main/CMakeLists.txt#L187
diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index b842433c8f..cd74eaed97 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -304,10 +304,10 @@ def cuda_versions(self) -> List[str]:
             # FBGEMM HSTU is expensive, so conserve CI resources
             return ["12.8.1"]
         elif self.target == TARGET_GENAI:
-            return ["12.6.3", "12.8.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]

     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
index f4a9cce161..706f101860 100644
--- a/.github/scripts/nova_dir.bash
+++ b/.github/scripts/nova_dir.bash
@@ -50,16 +50,9 @@ elif [[ "$CU_VERSION" == "cu"* ]]; then
     echo "################################################################################"

-elif [[ "$CU_VERSION" == "rocm7.0"* ]]; then
-    export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201,gfx950"
-    echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
-
-elif [[ "$CU_VERSION" == "rocm6.4"* ]] ||
-     [[ "$CU_VERSION" == "rocm6.3"* ]]; then
-    export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
-    echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
-
-elif [[ "$CU_VERSION" == "rocm6.4"* ]] ||
+elif [[ "$CU_VERSION" == "rocm7.1"* ]] ||
+     [[ "$CU_VERSION" == "rocm7.0"* ]] ||
+     [[ "$CU_VERSION" == "rocm6.4"* ]] ||
      [[ "$CU_VERSION" == "rocm6.3"* ]]; then
     export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
     echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
diff --git a/.github/scripts/utils_triton.bash b/.github/scripts/utils_triton.bash
index 73a9a533a9..0e7d6f9151 100644
--- a/.github/scripts/utils_triton.bash
+++ b/.github/scripts/utils_triton.bash
@@ -67,7 +67,7 @@ install_triton_pip () {
     return 1
   else
     echo "################################################################################"
-    echo "# Install PyTorch (PyTorch PIP)"
+    echo "# Install PyTorch-Triton (PyTorch PIP)"
     echo "#"
     echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
     echo "################################################################################"
diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake
index 51c30df750..b148035ca0 100644
--- a/cmake/modules/GpuCppLibrary.cmake
+++ b/cmake/modules/GpuCppLibrary.cmake
@@ -87,14 +87,34 @@ function(prepare_target_sources)
         list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
     endif()

+    set_source_files_properties(${${args_PREFIX}_sources_cu}
+        PROPERTIES INCLUDE_DIRECTORIES
+            "${args_INCLUDE_DIRS}")
+
     # Set source properties
     set_source_files_properties(${${args_PREFIX}_sources_cu}
         PROPERTIES COMPILE_OPTIONS
"${args_NVCC_FLAGS}") + # Starting with CUDA 13.0, nvcc changed the default visibility of + # __global__ functions to `hidden`, which causes symbol lookup errors + # during linking. This can be worked around by setting -cudart=shared + # and --device-entity-has-hidden-visibility=false. + # + # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/ + if( (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND + (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") ) + set(_nvcc_flags ${args_NVCC_FLAGS} + -static-global-template-stub=false + --device-entity-has-hidden-visibility=false) + else() + set(_nvcc_flags ${args_NVCC_FLAGS}) + endif() + + # Set compilation flags set_source_files_properties(${${args_PREFIX}_sources_cu} - PROPERTIES INCLUDE_DIRECTORIES - "${args_INCLUDE_DIRS}") + PROPERTIES COMPILE_OPTIONS + "${_nvcc_flags}") # Append to the full sources list list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu}) diff --git a/fbgemm_gpu/cmake/TbeTraining.cmake b/fbgemm_gpu/cmake/TbeTraining.cmake index fd51516fd2..59fcd9fc7e 100644 --- a/fbgemm_gpu/cmake/TbeTraining.cmake +++ b/fbgemm_gpu/cmake/TbeTraining.cmake @@ -259,6 +259,7 @@ gpu_cpp_library( ${TORCH_CUDA_OPTIONS} DEPS fbgemm_gpu_tbe_training_backward + fbgemm_gpu_config DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu index 575a856365..db20021728 100644 --- a/fbgemm_gpu/src/memory_utils/memory_utils.cu +++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu @@ -150,13 +150,43 @@ using gpuMemLocation = hipMemLocation; using gpuMemLocation = cudaMemLocation; #endif +#if ROCM_VERSION >= 70100 +// In contrast to CUDA 13, which overrides the API signature of cudaMemAdvise, +// ROCm 7.0, adds hipMemAdvise_v2 to maintain backwards compatibiility. 
+//
+// See:
+//   https://rocm.docs.amd.com/projects/HIP/en/develop/doxygen/html/group___memory_m.html
+#define gpuMemAdvise hipMemAdvise_v2
+#else
+#define gpuMemAdvise cudaMemAdvise
+#endif
+
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
+
 inline gpuMemLocation new_mem_location_from_device(const int device_id) {
   gpuMemLocation deviceLoc;
+#ifdef USE_ROCM
+  deviceLoc.type = hipMemLocationTypeDevice;
+#else
   deviceLoc.type = cudaMemLocationTypeDevice;
+#endif
   deviceLoc.id = device_id;
   return deviceLoc;
 }

+inline gpuMemLocation new_mem_location_cpu() {
+  gpuMemLocation deviceLoc;
+#ifdef USE_ROCM
+  deviceLoc.type = hipMemLocationTypeHost;
+#else
+  deviceLoc.type = cudaMemLocationTypeHost;
+#endif
+  deviceLoc.id = cudaCpuDeviceId;
+  return deviceLoc;
+}
+
+#endif
+
 } // namespace

 Tensor new_managed_tensor(
@@ -170,14 +200,14 @@ Tensor new_managed_tensor(
   size_t size_bytes = t.storage().nbytes();

   // Set preferred memory location to host memory
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       cudaMemAdviseSetPreferredLocation,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       // Starting with CUDA 13, the deviceId arg (int) is replaced with
       // cudaMemLocation (struct)
-      new_mem_location_from_device(cudaCpuDeviceId)
+      new_mem_location_cpu()
 #else
       cudaCpuDeviceId
 #endif
@@ -185,11 +215,11 @@ Tensor new_managed_tensor(
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       cudaMemAdviseSetAccessedBy,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       new_mem_location_from_device(at::cuda::current_device())
 #else
       at::cuda::current_device()
@@ -382,11 +412,11 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
   device_guard.set_index(cuda_device_index);

   // FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       static_cast<cudaMemoryAdvise>(cuda_memory_advise),
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       new_mem_location_from_device(hint_device)
 #else
       hint_device
@@ -420,7 +450,7 @@ void uvm_cuda_mem_prefetch_async(
   AT_CUDA_CHECK(cudaMemPrefetchAsync(
       ptr,
       size_bytes,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000
       new_mem_location_from_device(prefetch_device),
       // Flags argument needs to be set to zero for now, see:
       // https://docs.nvidia.com/cuda/archive/13.0.0/cuda-runtime-api/group__CUDART__MEMORY.html
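
Reviewer note on the memory_utils.cu changes: CUDA 13 replaces the trailing int device-id parameter of cudaMemAdvise with a cudaMemLocation struct, which is what new_mem_location_from_device() and new_mem_location_cpu() abstract over; the guards also move from CUDA_VERSION to CUDART_VERSION, presumably because cudaMemAdvise belongs to the runtime API. Below is a minimal standalone sketch of the same version dispatch, assuming a CUDA-only build; the file name, cpu_location() helper, and buffer size are illustrative and not part of this patch.

```cpp
// mem_advise_sketch.cu -- illustrative only; mirrors the CUDART_VERSION
// dispatch used in memory_utils.cu.
#include <cuda_runtime.h>
#include <cstdio>

#if CUDART_VERSION >= 13000
// CUDA 13+: cudaMemAdvise takes a cudaMemLocation struct instead of an int.
static cudaMemLocation cpu_location() {
  cudaMemLocation loc;
  loc.type = cudaMemLocationTypeHost;
  loc.id = cudaCpuDeviceId;
  return loc;
}
#endif

int main() {
  void* ptr = nullptr;
  const size_t size = 1 << 20;  // 1 MiB of managed memory
  if (cudaMallocManaged(&ptr, size) != cudaSuccess) {
    std::printf("cudaMallocManaged failed (no UVM-capable device?)\n");
    return 1;
  }

  // Prefer keeping the pages resident on the host, as new_managed_tensor does.
  const cudaError_t err = cudaMemAdvise(
      ptr,
      size,
      cudaMemAdviseSetPreferredLocation,
#if CUDART_VERSION >= 13000
      cpu_location()   // CUDA 13+: struct-typed location
#else
      cudaCpuDeviceId  // CUDA 12 and earlier: plain int device id
#endif
  );
  std::printf("cudaMemAdvise: %s\n", cudaGetErrorString(err));

  cudaFree(ptr);
  return 0;
}
```

On a CUDA 12 toolchain this compiles against the int overload; on CUDA 13 it takes the struct path, matching the #if branches introduced in new_managed_tensor().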
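Reviewer note on the GpuCppLibrary.cmake change: the NVIDIA blog linked in the new comment explains that CUDA 13 gives device entities, including the host-side launch stubs of __global__ template instantiations, hidden ELF visibility by default, so a kernel instantiated in one shared library may fail symbol lookup when launched from another. Below is a rough sketch of the kind of translation unit affected, with a hypothetical build command showing where the two opt-out flags land; the file, kernel, and library names are illustrative.

```cpp
// visibility_sketch.cu -- illustrative only.
//
// Hypothetical build command (not part of the patch); the two flags below
// restore the pre-CUDA-13 visibility defaults that the CMake change applies:
//
//   nvcc -shared -Xcompiler -fPIC visibility_sketch.cu -o libsketch.so \
//        -static-global-template-stub=false \
//        --device-entity-has-hidden-visibility=false
#include <cuda_runtime.h>

template <typename T>
__global__ void scale_kernel(T* x, T a, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= a;
  }
}

// Explicit instantiation: under the CUDA 13 defaults, the host-side launch
// stub for this instantiation gets hidden visibility, so a launch from a
// different shared library can fail at symbol lookup time.
template __global__ void scale_kernel<float>(float*, float, int);
```

Gating the flags on CMAKE_CUDA_COMPILER_VERSION >= 13.0, as the patch does, keeps older nvcc releases, which may not recognize these options, building unchanged.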