diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 96305c056a..900e973c8e 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -283,6 +283,7 @@ __configure_fbgemm_gpu_build_cuda () {
   # See:
   #
   #   https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
+  #   https://kaixih.github.io/nvcc-options/
   #   https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list
   #   https://github.com/NVIDIA/nvbench/discussions/129
   #   https://github.com/vllm-project/vllm/blob/main/CMakeLists.txt#L187
diff --git a/.github/scripts/generate_ci_matrix.py b/.github/scripts/generate_ci_matrix.py
index b842433c8f..cd74eaed97 100644
--- a/.github/scripts/generate_ci_matrix.py
+++ b/.github/scripts/generate_ci_matrix.py
@@ -304,10 +304,10 @@ def cuda_versions(self) -> List[str]:
             # FBGEMM HSTU is expensive, so conserve CI resources
             return ["12.8.1"]
         elif self.target == TARGET_GENAI:
-            return ["12.6.3", "12.8.1", "13.0.2"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]
         else:
             # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
-            return ["12.6.3", "12.8.1"]
+            return ["12.6.3", "12.8.1", "12.9.1", "13.0.2"]

     def rocm_versions(self) -> List[str]:
         if GitRepo.ref() == REFS_MAIN and GitRepo.event_name() == EVENT_NAME_PUSH:
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
index f4a9cce161..706f101860 100644
--- a/.github/scripts/nova_dir.bash
+++ b/.github/scripts/nova_dir.bash
@@ -50,16 +50,9 @@ elif [[ "$CU_VERSION" == "cu"* ]]; then
     echo "################################################################################"

-elif [[ "$CU_VERSION" == "rocm7.0"* ]]; then
-    export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201,gfx950"
-    echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
-
-elif [[ "$CU_VERSION" == "rocm6.4"* ]] ||
-     [[ "$CU_VERSION" == "rocm6.3"* ]]; then
-    export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
-    echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
-
-elif [[ "$CU_VERSION" == "rocm6.4"* ]] ||
+elif [[ "$CU_VERSION" == "rocm7.1"* ]] ||
+     [[ "$CU_VERSION" == "rocm7.0"* ]] ||
+     [[ "$CU_VERSION" == "rocm6.4"* ]] ||
      [[ "$CU_VERSION" == "rocm6.3"* ]]; then
     export PYTORCH_ROCM_ARCH="gfx908,gfx90a,gfx942,gfx1201"
     echo "[NOVA] Set PYTORCH_ROCM_ARCH to: ${PYTORCH_ROCM_ARCH}"
diff --git a/.github/scripts/utils_triton.bash b/.github/scripts/utils_triton.bash
index 73a9a533a9..0e7d6f9151 100644
--- a/.github/scripts/utils_triton.bash
+++ b/.github/scripts/utils_triton.bash
@@ -67,7 +67,7 @@ install_triton_pip () {
     return 1
   else
     echo "################################################################################"
-    echo "# Install PyTorch (PyTorch PIP)"
+    echo "# Install PyTorch-Triton (PyTorch PIP)"
     echo "#"
     echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
     echo "################################################################################"
diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake
index 51c30df750..b148035ca0 100644
--- a/cmake/modules/GpuCppLibrary.cmake
+++ b/cmake/modules/GpuCppLibrary.cmake
@@ -87,14 +87,34 @@ function(prepare_target_sources)
         list(APPEND ${args_PREFIX}_sources_cu ${args_CUDA_SPECIFIC_SRCS})
     endif()

+    set_source_files_properties(${${args_PREFIX}_sources_cu}
+        PROPERTIES INCLUDE_DIRECTORIES
+            "${args_INCLUDE_DIRS}")
+
     # Set source properties
     set_source_files_properties(${${args_PREFIX}_sources_cu}
         PROPERTIES COMPILE_OPTIONS
"${args_NVCC_FLAGS}") + # Starting with CUDA 13.0, nvcc changed the default visibility of + # __global__ functions to `hidden`, which causes symbol lookup errors + # during linking. This can be worked around by setting -cudart=shared + # and --device-entity-has-hidden-visibility=false. + # + # https://developer.nvidia.com/blog/cuda-c-compiler-updates-impacting-elf-visibility-and-linkage/ + if( (FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA) AND + (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") ) + set(_nvcc_flags ${args_NVCC_FLAGS} + -static-global-template-stub=false + --device-entity-has-hidden-visibility=false) + else() + set(_nvcc_flags ${args_NVCC_FLAGS}) + endif() + + # Set compilation flags set_source_files_properties(${${args_PREFIX}_sources_cu} - PROPERTIES INCLUDE_DIRECTORIES - "${args_INCLUDE_DIRS}") + PROPERTIES COMPILE_OPTIONS + "${_nvcc_flags}") # Append to the full sources list list(APPEND ${args_PREFIX}_sources_combined ${${args_PREFIX}_sources_cu}) diff --git a/fbgemm_gpu/cmake/TbeTraining.cmake b/fbgemm_gpu/cmake/TbeTraining.cmake index fd51516fd2..59fcd9fc7e 100644 --- a/fbgemm_gpu/cmake/TbeTraining.cmake +++ b/fbgemm_gpu/cmake/TbeTraining.cmake @@ -259,6 +259,7 @@ gpu_cpp_library( ${TORCH_CUDA_OPTIONS} DEPS fbgemm_gpu_tbe_training_backward + fbgemm_gpu_config DESTINATION fbgemm_gpu) diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu index 575a856365..db20021728 100644 --- a/fbgemm_gpu/src/memory_utils/memory_utils.cu +++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu @@ -150,13 +150,43 @@ using gpuMemLocation = hipMemLocation; using gpuMemLocation = cudaMemLocation; #endif +#if ROCM_VERSION >= 70100 +// In contrast to CUDA 13, which overrides the API signature of cudaMemAdvise, +// ROCm 7.0, adds hipMemAdvise_v2 to maintain backwards compatibiility. 
+//
+// See:
+//   https://rocm.docs.amd.com/projects/HIP/en/develop/doxygen/html/group___memory_m.html
+#define gpuMemAdvise hipMemAdvise_v2
+#else
+#define gpuMemAdvise cudaMemAdvise
+#endif
+
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
+
 inline gpuMemLocation new_mem_location_from_device(const int device_id) {
   gpuMemLocation deviceLoc;
+#ifdef USE_ROCM
+  deviceLoc.type = hipMemLocationTypeDevice;
+#else
   deviceLoc.type = cudaMemLocationTypeDevice;
+#endif
   deviceLoc.id = device_id;
   return deviceLoc;
 }

+inline gpuMemLocation new_mem_location_cpu() {
+  gpuMemLocation deviceLoc;
+#ifdef USE_ROCM
+  deviceLoc.type = hipMemLocationTypeHost;
+#else
+  deviceLoc.type = cudaMemLocationTypeHost;
+#endif
+  deviceLoc.id = cudaCpuDeviceId;
+  return deviceLoc;
+}
+
+#endif
+
 } // namespace

 Tensor new_managed_tensor(
@@ -170,14 +200,14 @@ Tensor new_managed_tensor(
   size_t size_bytes = t.storage().nbytes();

   // Set preferred memory location to host memory
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       cudaMemAdviseSetPreferredLocation,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       // Starting with CUDA 13, the deviceId arg (int) is replaced with
       // cudaMemLocation (struct)
-      new_mem_location_from_device(cudaCpuDeviceId)
+      new_mem_location_cpu()
 #else
       cudaCpuDeviceId
 #endif
@@ -185,11 +215,11 @@ Tensor new_managed_tensor(
   // User hints with "accessed by": GPU will establish direct mapping of data
   // in CPU memory, no page faults will be generated
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       cudaMemAdviseSetAccessedBy,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       new_mem_location_from_device(at::cuda::current_device())
 #else
       at::cuda::current_device()
@@ -382,11 +412,11 @@ void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise) {
   device_guard.set_index(cuda_device_index);

   // FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
-  AT_CUDA_CHECK(cudaMemAdvise(
+  AT_CUDA_CHECK(gpuMemAdvise(
       ptr,
       size_bytes,
       static_cast<cudaMemoryAdvise>(cuda_memory_advise),
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000 || ROCM_VERSION >= 70100
       new_mem_location_from_device(hint_device)
 #else
       hint_device
@@ -420,7 +450,7 @@ void uvm_cuda_mem_prefetch_async(
   AT_CUDA_CHECK(cudaMemPrefetchAsync(
       ptr,
       size_bytes,
-#if CUDA_VERSION >= 13000
+#if CUDART_VERSION >= 13000
       new_mem_location_from_device(prefetch_device),
       // Flags argument needs to be set to zero for now, see:
       // https://docs.nvidia.com/cuda/archive/13.0.0/cuda-runtime-api/group__CUDART__MEMORY.html
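
Reviewer note on the memory_utils.cu changes: CUDA 13 replaces the trailing int device-id parameter of cudaMemAdvise with a cudaMemLocation struct, which is what new_mem_location_from_device() and new_mem_location_cpu() abstract over; the guards also move from CUDA_VERSION to CUDART_VERSION, presumably because cudaMemAdvise belongs to the runtime API. Below is a minimal standalone sketch of the same version dispatch, assuming a CUDA-only build; the file name, cpu_location() helper, and buffer size are illustrative and not part of this patch.

```cpp
// mem_advise_sketch.cu -- illustrative only; mirrors the CUDART_VERSION
// dispatch used in memory_utils.cu.
#include <cuda_runtime.h>
#include <cstdio>

#if CUDART_VERSION >= 13000
// CUDA 13+: cudaMemAdvise takes a cudaMemLocation struct instead of an int.
static cudaMemLocation cpu_location() {
  cudaMemLocation loc;
  loc.type = cudaMemLocationTypeHost;
  loc.id = cudaCpuDeviceId;
  return loc;
}
#endif

int main() {
  void* ptr = nullptr;
  const size_t size = 1 << 20;  // 1 MiB of managed memory
  if (cudaMallocManaged(&ptr, size) != cudaSuccess) {
    std::printf("cudaMallocManaged failed (no UVM-capable device?)\n");
    return 1;
  }

  // Prefer keeping the pages resident on the host, as new_managed_tensor does.
  const cudaError_t err = cudaMemAdvise(
      ptr,
      size,
      cudaMemAdviseSetPreferredLocation,
#if CUDART_VERSION >= 13000
      cpu_location()   // CUDA 13+: struct-typed location
#else
      cudaCpuDeviceId  // CUDA 12 and earlier: plain int device id
#endif
  );
  std::printf("cudaMemAdvise: %s\n", cudaGetErrorString(err));

  cudaFree(ptr);
  return 0;
}
```

On a CUDA 12 toolchain this compiles against the int overload; on CUDA 13 it takes the struct path, matching the #if branches introduced in new_managed_tensor().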
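Reviewer note on the GpuCppLibrary.cmake change: the NVIDIA blog linked in the new comment explains that CUDA 13 gives device entities, including the host-side launch stubs of __global__ template instantiations, hidden ELF visibility by default, so a kernel instantiated in one shared library may fail symbol lookup when launched from another. Below is a rough sketch of the kind of translation unit affected, with a hypothetical build command showing where the two opt-out flags land; the file, kernel, and library names are illustrative.

```cpp
// visibility_sketch.cu -- illustrative only.
//
// Hypothetical build command (not part of the patch); the two flags below
// restore the pre-CUDA-13 visibility defaults that the CMake change applies:
//
//   nvcc -shared -Xcompiler -fPIC visibility_sketch.cu -o libsketch.so \
//        -static-global-template-stub=false \
//        --device-entity-has-hidden-visibility=false
#include <cuda_runtime.h>

template <typename T>
__global__ void scale_kernel(T* x, T a, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= a;
  }
}

// Explicit instantiation: under the CUDA 13 defaults, the host-side launch
// stub for this instantiation gets hidden visibility, so a launch from a
// different shared library can fail at symbol lookup time.
template __global__ void scale_kernel<float>(float*, float, int);
```

Gating the flags on CMAKE_CUDA_COMPILER_VERSION >= 13.0, as the patch does, keeps older nvcc releases, which may not recognize these options, building unchanged.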