2 files changed: +7 −1

@@ -59,7 +59,7 @@ def get_extensions():
 
     if not torch.cuda.is_available():
         print("PyTorch GPU support is not available. Skipping compilation of CUDA extensions")
-    if CUDA_HOME is None or not IS_ROCM and torch.cuda.is_available():
+    if (CUDA_HOME is None and ROCM_HOME is None) and torch.cuda.is_available():
         print("CUDA toolkit or ROCm is not available. Skipping compilation of CUDA extensions")
         print("If you'd like to compile CUDA extensions locally please install the cudatoolkit from https://anaconda.org/nvidia/cuda-toolkit")
 
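Why the condition changed: Python parses the old expression as (CUDA_HOME is None) or ((not IS_ROCM) and torch.cuda.is_available()), because and binds tighter than or. On a ROCm machine CUDA_HOME is typically unset, so the first operand alone made the whole check true and the "toolkit is not available" branch was taken even with ROCm installed. Below is a minimal sketch of the difference, using standalone helper functions that are purely illustrative (in the real build script, CUDA_HOME and ROCM_HOME are the values exposed by torch.utils.cpp_extension):

# Illustrative stand-ins for the build-script condition, showing why the
# old mixed and/or expression misfired on ROCm.

def should_warn_old(cuda_home, is_rocm, cuda_available):
    # Parses as: (cuda_home is None) or ((not is_rocm) and cuda_available)
    return cuda_home is None or not is_rocm and cuda_available

def should_warn_new(cuda_home, rocm_home, cuda_available):
    # Warn only when neither toolkit root was found.
    return (cuda_home is None and rocm_home is None) and cuda_available

# ROCm machine: CUDA toolkit absent, ROCm present, GPU visible to PyTorch.
print(should_warn_old(None, True, True))         # True  -> spurious warning
print(should_warn_new(None, "/opt/rocm", True))  # False -> warning correctly skipped

With the parenthesized form, the warning fires only when neither the CUDA toolkit nor ROCm can be found, which matches the wording of the printed message.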
torchao/csrc/cuda/tensor_core_tiled_layout
@@ -167,6 +167,7 @@ __global__ void _dequantize_int4_kernel(
   // All b values within a 16x16 tile should fall within the same q group
   // Hence we load 1 scale and zero per loop
   int qgroup = ks[0] / groupSize;
+#if defined(USE_ROCM)
   __nv_bfloat162 scale2 = __bfloat162bfloat162(__hip_bfloat16(1.0f));
   __nv_bfloat162 zero2 = __bfloat162bfloat162(__hip_bfloat16(1.0f));
 
@@ -177,6 +178,11 @@ __global__ void _dequantize_int4_kernel(
     scale2 = __bfloat162bfloat162(pSZ[0]);
     zero2 = __bfloat162bfloat162(pSZ[1]);
   }
+#else
+  const __nv_bfloat16 *pSZ = reinterpret_cast<const __nv_bfloat16*>(&scales_and_zeros.value()[qgroup][n0][0]);
+  __nv_bfloat162 scale2 = __bfloat162bfloat162(pSZ[0]);
+  __nv_bfloat162 zero2 = __bfloat162bfloat162(pSZ[1]);
+#endif
 
   #pragma unroll
   for (int i = 0; i < 4; i++) {
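For context, both branches of the new USE_ROCM guard load one (scale, zero) pair per quantization group, indexed as scales_and_zeros[qgroup][n0][0..1] with qgroup = ks[0] / groupSize. A rough Python sketch of that per-group lookup follows (hypothetical sizes and tensor layout chosen only to mirror the kernel's indexing, not the actual packed format):

import torch

# Hypothetical sizes, for illustration only.
groupSize = 128
k, n = 4096, 8

# Mirrors the scales_and_zeros[qgroup][n0][{0,1}] access pattern:
# one (scale, zero) pair per (k-group, n) position.
scales_and_zeros = torch.randn(k // groupSize, n, 2, dtype=torch.bfloat16)

def scale_zero_for(k_index, n_index):
    qgroup = k_index // groupSize  # same grouping as `int qgroup = ks[0] / groupSize;`
    scale = scales_and_zeros[qgroup, n_index, 0]
    zero = scales_and_zeros[qgroup, n_index, 1]
    return scale, zero

# With groupSize = 128, every k in [256, 384) shares a single (scale, zero).
print(scale_zero_for(300, 3))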