Commit bd3b79a

Fix CI
1 parent 5f41c1e commit bd3b79a

2 files changed: +23 -6 lines changed

test/test_ops.py

Lines changed: 9 additions & 0 deletions
@@ -102,8 +102,12 @@ def test_unpack_tensor_core_tiled_layout_correctness(shape, inner_k_tiles):
     assert K % (inner_k_tiles * kTileSizeK) == 0 and N % kTileSizeN == 0

     t = torch.randint(0, 16, dtype=torch.int, size=shape, device="cuda")
+    if TORCH_VERSION_AFTER_2_5:
+        t = (t[::, ::2] << 4 | t[::, 1::2]).to(torch.uint8)
     packed_w = torch.ops.aten._convert_weight_to_int4pack(t, inner_k_tiles)
     unpacked = torchao.ops.unpack_tensor_core_tiled_layout(packed_w, inner_k_tiles)
+    if TORCH_VERSION_AFTER_2_5:
+        unpacked = (unpacked[::, ::2] << 4 | unpacked[::, 1::2]).to(torch.uint8)
     assert torch.equal(t, unpacked)

     # TODO: Fix "test_aot_dispatch_dynamic" test failure

@@ -122,6 +126,8 @@ def test_unpack_tensor_core_tiled_layout_op(shape, inner_k_tiles):
     test_utils.append("test_aot_dispatch_dynamic")

     t = torch.randint(0, 16, dtype=torch.int, size=shape, device="cuda")
+    if TORCH_VERSION_AFTER_2_5:
+        t = (t[::, ::2] << 4 | t[::, 1::2]).to(torch.uint8)
     packed_w = torch.ops.aten._convert_weight_to_int4pack(t, inner_k_tiles)

     opcheck(

@@ -229,6 +235,9 @@ def test_dequantize_tensor_core_tiled_layout_correctness_unpack_and_dequant(shap

     # Unpack and dequantize
     unpacked = torchao.ops.unpack_tensor_core_tiled_layout(packed, inner_k_tiles)
+    if TORCH_VERSION_AFTER_2_5:
+        unpacked = (unpacked[::, ::2] << 4 | unpacked[::, 1::2]).to(torch.uint8)
+
     dq_ao = groupwise_affine_dequantize_tensor_from_qparams(
         unpacked, scales, zeros, n_bit=4, groupsize=group_size
     )
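The packing expression added to these tests reflects that, for PyTorch 2.5 and later, the int4 weight is handled as a uint8 tensor carrying two 4-bit values per byte (even columns in the high nibble, odd columns in the low nibble). Below is a minimal, self-contained sketch of that round trip, independent of the CUDA ops used in the tests; the shape and values are illustrative only.

    import torch

    # toy int4 values stored in an int32 tensor, as in the tests (shape is made up)
    t = torch.randint(0, 16, (4, 8), dtype=torch.int)

    # pack pairs of int4 values into one uint8: even columns -> high nibble, odd -> low nibble
    packed = (t[::, ::2] << 4 | t[::, 1::2]).to(torch.uint8)
    assert packed.shape == (4, 4)  # last dimension is halved

    # unpack again to confirm the round trip
    high = packed.to(torch.int32) >> 4
    low = packed.to(torch.int32) & 0x0F
    restored = torch.stack([high, low], dim=-1).reshape(t.shape)
    assert torch.equal(restored, t)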

torchao/quantization/utils.py

Lines changed: 14 additions & 6 deletions
@@ -362,18 +362,26 @@ def groupwise_affine_dequantize_tensor_from_qparams(
     groupsize=128,
 ):
     assert groupsize > 1
-    # needed for GPTQ single column dequantize
-    if groupsize > w_int4x8.shape[-1] and scales.shape[-1] == 1:
-        groupsize = w_int4x8.shape[-1]
-    assert w_int4x8.shape[-1] % groupsize == 0
     assert w_int4x8.dim() == 2
+    if TORCH_VERSION_AFTER_2_5:
+        data = w_int4x8.to(torch.int32)
+        high_bits = data >> 4
+        low_bits = data & 0x0F
+        w_int32 = torch.zeros((w_int4x8.shape[0], w_int4x8.shape[1] * 2), dtype=torch.int32, device=w_int4x8.device)
+        w_int32[::, ::2] = high_bits
+        w_int32[::, 1::2] = low_bits
+    else:
+        w_int32 = w_int4x8

+    # needed for GPTQ single column dequantize
+    if groupsize > w_int32.shape[-1] and scales.shape[-1] == 1:
+        groupsize = w_int32.shape[-1]
+    assert w_int32.shape[-1] % groupsize == 0
     block_size = (1, groupsize)
     input_dtype = torch.int32
     quant_min = 0
     quant_max = 2**n_bit - 1
-    return dequantize_affine(w_int4x8, block_size, scales, zeros, input_dtype, quant_min, quant_max, zero_point_domain=ZeroPointDomain.FLOAT, output_dtype=scales.dtype)
-
+    return dequantize_affine(w_int32, block_size, scales, zeros, input_dtype, quant_min, quant_max, zero_point_domain=ZeroPointDomain.FLOAT, output_dtype=scales.dtype)

 def groupwise_affine_quantize_tensor(w, n_bit=4, groupsize=128, dtype=torch.bfloat16):
     scales, zeros = get_groupwise_affine_qparams(w, n_bit, groupsize, dtype)
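Under TORCH_VERSION_AFTER_2_5, groupwise_affine_dequantize_tensor_from_qparams now first expands the packed uint8 input back into one int4 value per element before calling dequantize_affine. A minimal sketch of that unpack step, showing that it is the inverse of the packing used in test_ops.py (values and shapes are illustrative, and the dequantize_affine call itself is omitted):

    import torch

    # toy packed weight: each uint8 byte carries two int4 values (shape is made up)
    w_int4x8 = torch.randint(0, 256, (4, 4), dtype=torch.uint8)

    # split each byte into its high and low nibble, doubling the last dimension
    data = w_int4x8.to(torch.int32)
    high_bits = data >> 4
    low_bits = data & 0x0F
    w_int32 = torch.zeros((w_int4x8.shape[0], w_int4x8.shape[1] * 2), dtype=torch.int32)
    w_int32[::, ::2] = high_bits
    w_int32[::, 1::2] = low_bits

    # re-packing reproduces the original bytes, i.e. this is the inverse of the test's packing
    repacked = (w_int32[::, ::2] << 4 | w_int32[::, 1::2]).to(torch.uint8)
    assert torch.equal(repacked, w_int4x8)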
