9 changes: 6 additions & 3 deletions flashinfer/fp4_quantization.py
@@ -180,7 +180,8 @@ def fp4_quantize_sm100(
             - Scale factors tensor with shape determined by layout and sf_vec_size
     """
     if enable_pdl is None:
-        enable_pdl = device_support_pdl(input.device)
+        # enable_pdl = device_support_pdl(input.device)
+        enable_pdl = device_support_pdl(input.place)
     out_val = torch.empty(
         (*input.shape[:-1], input.shape[-1] // 2),
         dtype=torch.uint8,
@@ -669,9 +670,11 @@ def fp4_quantize(
 
     assert input.shape[-1] % sf_vec_size == 0
     if enable_pdl is None:
-        enable_pdl = device_support_pdl(input.device)
+        # enable_pdl = device_support_pdl(input.device)
+        enable_pdl = device_support_pdl(input.place)
     # get input device sm version
-    major, minor = get_compute_capability(input.device)
+    # major, minor = get_compute_capability(input.device)
+    major, minor = get_compute_capability(input.place)
     x_q, sf = get_fp4_quantization_module(f"{major}{minor}").fp4_quantize_sm100(
         input,
         global_scale,
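The pattern above swaps torch's `Tensor.device` attribute for `Tensor.place`, the attribute paddle tensors expose. As a point of comparison, here is a minimal sketch (not part of this PR; the helper name `tensor_device` is made up) of how a call site could stay agnostic to which attribute exists:

import torch


def tensor_device(t):
    # Torch tensors carry `.device`; paddle tensors carry `.place`.
    # Prefer `.place` when present, otherwise fall back to `.device`.
    return t.place if hasattr(t, "place") else t.device


x = torch.zeros(4, 8)
print(tensor_device(x))  # with stock torch this prints device(type='cpu')

The diff instead rewrites each call site to use `.place` directly, which assumes the code only runs under the paddle-compatible torch proxy.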
27 changes: 20 additions & 7 deletions flashinfer/fused_moe/core.py
@@ -454,7 +454,8 @@ def cutlass_fused_moe(
         activation_type: ActivationType = ActivationType.Swiglu,
     ) -> List[torch.Tensor]:
         if enable_pdl is None:
-            enable_pdl = device_support_pdl(input.device)
+            # enable_pdl = device_support_pdl(input.device)
+            enable_pdl = device_support_pdl(input.place)
         tuner = AutoTuner.get()
         MoERunner.refine_tuning_config(tune_max_num_tokens)
 
@@ -513,17 +514,22 @@ def cutlass_fused_moe(
             else moe_runner.fused_moe_runner.run_moe
         )
         num_active_experts_per_node = torch.empty(
-            (1,), dtype=torch.int32, device=input.device
+            # (1,), dtype=torch.int32, device=input.device
+            (1,),
+            dtype=torch.int32,
+            device=input.place,
         )
         experts_to_token_score = torch.empty(
             (fc2_expert_weights.shape[0], input.shape[0]),
             dtype=torch.float32,
-            device=input.device,
+            # device=input.device,
+            device=input.place,
         )
         active_expert_global_ids = torch.empty(
             (fc2_expert_weights.shape[0],),
             dtype=torch.int32,
-            device=input.device,
+            # device=input.device,
+            device=input.place,
         )
         min_latency_output = (
             [
@@ -799,7 +805,8 @@ cutlass_fused_moe(
         )
 
     if enable_pdl is None:
-        enable_pdl = device_support_pdl(input.device)
+        # enable_pdl = device_support_pdl(input.device)
+        enable_pdl = device_support_pdl(input.place)
 
     num_rows = input.shape[0]
     if min_latency_mode:
@@ -808,10 +815,16 @@
     output_shape = (num_rows, hidden_size)
 
     if output is None:
-        output = torch.empty(output_shape, dtype=output_dtype, device=input.device)
+        # output = torch.empty(output_shape, dtype=output_dtype, device=input.device)
+        output = torch.empty(output_shape, dtype=output_dtype, device=input.place)
     else:
         check_shape_dtype_device(
-            output, output_shape, output_dtype, input.device, "output"
+            # output, output_shape, output_dtype, input.device, "output"
+            output,
+            output_shape,
+            output_dtype,
+            input.place,
+            "output",
         )
 
     return get_cutlass_fused_moe_module(device_arch).cutlass_fused_moe(
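Each of the `torch.empty` call sites above now allocates with `device=input.place`. A small, self-contained sketch of the same allocation pattern that runs under stock torch (the `alloc_on_same_device` helper is illustrative and not part of the PR):

import torch


def alloc_on_same_device(ref: torch.Tensor, shape, dtype):
    # Mirror the diff's pattern: allocate scratch buffers on the same device
    # as the reference tensor. Under the paddle-compatible path the diff
    # passes `ref.place`; with stock torch the equivalent attribute is `ref.device`.
    dev = getattr(ref, "place", ref.device)
    return torch.empty(shape, dtype=dtype, device=dev)


inp = torch.randn(16, 128)
num_active = alloc_on_same_device(inp, (1,), torch.int32)
print(num_active.shape, num_active.device)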
21 changes: 14 additions & 7 deletions flashinfer/utils.py
@@ -16,13 +16,12 @@
 
 import functools
 import math
+import os
 from enum import Enum
 from typing import Callable, Dict, Iterable, Optional, Sequence, Tuple, Union
 
 import torch
 import torch.version
-from torch.torch_version import TorchVersion
-from torch.torch_version import __version__ as torch_version
 
 from .jit.spdlog import gen_spdlog_module
 
@@ -249,6 +248,7 @@ def canonicalize_torch_dtype(dtype: Union[torch.dtype, str]) -> torch.dtype:
 
 @functools.cache
 def get_compute_capability(device: torch.device) -> Tuple[int, int]:
+    return torch.device.cuda.get_device_capability(device.gpu_device_id())
     if device.type != "cuda":
         raise ValueError("device must be a cuda device")
     return torch.cuda.get_device_capability(device.index)
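The early return added to `get_compute_capability` queries the device id through `gpu_device_id()`, a paddle-style place method, rather than `torch.device.index`. A hedged sketch of a dispatching variant that keeps the original torch path alive (the name `compute_capability_compat` and the dual-backend handling are assumptions, not code from this PR):

import torch


def compute_capability_compat(device):
    # Paddle-style place objects expose gpu_device_id(); torch.device exposes .index.
    if hasattr(device, "gpu_device_id"):
        return torch.cuda.get_device_capability(device.gpu_device_id())
    if device.type != "cuda":
        raise ValueError("device must be a cuda device")
    return torch.cuda.get_device_capability(device.index)

Note that the function keeps its `@functools.cache` decorator in the diff, so whatever device object is passed in must remain hashable.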
@@ -267,7 +267,13 @@ def _check_cached_qkv_data_type(
         )
 
 
-if TorchVersion(torch_version) < TorchVersion("2.4"):
+def use_paddle_compatible_api() -> bool:
+    return os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() in ["1", "on", "true"]
+
+
+if use_paddle_compatible_api() or torch.torch_version.TorchVersion(
+    torch.torch_version.__version__
+) < torch.torch_version.TorchVersion("2.4"):
 
     def register_custom_op(
         name: str,
@@ -522,15 +528,16 @@ def check_shape_dtype_device(
     expected_device: Optional[torch.device],
     name: str,
 ) -> None:
-    if expected_shape and x.shape != torch.Size(expected_shape):
+    if expected_shape and tuple(x.shape) != torch.Size(expected_shape):
         raise ValueError(
             f"Invalid shape of {name}: expected {expected_shape}, got {x.shape}"
         )
     if expected_dtype and x.dtype != expected_dtype:
         raise ValueError(
             f"Invalid dtype of {name}: expected {expected_dtype}, got {x.dtype}"
         )
-    if expected_device and x.device != expected_device:
+    # if expected_device and x.device != expected_device:
+    if expected_device and x.place != expected_device:
         raise ValueError(
             f"Invalid device of {name}: expected {expected_device}, got {x.device}"
         )
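The change from `x.shape` to `tuple(x.shape)` matters because paddle tensors report their shape as a Python list, and a list never compares equal to the `torch.Size` tuple built from `expected_shape`; converting to a tuple first makes the check behave the same for both backends. A quick illustration of the comparison semantics this relies on:

import torch

expected = torch.Size((2, 3))
print([2, 3] == expected)         # False: a list and a tuple never compare equal
print(tuple([2, 3]) == expected)  # True: torch.Size is a tuple subclass
print(tuple(torch.zeros(2, 3).shape) == expected)  # True: the torch path is unaffected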
@@ -566,8 +573,8 @@ def set_log_level(lvl_str: str) -> None:
 
 @functools.cache
 def device_support_pdl(device: torch.device) -> bool:
-    if device.type != "cuda":
-        return False
+    # if device.type != "cuda":
+    # return False
     major, _ = get_compute_capability(device)
     return major >= 9
 
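The new `use_paddle_compatible_api()` helper gates the fallback branch on the `PADDLE_COMPATIBLE_API` environment variable, so the pre-2.4 `register_custom_op` shim is used even on newer torch versions when the flag is set. Because the check runs at module import time, the variable has to be set before flashinfer is imported. A usage sketch (assumes a flashinfer build that includes this patch):

import os

# Any of "1", "on", or "true" enables the paddle-compatible code paths.
os.environ["PADDLE_COMPATIBLE_API"] = "1"

import flashinfer.utils as fi_utils

print(fi_utils.use_paddle_compatible_api())  # -> True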
2 changes: 1 addition & 1 deletion requirements.txt
@@ -9,5 +9,5 @@ nvidia-ml-py
 packaging>=24.2
 requests
 tabulate
-torch
+# torch
 tqdm