From d0d8271b3f4c8ca91b22e88451f618f86bd25688 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Fri, 10 May 2024 19:43:27 -0700 Subject: [PATCH 01/37] make skipXPU work --- torch/testing/_internal/common_device_type.py | 45 +++++++++++- .../_internal/common_methods_invocations.py | 11 ++- torch/testing/_internal/common_utils.py | 6 +- torch/testing/_internal/opinfo/core.py | 73 ++++++++++++++++++- 4 files changed, 128 insertions(+), 7 deletions(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index d5285a6d0d7fd..17bd8357f15b0 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -397,14 +397,19 @@ def instantiate_test_helper(cls, name, *, test, param_kwargs=None, decorator_fn= # Add the device param kwarg if the test needs device or devices. param_kwargs = {} if param_kwargs is None else param_kwargs test_sig_params = inspect.signature(test).parameters + #import pdb + #pdb.set_trace() if 'device' in test_sig_params or 'devices' in test_sig_params: device_arg: str = cls._init_and_get_primary_device() if hasattr(test, 'num_required_devices'): device_arg = cls.get_all_devices() _update_param_kwargs(param_kwargs, 'device', device_arg) - + #import pdb + #pdb.set_trace() # Apply decorators based on param kwargs. for decorator in decorator_fn(param_kwargs): + #import pdb + #pdb.set_trace() test = decorator(test) # Constructs the test @@ -437,6 +442,8 @@ def instantiated_test(self, param_kwargs=param_kwargs): return result assert not hasattr(cls, name), f"Redefinition of test {name}" + #import pdb + #pdb.set_trace() setattr(cls, name, instantiated_test) def default_parametrize_fn(test, generic_cls, device_cls): @@ -448,6 +455,8 @@ def default_parametrize_fn(test, generic_cls, device_cls): # If one of the @dtypes* decorators is present, also parametrize over the dtypes set by it. dtypes = cls._get_dtypes(test) + #import pdb + #pdb.set_trace() if dtypes is not None: def dtype_parametrize_fn(test, generic_cls, device_cls, dtypes=dtypes): @@ -473,6 +482,7 @@ def dtype_parametrize_fn(test, generic_cls, device_cls, dtypes=dtypes): dtype_kwarg = param_kwargs['dtypes'] if 'dtypes' in param_kwargs else param_kwargs['dtype'] test_name = f'{name}{test_suffix}{device_suffix}{_dtype_test_suffix(dtype_kwarg)}' + print(test_name) instantiate_test_helper(cls=cls, name=test_name, test=test, param_kwargs=param_kwargs, decorator_fn=decorator_fn) @@ -832,6 +842,7 @@ class OpDTypes(Enum): any_one = 4 # Test precisely one supported dtype none = 5 # Instantiate no dtype variants (no dtype kwarg needed) any_common_cpu_cuda_one = 6 # Test precisely one supported dtype that is common to both cuda and cpu + any_common_cpu_xpu_one = 7 # Test precisely one supported dtype that is common to both xpu and cpu # Arbitrary order @@ -909,6 +920,8 @@ def _parametrize_test(self, test, generic_cls, device_cls): 'instantiate_parametrized_tests()') op = check_exhausted_iterator = object() + #import pdb + #pdb.set_trace() for op in self.op_list: # Determine the set of dtypes to use. 
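A minimal standalone sketch of the dtype selection that the OpDTypes.any_common_cpu_xpu_one branch just below performs, mirroring the existing any_common_cpu_cuda_one case but intersecting with op.dtypesIfXPU (pick_common_cpu_xpu_dtype is a hypothetical helper written only for illustration; ANY_DTYPE_ORDER and the OpInfo-style op are the objects already used in this file):

    def pick_common_cpu_xpu_dtype(op, any_dtype_order):
        # dtypes the op claims to support on both CPU and XPU
        supported = set(op.dtypes) & set(op.dtypesIfXPU)
        # first common dtype in the canonical preference order, or None if
        # CPU and XPU share no supported dtype for this op
        return next((dt for dt in any_dtype_order if dt in supported), None)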
dtypes: Union[Set[torch.dtype], Set[None]] @@ -941,13 +954,19 @@ def _parametrize_test(self, test, generic_cls, device_cls): dtypes = {next(dtype for dtype in ANY_DTYPE_ORDER if dtype in supported)} else: dtypes = {} - + elif self.opinfo_dtypes == OpDTypes.any_common_cpu_xpu_one: + # Tries to pick a dtype that supports both CPU and CUDA + supported = set(op.dtypes).intersection(op.dtypesIfXPU) + if supported: + dtypes = {next(dtype for dtype in ANY_DTYPE_ORDER if dtype in supported)} + else: + dtypes = {} elif self.opinfo_dtypes == OpDTypes.none: dtypes = {None} else: raise RuntimeError(f"Unknown OpDType: {self.opinfo_dtypes}") - if self.allowed_dtypes is not None: + if self.allowed_dtypes is not None and dtypes is not None: dtypes = dtypes.intersection(self.allowed_dtypes) # Construct the test name; device / dtype parts are handled outside. @@ -992,6 +1011,7 @@ def test_wrapper(*args, **kwargs): decorator_fn = partial(op.get_decorators, generic_cls.__name__, test.__name__, device_cls.device_type, dtype) + #print("create test {} op={} dtype={} param_kwargs={} decorator_fn={}".format(test_name, op, dtype, param_kwargs, decorator_fn)) yield (test_wrapper, test_name, param_kwargs, decorator_fn) except Exception as ex: # Provides an error message for debugging before rethrowing the exception @@ -1041,6 +1061,11 @@ class skipCUDAIf(skipIf): def __init__(self, dep, reason): super().__init__(dep, reason, device_type='cuda') +class skipXPUIf(skipIf): + + def __init__(self, dep, reason): + super().__init__(dep, reason, device_type='xpu') + # Skips a test on Lazy if the condition is true. class skipLazyIf(skipIf): @@ -1356,6 +1381,17 @@ def only_fn(self, *args, **kwargs): return only_fn +def onlyCUDAAndXPU(fn): + @wraps(fn) + def only_fn(self, *args, **kwargs): + if self.device_type not in ('cuda', 'xpu'): + reason = f"onlyCUDAAndXPU: doesn't run on {self.device_type}" + raise unittest.SkipTest(reason) + + return fn(self, *args, **kwargs) + + return only_fn + def disablecuDNN(fn): @wraps(fn) @@ -1563,6 +1599,9 @@ def skipLazy(fn): def skipMeta(fn): return skipMetaIf(True, "test doesn't work with meta tensors")(fn) +def skipXPU(fn): + return skipXPUIf(True, "test doesn't work with XPU tensors")(fn) + def skipXLA(fn): return skipXLAIf(True, "Marked as skipped for XLA")(fn) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 001d93de1875f..8c2a72390aca9 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -10421,6 +10421,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.abs, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + skipXPU=False, skips=( DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestBwdGradients', 'test_inplace_grad', dtypes=(torch.cdouble,)), @@ -10543,6 +10545,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, supports_forward_ad=True, supports_two_python_scalars=True, + skipXPU=False, decorators=( DecorateInfo( toleranceOverride({torch.chalf: tol(atol=1e-2, rtol=0)}), @@ -10572,6 +10575,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_autograd=False, error_inputs_func=error_inputs_item, 
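Ops are opted into the XPU sweep entry by entry: leaving skipXPU at its default of True keeps an op skipped on xpu, while setting skipXPU=False together with a dtypesIfXPU list enables it, as the abs, add, and item entries in these hunks do. A hypothetical entry following the same pattern, in the context of the op_db list in this file (the op name and dtype lists here are placeholders for illustration, not part of the patch):

    OpInfo(
        'example_op',  # placeholder name
        dtypes=all_types_and_complex_and(torch.half, torch.bfloat16),
        dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
        # dtypes this op is expected to work with on XPU
        dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
        skipXPU=False,  # opt in to the generated *_xpu test variants
    ),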
sample_inputs_func=sample_inputs_item, + skipXPU=False, skips=( # Error testing item function variant DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', @@ -10584,7 +10588,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake_autocast'), # Booleans mismatch: AssertionError: False is not true DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake'), - )), + #DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=None), + ) + ), OpInfo('arange', dtypes=all_types_and(torch.bfloat16, torch.float16), supports_out=True, @@ -10592,6 +10598,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): is_factory_function=True, error_inputs_func=error_inputs_arange, sample_inputs_func=sample_inputs_arange, + skipXPU=False, skips=( # https://github.com/pytorch/pytorch/issues/81774 DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'), @@ -18012,6 +18019,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_repeat_tile, + skipXPU=True, skips=( DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), )), @@ -19198,6 +19206,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): result_dtype=torch.bool, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), ref=reference_reduction_numpy(np.all), + skipXPU=False, skips=( # FIXME: uint8 input returns uint8 instead of bool DecorateInfo(unittest.expectedFailure, 'TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 46e4f817d2b9d..b56d44c3904bb 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -227,7 +227,7 @@ def maybe_load_json(filename): if os.getenv("DISABLED_TESTS_FILE", ""): disabled_tests_dict = maybe_load_json(os.getenv("DISABLED_TESTS_FILE", "")) -NATIVE_DEVICES = ('cpu', 'cuda', 'meta', torch._C._get_privateuse1_backend_name()) +NATIVE_DEVICES = ('cpu', 'cuda', 'meta', 'xpu', torch._C._get_privateuse1_backend_name()) check_names = ['orin', 'concord', 'galen', 'xavier', 'nano', 'jetson', 'tegra'] IS_JETSON = any(name in platform.platform() for name in check_names) @@ -389,6 +389,8 @@ def composite_fn(test, generic_cls, device_cls, old_parametrize_fn=old_parametrize_fn, new_parametrize_fn=new_parametrize_fn): old_tests = list(old_parametrize_fn(test, generic_cls, device_cls)) + import pdb + pdb.set_trace() for (old_test, old_test_name, old_param_kwargs, old_dec_fn) in old_tests: for (new_test, new_test_name, new_param_kwargs, new_dec_fn) in \ new_parametrize_fn(old_test, generic_cls, device_cls): @@ -403,6 +405,8 @@ def composite_fn(test, generic_cls, device_cls, old_test_name) def merged_decorator_fn(param_kwargs, old_dec_fn=old_dec_fn, new_dec_fn=new_dec_fn): + import pdb + pdb.set_trace() return list(old_dec_fn(param_kwargs)) + list(new_dec_fn(param_kwargs)) yield (new_test, merged_test_name, full_param_kwargs, merged_decorator_fn) diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 70c643d2b8fee..02e430ed616c5 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -25,6 +25,7 @@ floating_and_complex_types, floating_and_complex_types_and, 
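One side effect worth noting: with 'xpu' added to NATIVE_DEVICES in common_utils.py above, tests guarded by onlyNativeDeviceTypes are no longer skipped for the xpu device class, since that decorator only checks membership in this tuple. A simplified sketch of that existing check (not new code in this patch):

    # inside the wrapper produced by onlyNativeDeviceTypes
    if self.device_type not in NATIVE_DEVICES:  # 'xpu' is now a member
        raise unittest.SkipTest(
            f"onlyNativeDeviceTypes: doesn't run on {self.device_type}")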
floating_types, + empty_types, ) from torch.testing._internal.common_utils import ( is_iterable_of_tensors, @@ -95,18 +96,26 @@ def __init__( self.dtypes = dtypes self.active_if = active_if + print("init decorators: {} {} {} {} {}".format(self.cls_name, self.test_name, self.device_type, self.dtypes, self.active_if)) + # Validate dtypes if self.dtypes is not None: for dtype in self.dtypes: assert isinstance(dtype, torch.dtype) def is_active(self, cls_name, test_name, device_type, dtype, param_kwargs): + print("is_active: {} {} {} {} {} {}".format(self.decorators, self.active_if, + (self.cls_name is None or self.cls_name == cls_name), + (self.test_name is None or self.test_name == test_name), + (self.device_type is None or self.device_type == device_type), + (self.dtypes is None or dtype in self.dtypes))) + print("is_active details: {} {} {} {}".format(self.cls_name, cls_name, self.test_name, test_name)) return ( self.active_if and (self.cls_name is None or self.cls_name == cls_name) and (self.test_name is None or self.test_name == test_name) and (self.device_type is None or self.device_type == device_type) - and (self.dtypes is None or dtype in self.dtypes) + and (self.dtypes is None or G in self.dtypes) # Support callables over kwargs to determine if the decorator is active. and ( self.active_if(param_kwargs) @@ -680,6 +689,9 @@ class OpInfo: # information about which tests to skip skips: Tuple = tuple() + # skip xpu by default + skipXPU: bool = True + # decorators to apply to generated tests decorators: Tuple = tuple() @@ -723,6 +735,9 @@ class OpInfo: # dtypes this function is expected to work with on CUDA dtypesIfCUDA: _dispatch_dtypes = None + # dtypes this function is expected to work with on XPU + dtypesIfXPU: _dispatch_dtypes = None + # dtypes this function is expected to work with on ROCM dtypesIfROCM: _dispatch_dtypes = None @@ -732,6 +747,9 @@ class OpInfo: # backward dtypes this function is expected to work with on CUDA backward_dtypesIfCUDA: _dispatch_dtypes = None + # backward dtypes this function is expected to work with on XPU + backward_dtypesIfXPU: _dispatch_dtypes = None + # backward dtypes this function is expected to work with on ROCM backward_dtypesIfROCM: _dispatch_dtypes = None @@ -945,6 +963,19 @@ def __post_init__(self): else self.dtypes ) ) + + self.backward_dtypesIfXPU = ( + set(self.backward_dtypesIfXPU) + if self.backward_dtypesIfXPU is not None + else ( + self.backward_dtypes + if self.backward_dtypes is not None + else self.dtypesIfXPU + if self.dtypesIfXPU is not None + else self.dtypes + ) + ) + self.backward_dtypes = ( set(self.backward_dtypes) if self.backward_dtypes is not None @@ -954,6 +985,11 @@ def __post_init__(self): self.dtypesIfCUDA = ( set(self.dtypesIfCUDA) if self.dtypesIfCUDA is not None else self.dtypes ) + + self.dtypesIfXPU = ( + set(self.dtypesIfXPU) if self.dtypesIfXPU is not None else self.dtypes + ) + self.dtypesIfROCM = ( set(self.dtypesIfROCM) if self.dtypesIfROCM is not None @@ -991,6 +1027,19 @@ def __post_init__(self): else: self.inplace_operator_variant = None + if self.skipXPU == True: + skip_dtypes= self.dtypesIfXPU + + if self.skips is not None: + #self.skips = (*self.skips, DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=skip_dtypes)) + self.skips = (*self.skips, DecorateInfo(unittest.skip, device_type="xpu", dtypes=None)) + else: + #self.skips = (DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=skip_dtypes)) + self.skips = 
(DecorateInfo(unittest.skip, device_type="xpu", dtypes=None)) + print("#### skipXPU on {} {} {}".format(self.name, skip_dtypes, self.skips)) + else: + print("#### Don't skipXPU on {}".format(self.name)) + self.decorators = (*self.decorators, *self.skips) # Specifying sample inputs function without specifying the @@ -1125,6 +1174,9 @@ def __post_init__(self): self.aliases = tuple(AliasInfo(a) for a in self.aliases) # type: ignore[assignment] else: self.aliases = () + + + def __call__(self, *args, **kwargs): """Calls the function variant of the operator.""" @@ -1329,6 +1381,8 @@ def sample_inputs_sparse_bsc(self, device, dtype, requires_grad=False, **kwargs) def get_decorators(self, test_class, test_name, device, dtype, param_kwargs): """Returns the decorators targeting the given test.""" result = [] + #import pdb + #pdb.set_trace() for decorator in self.decorators: if isinstance(decorator, DecorateInfo): if decorator.is_active( @@ -1345,6 +1399,9 @@ def supported_dtypes(self, device_type): device_type = torch.device(device_type).type if device_type == "cuda": return self.dtypesIfROCM if TEST_WITH_ROCM else self.dtypesIfCUDA + if device_type == "xpu": + return self.dtypesIfXPU + return self.dtypes def supported_backward_dtypes(self, device_type): @@ -1361,6 +1418,8 @@ def supported_backward_dtypes(self, device_type): if TEST_WITH_ROCM else self.backward_dtypesIfCUDA ) + elif device_type == "xpu": + backward_dtypes = self.backward_dtypesIfXPU else: backward_dtypes = self.backward_dtypes @@ -1515,6 +1574,7 @@ def __init__( yield tuple(), {}, ), + skipXPU: bool = True, # Options from the OpInfo base class **kwargs, ): @@ -1538,7 +1598,7 @@ def sample_inputs_func(*args, **kwargs): # Override OpInfo defaults and call base class __init__ kwargs.setdefault("inplace_variant", None) kwargs.setdefault("sample_inputs_func", sample_inputs_func) - super().__init__(name, promotes_int_to_float=promotes_int_to_float, **kwargs) + super().__init__(name, promotes_int_to_float=promotes_int_to_float, skipXPU = skipXPU, **kwargs) self.identity = identity self.nan_policy = nan_policy @@ -2103,6 +2163,7 @@ def __init__( supports_rhs_python_scalar=True, # Whether the operator allows Tensor x scalar inputs supports_one_python_scalar=False, # Whether the operator allows scalar x tensor and tensor x scalar inputs supports_two_python_scalars=False, # Whether the operator allows scalar x scalar inputs + skipXPU=True, **kwargs, ): self._original_binary_ufunc_args = locals().copy() @@ -2123,6 +2184,7 @@ def __init__( sample_inputs_func=sample_inputs_func, reference_inputs_func=reference_inputs_func, error_inputs_func=make_error_inputs_elementwise_binary(error_inputs_func), + skipXPU=skipXPU, **kwargs, ) @@ -2451,6 +2513,7 @@ def __init__( reference_inputs_func=reference_inputs_elementwise_unary, sample_kwargs=lambda device, dtype, input: ({}, {}), reference_numerics_filter=None, # Filters values in the range of the domain specified above but that should not be tested + skipXPU=True, **kwargs, ): self._original_unary_ufunc_args = locals().copy() @@ -2460,8 +2523,10 @@ def __init__( dtypes=dtypes, sample_inputs_func=sample_inputs_func, reference_inputs_func=reference_inputs_func, + skipXPU=skipXPU, **kwargs, ) + self.domain = domain self.handles_complex_extremal_values = handles_complex_extremal_values self.handles_large_floats = handles_large_floats @@ -2593,6 +2658,7 @@ def __init__( ndimensional: SpectralFuncType, sample_inputs_func=sample_inputs_spectral_ops, decorators=None, + skipXPU=True, **kwargs, ): 
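Putting the pieces above together: when an OpInfo leaves skipXPU=True, __post_init__ appends a DecorateInfo(unittest.skip, device_type="xpu") to its skips, those skips are folded into decorators, and get_decorators later matches that entry for any test instantiated on the xpu device class, so every generated *_xpu variant is skipped unless the op opts in with skipXPU=False. A condensed sketch of that flow (illustrative only; the real logic is in __post_init__ and get_decorators above):

    # at OpInfo construction time
    if op.skipXPU:
        op.skips = (*op.skips, DecorateInfo(unittest.skip, device_type="xpu"))
    op.decorators = (*op.decorators, *op.skips)

    # at test instantiation time, for a variant targeting the xpu device
    for decorator in op.get_decorators(cls_name, test_name, "xpu", dtype, param_kwargs):
        test = decorator(test)  # unittest.skip ends up wrapping the xpu variant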
self._original_spectral_func_args = dict(locals()).copy() @@ -2613,6 +2679,7 @@ def __init__( dtypes=dtypes, decorators=decorators, sample_inputs_func=sample_inputs_func, + skipXPU=skipXPU, **kwargs, ) self.ref = ref @@ -2631,6 +2698,7 @@ def __init__( dtypesIfCUDA=None, dtypesIfROCM=None, sample_inputs_func=None, + skipXPU=True, **kwargs, ): super().__init__( @@ -2639,6 +2707,7 @@ def __init__( dtypesIfCUDA=dtypesIfCUDA, dtypesIfROCM=dtypesIfROCM, sample_inputs_func=sample_inputs_func, + skipXPU=skipXPU, **kwargs, ) self.ref = ref From c791db9c4f7ae807f4d22bf9434df40aaf764428 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 12 May 2024 22:16:15 -0700 Subject: [PATCH 02/37] enabled torch-xpu ops in op_db --- test/test_ops.py | 1734 +++++++++-------- .../_internal/common_methods_invocations.py | 39 +- torch/testing/_internal/common_utils.py | 6 +- torch/testing/_internal/opinfo/core.py | 12 +- 4 files changed, 920 insertions(+), 871 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 44f503ae9b6ed..4b665336c1a50 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -32,10 +32,12 @@ instantiate_device_type_tests, onlyCPU, onlyCUDA, + onlyCUDAAndXPU, onlyNativeDeviceTypes, OpDTypes, ops, skipMeta, + skipXPU, ) from torch.testing._internal.common_dtype import ( all_types_and_complex_and, @@ -76,6 +78,7 @@ TEST_WITH_TORCHDYNAMO, TEST_WITH_TORCHINDUCTOR, TEST_WITH_UBSAN, + TEST_XPU, TestCase, unMarkDynamoStrictTest, ) @@ -105,6 +108,12 @@ ) ) +if TEST_XPU: + any_common_cpu_device_one = OpDTypes.any_common_cpu_xpu_one +else: + any_common_cpu_device_one = OpDTypes.any_common_cpu_cuda_one + + def reduction_dtype_filter(op): if ( @@ -127,6 +136,11 @@ def reduction_dtype_filter(op): aten = torch.ops.aten +_xpu_computation_op_list = ["_refs.abs", "_refs.all", "item", "abs", "add", "_refs.fill"] +_xpu_computation_op_list = ["abs"] +_xpu_computation_ops = [ + op for op in ops_and_refs if op.name in _xpu_computation_op_list +] # Tests that apply to all operators and aren't related to any particular # system @@ -153,9 +167,10 @@ def tearDownClass(cls): assert len(filtered_ops) == 0, err_msg # Validates that each OpInfo works correctly on different CUDA devices - @onlyCUDA + @onlyCUDAAndXPU @deviceCountAtLeast(2) @ops(op_db, allowed_dtypes=(torch.float32, torch.long)) + #@ops(_xpu_computation_ops, dtypes=any_common_cpu_device_one) def test_multiple_devices(self, devices, dtype, op): for cuda_device_str in devices: cuda_device = torch.device(cuda_device_str) @@ -271,7 +286,7 @@ def test_numpy_ref(self, device, dtype, op): and op.formatted_name in ("signal_windows_exponential", "signal_windows_bartlett") and dtype == torch.float64 - and "cuda" in device + and ("cuda" in device or "xpu" in device) ): # noqa: E121 raise unittest.SkipTest("XXX: raises tensor-likes are not close.") @@ -283,16 +298,19 @@ def test_numpy_ref(self, device, dtype, op): ) # Tests that the cpu and gpu results are consistent - @onlyCUDA + @onlyCUDAAndXPU @suppress_warnings @slowTest - @ops(_ops_and_refs_with_no_numpy_ref, dtypes=OpDTypes.any_common_cpu_cuda_one) + @ops(_ops_and_refs_with_no_numpy_ref, dtypes=any_common_cpu_device_one) + #@ops(_xpu_computation_ops, dtypes=any_common_cpu_device_one) def test_compare_cpu(self, device, dtype, op): def to_cpu(arg): if isinstance(arg, torch.Tensor): return arg.to(device="cpu") return arg + #import pdb + #pdb.set_trace() samples = op.reference_inputs(device, dtype) for sample in samples: @@ -540,7 +558,7 @@ def test_python_ref_torch_fallback(self, device, dtype, op): 
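The helpers introduced in patch 01 (onlyCUDAAndXPU, skipXPU, skipXPUIf) are used above in test_ops.py; outside this file they combine with the usual device-type test machinery in the same way. A small self-contained sketch (ExampleXPUTest is a hypothetical test class, not part of the patch):

    import torch
    from torch.testing._internal.common_device_type import (
        instantiate_device_type_tests,
        onlyCUDAAndXPU,
        skipXPUIf,
    )
    from torch.testing._internal.common_utils import run_tests, TestCase

    class ExampleXPUTest(TestCase):
        # runs only for the 'cuda' and 'xpu' device classes
        @onlyCUDAAndXPU
        def test_add_matches_cpu(self, device):
            x = torch.randn(4, device=device)
            self.assertEqual((x + x).cpu(), x.cpu() + x.cpu())

        # skipped on 'xpu' while the condition holds; other devices run it
        @skipXPUIf(True, "example: not yet supported on XPU")
        def test_not_ready_on_xpu(self, device):
            self.assertEqual(torch.ones(1, device=device).item(), 1.0)

    instantiate_device_type_tests(ExampleXPUTest, globals())

    if __name__ == "__main__":
        run_tests()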
self._ref_test_helper(contextlib.nullcontext, device, dtype, op) @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") - @onlyCUDA + @onlyCUDAAndXPU @ops(python_ref_db) @parametrize( "executor", @@ -590,6 +608,7 @@ def test_errors(self, device, op): out = op(si.input, *si.args, **si.kwargs) self.assertFalse(isinstance(out, type(NotImplemented))) + @skipXPU @skipMeta @onlyNativeDeviceTypes @ops( @@ -1002,6 +1021,9 @@ def _case_two_transform(t): wrong_device = "cpu" elif torch.cuda.is_available(): wrong_device = "cuda" + elif torch.xpu.is_available(): + # Daisy ???? + wrong_device = "xpu" factory_fn_msg = ( "\n\nNOTE: If your op is a factory function (i.e., it accepts TensorOptions) you should mark its " @@ -1818,857 +1840,859 @@ def check_cow_input( allow_list=op.allow_cow_input_materialize_backward, ) - @ops(op_db, allowed_dtypes=(torch.float,)) - def test_view_replay(self, device, dtype, op): - def _assert_match_metadata(a, b): - self.assertEqual(a.size(), b.size()) - self.assertEqual(a.stride(), b.stride()) - self.assertEqual(a.storage_offset(), b.storage_offset()) - self.assertEqual(a.device, b.device) - self.assertEqual(a.dtype, b.dtype) - - # ensure view replay is enabled - with torch.autograd._force_original_view_tracking(True): - for sample in op.sample_inputs(device, dtype, requires_grad=False): - inp = sample.input - outs = op(inp, *sample.args, **sample.kwargs) - if not isinstance(outs, (tuple, List)): - outs = [outs] - - # for all outputs that are views of the input, we should be able to replay the - # forward and reverse views via a functioning view_func() / rev_view_func(). - for out in outs: - if not ( - isinstance(out, torch.Tensor) - and out._is_view() - and out._base is inp - ): - continue - - # forward view_func - new_inp = inp.clone() - _assert_match_metadata(new_inp, inp) - new_out = out._view_func_unsafe(new_inp) - _assert_match_metadata(new_out, out) - self.assertEqual(new_out, out) - - # reverse view_func - new_out = out.detach() - new_inp = out._rev_view_func_unsafe(new_out) - _assert_match_metadata(new_inp, inp) - self.assertTrue(new_inp._is_view()) - self.assertTrue(new_inp._base is new_out) - - -@unMarkDynamoStrictTest -class TestMathBits(TestCase): - # Tests that - # 1. The operator's output for physically conjugated/negated tensors and conjugate/negative view tensors - # produces the same value - # 2. The gradients are same in both cases mentioned in (1) - # 3. If the operator's inplace variant is supported, tests that the inplace operation - # produces the correct value when called on a conjugate/negative view tensor and that the output - # has its conj/neg bit set to true - # This test only runs for C -> R and C -> C functions - # TODO: add tests for `R->C` functions - # Note: This test runs for functions that take both tensors and tensorlists as input. - def _test_math_view( - self, - device, - dtype, - op, - samples, - math_op_physical, - math_op_view, - is_bit_set, - out_type, - ): - inplace_variant = op.inplace_variant - - # helper function to clone and conjugate/negate the input if its a tensor - # else clone the sequence and conjugate/negate the first element in the sequence - # If a requires_grad argument is provided the tensor being conjugated/negated will - # have its requires_grad set to that value. 
- def clone_and_perform_view(input, **kwargs): - if isinstance(input, torch.Tensor): - requires_grad = kwargs.get("requires_grad", input.requires_grad) - with torch.no_grad(): - # Ensure view represents the original sample input - input = math_op_physical(input) - # Note: .conj() is not called under no_grad mode since it's not allowed to modify a - # view created in no_grad mode. Here it's ok to do so, so as a workaround we call conj - # before resetting the requires_grad field for input - input = math_op_view(input) - assert input.is_leaf - return input.requires_grad_(requires_grad) - - if isinstance(input, Sequence): - out = list(map(clone_input_helper, input)) - out[0] = clone_and_perform_view(out[0]) - return tuple(out) - - for sample in samples: - tensor = ( - sample.input - if isinstance(sample.input, torch.Tensor) - else sample.input[0] - ) - cloned1 = clone_and_perform_view(sample.input) - - # Computes function forward value with a physically conjugated/negated tensor and - # a conj/neg view tensor and verifies that the output in both case are equal. - expected_forward = op(sample.input, *sample.args, **sample.kwargs) - forward_with_mathview = op(cloned1, *sample.args, **sample.kwargs) - self.assertEqual(expected_forward, forward_with_mathview) - - # If the op has an inplace variant, and the input doesn't require broadcasting - # and has the same dtype as output, verify that the inplace operation on a conjugated/negated - # input produces correct output, and the output tensor has the conj/neg bit set to True - if inplace_variant is not None and not sample.broadcasts_input: - cloned2 = clone_and_perform_view(tensor, requires_grad=False) - if ( - isinstance(expected_forward, torch.Tensor) - and expected_forward.dtype is tensor.dtype - ): - inplace_forward = inplace_variant( - cloned2, *sample.args, **sample.kwargs - ) - self.assertTrue(is_bit_set(inplace_forward)) - self.assertEqual(inplace_forward, expected_forward) - - # TODO: backward consistency only supported for single tensor outputs - # TODO: backward consistency only checked on sample.input, not all - # tensor inputs - # TODO: update to handle checking grads of all tensor inputs as - # derived from each tensor output - if ( - isinstance(expected_forward, torch.Tensor) - and expected_forward.requires_grad - ): - output_process_fn_grad = sample.output_process_fn_grad or (lambda x: x) - expected_forward = output_process_fn_grad(expected_forward) - forward_with_mathview = output_process_fn_grad(forward_with_mathview) - - tensor = ( - sample.input - if isinstance(sample.input, torch.Tensor) - else sample.input[0] - ) - expected_forward.sum().abs().backward(retain_graph=True) - forward_with_mathview.sum().abs().backward(retain_graph=True) - if tensor.grad is not None: - cloned1_tensor = ( - cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] - ) - self.assertEqual(tensor.grad, cloned1_tensor.grad) - - tensor.grad, cloned1_tensor.grad = None, None - - # a repeat of the above test if output is not complex valued - if out_type(expected_forward): - grad = torch.randn_like(expected_forward) - expected_forward.backward(grad) - forward_with_mathview.backward( - math_op_view(math_op_physical(grad)) - ) - - self.assertEqual(tensor.grad, cloned1_tensor.grad) - - @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) - def test_conj_view(self, device, dtype, op): - if not op.test_conjugated_samples: - self.skipTest("Operation doesn't support conjugated inputs.") - math_op_physical = torch.conj_physical - math_op_view = torch.conj - 
_requires_grad = torch.cfloat in op.supported_backward_dtypes( - torch.device(device).type - ) - is_bit_set = torch.is_conj - samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) - self._test_math_view( - device, - dtype, - op, - samples, - math_op_physical, - math_op_view, - is_bit_set, - torch.is_complex, - ) - - @ops(ops_and_refs, allowed_dtypes=(torch.double,)) - def test_neg_view(self, device, dtype, op): - if not op.test_neg_view: - self.skipTest("Operation not tested with tensors with negative bit.") - math_op_physical = torch.neg - math_op_view = torch._neg_view - is_bit_set = torch.is_neg - samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) - self._test_math_view( - device, - dtype, - op, - samples, - math_op_physical, - math_op_view, - is_bit_set, - lambda x: True, - ) - - @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) - def test_neg_conj_view(self, device, dtype, op): - if not op.test_neg_view: - self.skipTest("Operation not tested with tensors with negative bit.") - if not op.test_conjugated_samples: - self.skipTest("Operation doesn't support conjugated inputs.") - - def math_op_physical(x): - return -x.conj_physical() - - def math_op_view(x): - return torch._neg_view(x).conj() - - def is_bit_set(x): - return torch.is_neg(x) and torch.is_conj(x) - - _requires_grad = dtype in op.supported_backward_dtypes( - torch.device(device).type - ) - samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) - # Only test one sample - samples = itertools.islice(samples, 1) - self._test_math_view( - device, - dtype, - op, - samples, - math_op_physical, - math_op_view, - is_bit_set, - torch.is_complex, - ) - - -# input strides and size may have been altered due to the result of an inplace op -def check_inplace_view(func, input, rs, input_size, input_strides): - if func is None: - return - # TODO: extend this test to test ops with multiple outputs and ops like native_batch_norm(_legit).out - # which mutate not necessarily the first input. - if isinstance(rs, torch.Tensor) and rs is input: - unequal_size = rs.size() != input_size - unequal_strides = rs.stride() != input_strides - # resize_ should probably have inplace_view tag. 
Not adding the tag since it - # breaks some codegen logic - if unequal_size or unequal_strides: - if isinstance(func, torch._ops.OpOverloadPacket): - func = func.default - # Reference: https://github.com/pytorch/pytorch/issues/78759 - if func is not torch.ops.aten.resize_.default: - # TODO: use self.assertIn when we have separate tests for each tag - assert torch.Tag.inplace_view in func.tags - - -# A mode that when enabled runs correctness checks to ensure -# that operators have expected tags based on their input and -# output tensor properties -class TestTagsMode(TorchDispatchMode): - def __torch_dispatch__(self, func, types, args=(), kwargs=None): - if isinstance(args[0], torch.Tensor): - old_size = args[0].size() - old_stride = args[0].stride() - rs = func(*args, **kwargs) - check_inplace_view(func, args[0], rs, old_size, old_stride) - else: - rs = func(*args, **kwargs) - return rs - - -# Test to verify the correctness for tags in `tags.yaml`, also available for access through `torch.Tags` -@unMarkDynamoStrictTest -class TestTags(TestCase): - @onlyCPU - @ops(ops_and_refs, dtypes=OpDTypes.any_one) - def test_tags(self, device, dtype, op): - samples = op.sample_inputs(device, dtype, requires_grad=False) - for sample in samples: - # TODO: Test tags for ops that return a list of tensors - input = sample.input - if isinstance(input, torch.Tensor): - old_size = input.size() - old_stride = input.stride() - with TestTagsMode(): - rs = op(input, *sample.args, **sample.kwargs) - # TODO: add test for aliases: https://github.com/pytorch/pytorch/issues/78761 - aten_name = op.aten_name if op.aten_name is not None else op.name - opoverloadpacket = getattr(torch.ops.aten, aten_name, None) - check_inplace_view(opoverloadpacket, input, rs, old_size, old_stride) - - -class TestSelfKwarg(TestCase): - def test_self_kwargs(self): - """Verify that we can call the aten ops with all kwargs even if the - argument's name is "self" - """ - torch.ops.aten.reshape.default(self=torch.rand(1, 2), shape=[2]) - torch.ops.aten.min.default(self=torch.rand(100)) - - -@unMarkDynamoStrictTest -class TestRefsOpsInfo(TestCase): - import_paths = [ - "_refs", - "_refs.special", - "_refs.nn.functional", - "_refs.fft", - "_refs._conversions", - ] - module_alls = [ - (path, import_module(f"torch.{path}").__all__) for path in import_paths - ] - ref_ops_names = tuple( - itertools.chain.from_iterable( - [f"{path}.{op}" for op in module_all] for path, module_all in module_alls - ) - ) - ref_db_names = {ref_op.name for ref_op in python_ref_db} - - # TODO: References that do not have an entry in python_ref_db - skip_ref_ops = { - "_refs.alias", - "_refs.bitwise_right_shift", - "_refs.copy_to", - "_refs.empty_permuted", - "_refs.empty_strided", - "_refs.equal", - "_refs.full", - "_refs.full_like", - "_refs.is_complex", - "_refs.to", - "_refs.mvlgamma", - "_refs.ones", - "_refs.ones_like", - "_refs.special.expit", - "_refs.std_var", - "_refs.swap_axes", - "_refs.uniform", - "_refs.scalar_tensor", - "_refs.trunc_divide", - "_refs.zero", - "_refs.zeros", - "_refs.zeros_like", - "_refs.rfloordiv", - "_refs.rtruediv", - "_refs.rpow", - # These should be tested with their out-of-place counterparts - "_refs.index_add_", - "_refs.index_copy_", - "_refs.index_fill_", - "_refs.native_group_norm", - } - - not_in_decomp_table = { - # duplicated in _decomp and _refs - "_refs.nn.functional.group_norm", - "_refs.nn.functional.mse_loss", - "_refs.floor_divide", - # duplicated as refs do not have decent support for advanced indexing - "_refs.index_copy", 
- "_refs.index_copy_", - "_refs.index_add", - "_refs.index_add_", - # these are not aten ops? - "_refs._conversions.bfloat16", - "_refs._conversions.bool", - "_refs._conversions.byte", - "_refs._conversions.char", - "_refs._conversions.double", - "_refs._conversions.float", - "_refs._conversions.half", - "_refs._conversions.int", - "_refs._conversions.long", - "_refs._conversions.short", - "_refs._conversions.chalf", - "_refs._conversions.cfloat", - "_refs._conversions.cdouble", - "_refs.broadcast_shapes", - "_refs.broadcast_tensors", - "_refs.mvlgamma", - "_refs.nn.functional.layer_norm", - "_refs.nn.functional.tanhshrink", - "_refs.nn.functional.triplet_margin_loss", - "_refs.rfloordiv", - "_refs.rtruediv", - "_refs.rpow", - # CompositeImplicitAutograd - "_refs.allclose", - "_refs.atleast_1d", - "_refs.atleast_2d", - "_refs.atleast_3d", - "_refs.broadcast_to", - "_refs.chunk", - "_refs.column_stack", - "_refs.contiguous", - "_refs.dsplit", - "_refs.dstack", - "_refs.fill", - "_refs.fill_", - "_refs.flatten", - "_refs.fliplr", - "_refs.flipud", - "_refs.float_power", - "_refs.hsplit", - "_refs.hstack", - "_refs.isclose", - "_refs.isfinite", - "_refs.isreal", - "_refs.istft", - "_refs.log_softmax", - "_refs.movedim", - "_refs.narrow", - "_refs.nn.functional.dropout", - "_refs.nn.functional.l1_loss", - "_refs.nn.functional.smooth_l1_loss", - "_refs.nn.functional.log_softmax", - "_refs.nn.functional.poisson_nll_loss", - "_refs.nn.functional.softmax", - "_refs.nn.functional.softmin", - "_refs.positive", - "_refs.ravel", - "_refs.reshape", - "_refs.softmax", - "_refs.special.expit", - "_refs.special.log_softmax", - "_refs.special.softmax", - "_refs.square", - "_refs.stft", - "_refs.T", - "_refs.take_along_dim", - "_refs.tensor_split", - "_refs.to", - "_refs.true_divide", - "_refs.trunc_divide", - "_refs.vsplit", - "_refs.vstack", - "_refs.linalg.matrix_norm", - "_refs.linalg.norm", - "_refs.linalg.svd", - "_refs.linalg.svdvals", - "_refs.unflatten", - "_refs.sum_to_size", - # ref implementation missing kwargs - "_refs.full_like", # missing "layout" - "_refs.scalar_tensor", # missing "layout" - # other - "_refs.block_diag", # only refs._block_diag_iterable is in decomposition table - "_refs.empty", # intentional; direct empty is faster and has less guards - "_refs.empty_permuted", # intentional; direct empty is faster and has less guards - "_refs.expand_as", - "_refs.as_strided", # _prims._as_strided_meta: "reduce() of empty sequence with no initial value" - "_refs.copy_to", # torch._C._jit_get_operation: No such operator aten::copy_to - "_refs.equal", # 'bool' object has no attribute 'dtype' - "_refs.conj", # Calls _prims.conj - "_refs.real", - "_refs.imag", - "_refs.reshape_as", - "_refs.view_as", - "_refs.view_as_complex", # TorchInductor does not support complex at the moment. 
- # the decompositions for these ops are slightly different - # because of out handling - "_refs.var_mean", - "_refs.std_mean", - "_refs.native_layer_norm", - } - - @parametrize("op", ref_ops_names) - def test_refs_are_in_python_ref_db(self, op): - inplace = op[-1] == "_" - if op in self.skip_ref_ops: - raise unittest.SkipTest(f"{op} does not have an entry in python_ref_db") - elif inplace: - self.assertNotIn( - op, - self.ref_db_names, - msg=f"{op} is an in-place operation and should not have an OpInfo", - ) - else: - # Intentionally don't use assertIn to avoid printing the - # (very large) container - self.assertTrue(op in self.ref_db_names, msg=f"{op} not in ref_db_names") - - @parametrize("op", ref_ops_names) - def test_refs_are_in_decomp_table(self, op): - path = op.split(".") - module_path = ".".join(path[:-1]) - op_name = path[-1] - op_impl = getattr(import_module(f"torch.{module_path}"), op_name) - - if op in self.not_in_decomp_table: - self.assertNotIn( - op_impl, - torch._decomp.decomposition_table.values(), - f"Unexpectedly found {op} in torch._decomp.decomposition_table.values()", - ) - else: - self.assertIn( - op_impl, - torch._decomp.decomposition_table.values(), - f"Did not find {op} in torch._decomp.decomposition_table.values()", - ) - - -fake_skips = ( - "aminmax", # failing input - "cov", # aweights cannot be negtaive - "istft", # window overlap add min: 0 - "linalg.eigvals", # The tensor has a non-zero number of elements, but its data is not allocated yet - "linalg.eigvalsh", # aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend - "linalg.matrix_power", # Could not run 'aten::eye.m_out' with arguments from the 'Meta' backend - # "linalg.pinv", # Could not run 'aten::pinv.out' with arguments from the 'Meta' backen - "linalg.matrix_rank.hermitian", # Could not run 'aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend - "linalg.pinv.hermitian", # tensor.mH is only supported on matrices or batches of matrices. Got 1-D tensor - "linalg.solve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' backend - "linalg.tensorsolve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' - "lu_solve", # MALLOC ERROR: debug - "multinomial", # Could not run 'aten::multinomial' with arguments from the 'Meta' backend - "mvlgamma.mvlgamma_p_1", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend - "mvlgamma.mvlgamma_p_3", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend - "mvlgamma.mvlgamma_p_5", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend - "nanmean", # logical_not() got an unexpected keyword argument 'out' - "quantile", # quantile() q values must be in the range [0, 1] - "nanquantile", # quantile() q values must be in the range [0, 1] - "nn.functional.ctc_loss", # The tensor has a non-zero number of elements, but its data is not allocated yet - "nn.functional.embedding_bag", # sometimes errors - "nn.functional.nll_loss", # sometimes errors - "nn.functional.max_pool1d", # The tensor has a non-zero number of elements - "to_sparse", # Could not run 'aten::_to_sparse' with arguments from the 'Meta' backend - "tensor_split", # The tensor has a non-zero number of elements, but its data is not allocated yet - "repeat_interleave", # cannot repeat_interleave a meta tensor without output_size - "sparse.sampled.addmm", # sparsity not supported - # Can not infer total number of classes from meta. 
no way at present to throw DynamicOutputShapeException - "nn.functional.one_hot", - "narrow", # Fails only for one overload with DataDependentOutputException (hence skip). -) - -fake_autocast_device_skips = defaultdict(dict) - -# TODO: investigate/fix -fake_autocast_device_skips["cpu"] = {"linalg.pinv"} - - -dynamic_output_op_tests = ( - "argwhere", - "bincount", - "combinations", - "linalg.lstsq", - "masked_select", - "nonzero", - "unique_consecutive", - "unique", - "linalg.lstsq.grad_oriented", -) - -# Ops that have dynamic output shapes that we can handle when -# allow_dynamic_shape_ops is True in fake tensor shape environment. -supported_dynamic_output_op_tests = ( - "nonzero", - "unique", - "repeat_interleave", - "masked_select", -) - -# some inputs invoke dynamic output shape operators, some do not -sometimes_dynamic_output_op_test = ( - "__getitem__", - "index_select", -) - -data_dependent_op_tests = ( - "equal", - "corrcoef", - "nn.functional.gaussian_nll_loss", - "allclose", -) - -aliasing_failures = ("histogramdd",) - -fake_backward_skips = { - "linalg.cond", - "linalg.matrix_norm", - "linalg.norm", - "linalg.svd", - "linalg.svdvals", - "pca_lowrank", - "roll", - "svd_lowrank", - "sgn", -} - -fake_backward_xfails = {skip(s) for s in fake_backward_skips} | { - xfail("fft.ihfftn"), # Mismatch in aten._conj_physical.default - xfail("fft.ihfft2"), # Mismatch in aten._conj_physical.default - skip("nn.functional.ctc_loss"), -} - -fake_autocast_backward_xfails = { - skip("nn.functional.binary_cross_entropy"), - skip("sparse.sampled_addmm"), - skip("linalg.pinv"), - skip("linalg.pinv", "hermitian"), - skip("linalg.pinv", "singular"), - skip("pinverse"), -} - - -@unMarkDynamoStrictTest -class TestFakeTensor(TestCase): - def setUp(self): - # Turn on FakeTensor caching and cross-checking for these tests: - cache_enabled = unittest.mock.patch( - "torch._dynamo.config.fake_tensor_cache_enabled", True - ) - cache_enabled.start() - self.addCleanup(cache_enabled.stop) - - cache_crosscheck = unittest.mock.patch( - "torch._dynamo.config.fake_tensor_cache_crosscheck_enabled", True - ) - cache_crosscheck.start() - self.addCleanup(cache_crosscheck.stop) - - def _test_fake_helper(self, device, dtype, op, context): - name = op.name - if op.variant_test_name: - name += "." 
+ op.variant_test_name - if name in fake_skips or "sparse" in name or "jiterator" in name: - self.skipTest("Skip failing test") - - samples = op.sample_inputs(device, dtype, requires_grad=False) - for sample in samples: - mode = FakeTensorMode() - - from torch.fx.experimental.symbolic_shapes import ShapeEnv - - allow_dynamic_output_shape_shape_env = ShapeEnv( - allow_dynamic_output_shape_ops=True - ) - - allow_dynamic_output_shape_mode = FakeTensorMode( - shape_env=allow_dynamic_output_shape_shape_env - ) - - try: - with context(): - res = op(sample.input, *sample.args, **sample.kwargs) - except Exception: - continue - - def run_with_fake_mode_and_verify(fake_mode, match_results=True): - def map_to_fake(e): - if isinstance(e, torch.Tensor): - return fake_mode.from_tensor(e) - else: - return e - - input = tree_map(map_to_fake, sample.input) - args = tree_map(map_to_fake, sample.args) - kwargs = tree_map(map_to_fake, sample.kwargs) - - try: - with context(): - with fake_mode: - res_fake = op(input, *args, **kwargs) - - if not match_results: - return - - for fake_out, real_out in zip( - pytree.tree_leaves(res_fake), pytree.tree_leaves(res) - ): - if not isinstance(fake_out, torch.Tensor): - self.assertTrue(not isinstance(real_out, torch.Tensor)) - self.assertEqual(fake_out, real_out) - continue - - self.assertTrue(isinstance(fake_out, FakeTensor)) - # if you see a shape exception here, you may need to add - # a `dynamic_output_shape` tag to an operator - - # prims/decomps must correctly model strides, - # see https://github.com/pytorch/pytorch/issues/78050#issuecomment-1253950325 - prims.utils.compare_tensor_meta(fake_out, real_out, True) - - if name not in aliasing_failures: - fake_aliasing = outputs_alias_inputs( - (input, args, kwargs), res_fake - ) - real_aliasing = outputs_alias_inputs( - (sample.input, sample, args, sample.kwargs), res - ) - self.assertEqual(fake_aliasing, real_aliasing) - - self.assertTrue( - name not in dynamic_output_op_tests - and name not in data_dependent_op_tests - ) - - except torch._subclasses.fake_tensor.UnsupportedFakeTensorException: - pass - except torch._subclasses.fake_tensor.UnsupportedOperatorException: - pass - except torch._subclasses.fake_tensor.DynamicOutputShapeException: - self.assertTrue( - name in dynamic_output_op_tests - or name in sometimes_dynamic_output_op_test - ) - self.assertTrue( - mode.shape_env is None - or not mode.shape_env.allow_dynamic_output_shape_ops - or name not in supported_dynamic_output_op_tests - ) - except torch._subclasses.fake_tensor.DataDependentOutputException: - self.assertTrue(name in data_dependent_op_tests) - - run_with_fake_mode_and_verify(mode) - if name in supported_dynamic_output_op_tests: - run_with_fake_mode_and_verify( - allow_dynamic_output_shape_mode, match_results=False - ) - - @ops(op_db, dtypes=OpDTypes.any_one) - def test_pointwise_ops(self, device, dtype, op): - name = op.name - if op.variant_test_name: - name += "." 
+ op.variant_test_name - if name in fake_skips or "sparse" in name or "jiterator" in name: - self.skipTest("Skip failing test") - - test_self = self - - class TestPointwiseMode(TorchDispatchMode): - def __torch_dispatch__(self, func, types, args=(), kwargs=None): - kwargs = kwargs or {} - - out = func(*args, **kwargs) - - if torch.Tag.pointwise in func.tags: - shapes = [] - for inp in pytree.arg_tree_leaves(*args, **kwargs): - if isinstance(inp, torch.Tensor): - shapes.append(inp.shape) - - out_shape = torch._refs._broadcast_shapes(*shapes) - - for out_elem in pytree.tree_leaves(out): - if isinstance(out_elem, torch.Tensor): - test_self.assertEqual(out_elem.shape, out_shape) - - return out - - samples = op.sample_inputs(device, dtype, requires_grad=False) - for sample in samples: - mode = FakeTensorMode() - - def map_to_fake(e): - if isinstance(e, torch.Tensor): - return mode.from_tensor(e) - else: - return e - - input = tree_map(map_to_fake, sample.input) - args = tree_map(map_to_fake, sample.args) - kwargs = tree_map(map_to_fake, sample.kwargs) - - try: - op(input, *args, **kwargs) - except Exception as e: - continue - - with TestPointwiseMode(): - with mode: - op(input, *args, **kwargs) - - @ops(op_db, dtypes=OpDTypes.any_one) - def test_fake(self, device, dtype, op): - self._test_fake_helper(device, dtype, op, contextlib.nullcontext) - - @ops(op_db, dtypes=OpDTypes.any_one) - def test_fake_autocast(self, device, dtype, op): - if op.name in fake_autocast_device_skips[device]: - self.skipTest("Skip failing test") - context = ( - torch.cuda.amp.autocast if device == "cuda" else torch.cpu.amp.autocast - ) - self._test_fake_helper(device, dtype, op, context) - - def _test_fake_crossref_helper(self, device, dtype, op, context): - samples = op.sample_inputs(device, dtype, requires_grad=True) - - for iter, sample in enumerate(samples): - args = [sample.input] + list(sample.args) - kwargs = sample.kwargs - - # skip these to speed up tests - common_skip_ops = ( - aten.detach.default, - aten.empty_strided.default, - aten.copy_.default, - aten.is_same_size.default, - ) - - # TODO: enable check_aliasing, batch norm fails - try: - with torch._subclasses.CrossRefFakeMode( - ignore_op_fn=lambda fn: fn in common_skip_ops, check_aliasing=True - ): - with warnings.catch_warnings(), context(), torch.autograd.set_multithreading_enabled( - False - ): - composite_compliance.compute_expected_grads( - op.get_op(), - args, - kwargs, - sample.output_process_fn_grad, - op.gradcheck_wrapper, - ) - except torch._subclasses.fake_tensor.UnsupportedOperatorException: - pass - - @onlyCUDA - @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) - @skipOps( - "TestFakeTensor", "test_fake_crossref_backward_no_amp", fake_backward_xfails - ) - def test_fake_crossref_backward_no_amp(self, device, dtype, op): - self._test_fake_crossref_helper(device, dtype, op, contextlib.nullcontext) - - @onlyCUDA - @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) - @skipOps( - "TestFakeTensor", - "test_fake_crossref_backward_amp", - fake_backward_xfails | fake_autocast_backward_xfails, - ) - def test_fake_crossref_backward_amp(self, device, dtype, op): - self._test_fake_crossref_helper(device, dtype, op, torch.cuda.amp.autocast) - - @ops([op for op in ops_and_refs if op.is_factory_function]) - def test_strided_layout(self, device, dtype, op): - samples = op.sample_inputs(device, dtype) - for sample in samples: - kwargs = sample.kwargs.copy() - kwargs["layout"] = torch.strided - 
strided_result = op(sample.input, *sample.args, **kwargs) - self.assertEqual(strided_result.layout, torch.strided) - - -instantiate_device_type_tests(TestCommon, globals()) -instantiate_device_type_tests(TestCompositeCompliance, globals()) -instantiate_device_type_tests(TestMathBits, globals()) -instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") -instantiate_device_type_tests(TestFakeTensor, globals()) -instantiate_device_type_tests(TestTags, globals()) +# @ops(op_db, allowed_dtypes=(torch.float,)) +# def test_view_replay(self, device, dtype, op): +# def _assert_match_metadata(a, b): +# self.assertEqual(a.size(), b.size()) +# self.assertEqual(a.stride(), b.stride()) +# self.assertEqual(a.storage_offset(), b.storage_offset()) +# self.assertEqual(a.device, b.device) +# self.assertEqual(a.dtype, b.dtype) + +# # ensure view replay is enabled +# with torch.autograd._force_original_view_tracking(True): +# for sample in op.sample_inputs(device, dtype, requires_grad=False): +# inp = sample.input +# outs = op(inp, *sample.args, **sample.kwargs) +# if not isinstance(outs, (tuple, List)): +# outs = [outs] + +# # for all outputs that are views of the input, we should be able to replay the +# # forward and reverse views via a functioning view_func() / rev_view_func(). +# for out in outs: +# if not ( +# isinstance(out, torch.Tensor) +# and out._is_view() +# and out._base is inp +# ): +# continue + +# # forward view_func +# new_inp = inp.clone() +# _assert_match_metadata(new_inp, inp) +# new_out = out._view_func_unsafe(new_inp) +# _assert_match_metadata(new_out, out) +# self.assertEqual(new_out, out) + +# # reverse view_func +# new_out = out.detach() +# new_inp = out._rev_view_func_unsafe(new_out) +# _assert_match_metadata(new_inp, inp) +# self.assertTrue(new_inp._is_view()) +# self.assertTrue(new_inp._base is new_out) + + +# @unMarkDynamoStrictTest +# class TestMathBits(TestCase): +# # Tests that +# # 1. The operator's output for physically conjugated/negated tensors and conjugate/negative view tensors +# # produces the same value +# # 2. The gradients are same in both cases mentioned in (1) +# # 3. If the operator's inplace variant is supported, tests that the inplace operation +# # produces the correct value when called on a conjugate/negative view tensor and that the output +# # has its conj/neg bit set to true +# # This test only runs for C -> R and C -> C functions +# # TODO: add tests for `R->C` functions +# # Note: This test runs for functions that take both tensors and tensorlists as input. +# def _test_math_view( +# self, +# device, +# dtype, +# op, +# samples, +# math_op_physical, +# math_op_view, +# is_bit_set, +# out_type, +# ): +# inplace_variant = op.inplace_variant + +# # helper function to clone and conjugate/negate the input if its a tensor +# # else clone the sequence and conjugate/negate the first element in the sequence +# # If a requires_grad argument is provided the tensor being conjugated/negated will +# # have its requires_grad set to that value. +# def clone_and_perform_view(input, **kwargs): +# if isinstance(input, torch.Tensor): +# requires_grad = kwargs.get("requires_grad", input.requires_grad) +# with torch.no_grad(): +# # Ensure view represents the original sample input +# input = math_op_physical(input) +# # Note: .conj() is not called under no_grad mode since it's not allowed to modify a +# # view created in no_grad mode. 
Here it's ok to do so, so as a workaround we call conj +# # before resetting the requires_grad field for input +# input = math_op_view(input) +# assert input.is_leaf +# return input.requires_grad_(requires_grad) + +# if isinstance(input, Sequence): +# out = list(map(clone_input_helper, input)) +# out[0] = clone_and_perform_view(out[0]) +# return tuple(out) + +# for sample in samples: +# tensor = ( +# sample.input +# if isinstance(sample.input, torch.Tensor) +# else sample.input[0] +# ) +# cloned1 = clone_and_perform_view(sample.input) + +# # Computes function forward value with a physically conjugated/negated tensor and +# # a conj/neg view tensor and verifies that the output in both case are equal. +# expected_forward = op(sample.input, *sample.args, **sample.kwargs) +# forward_with_mathview = op(cloned1, *sample.args, **sample.kwargs) +# self.assertEqual(expected_forward, forward_with_mathview) + +# # If the op has an inplace variant, and the input doesn't require broadcasting +# # and has the same dtype as output, verify that the inplace operation on a conjugated/negated +# # input produces correct output, and the output tensor has the conj/neg bit set to True +# if inplace_variant is not None and not sample.broadcasts_input: +# cloned2 = clone_and_perform_view(tensor, requires_grad=False) +# if ( +# isinstance(expected_forward, torch.Tensor) +# and expected_forward.dtype is tensor.dtype +# ): +# inplace_forward = inplace_variant( +# cloned2, *sample.args, **sample.kwargs +# ) +# self.assertTrue(is_bit_set(inplace_forward)) +# self.assertEqual(inplace_forward, expected_forward) + +# # TODO: backward consistency only supported for single tensor outputs +# # TODO: backward consistency only checked on sample.input, not all +# # tensor inputs +# # TODO: update to handle checking grads of all tensor inputs as +# # derived from each tensor output +# if ( +# isinstance(expected_forward, torch.Tensor) +# and expected_forward.requires_grad +# ): +# output_process_fn_grad = sample.output_process_fn_grad or (lambda x: x) +# expected_forward = output_process_fn_grad(expected_forward) +# forward_with_mathview = output_process_fn_grad(forward_with_mathview) + +# tensor = ( +# sample.input +# if isinstance(sample.input, torch.Tensor) +# else sample.input[0] +# ) +# expected_forward.sum().abs().backward(retain_graph=True) +# forward_with_mathview.sum().abs().backward(retain_graph=True) +# if tensor.grad is not None: +# cloned1_tensor = ( +# cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] +# ) +# self.assertEqual(tensor.grad, cloned1_tensor.grad) + +# tensor.grad, cloned1_tensor.grad = None, None + +# # a repeat of the above test if output is not complex valued +# if out_type(expected_forward): +# grad = torch.randn_like(expected_forward) +# expected_forward.backward(grad) +# forward_with_mathview.backward( +# math_op_view(math_op_physical(grad)) +# ) + +# self.assertEqual(tensor.grad, cloned1_tensor.grad) + +# @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) +# def test_conj_view(self, device, dtype, op): +# if not op.test_conjugated_samples: +# self.skipTest("Operation doesn't support conjugated inputs.") +# math_op_physical = torch.conj_physical +# math_op_view = torch.conj +# _requires_grad = torch.cfloat in op.supported_backward_dtypes( +# torch.device(device).type +# ) +# is_bit_set = torch.is_conj +# samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) +# self._test_math_view( +# device, +# dtype, +# op, +# samples, +# math_op_physical, +# math_op_view, +# 
is_bit_set, +# torch.is_complex, +# ) + +# @ops(ops_and_refs, allowed_dtypes=(torch.double,)) +# def test_neg_view(self, device, dtype, op): +# if not op.test_neg_view: +# self.skipTest("Operation not tested with tensors with negative bit.") +# math_op_physical = torch.neg +# math_op_view = torch._neg_view +# is_bit_set = torch.is_neg +# samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) +# self._test_math_view( +# device, +# dtype, +# op, +# samples, +# math_op_physical, +# math_op_view, +# is_bit_set, +# lambda x: True, +# ) + +# @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) +# def test_neg_conj_view(self, device, dtype, op): +# if not op.test_neg_view: +# self.skipTest("Operation not tested with tensors with negative bit.") +# if not op.test_conjugated_samples: +# self.skipTest("Operation doesn't support conjugated inputs.") + +# def math_op_physical(x): +# return -x.conj_physical() + +# def math_op_view(x): +# return torch._neg_view(x).conj() + +# def is_bit_set(x): +# return torch.is_neg(x) and torch.is_conj(x) + +# _requires_grad = dtype in op.supported_backward_dtypes( +# torch.device(device).type +# ) +# samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) +# # Only test one sample +# samples = itertools.islice(samples, 1) +# self._test_math_view( +# device, +# dtype, +# op, +# samples, +# math_op_physical, +# math_op_view, +# is_bit_set, +# torch.is_complex, +# ) + + +# # input strides and size may have been altered due to the result of an inplace op +# def check_inplace_view(func, input, rs, input_size, input_strides): +# if func is None: +# return +# # TODO: extend this test to test ops with multiple outputs and ops like native_batch_norm(_legit).out +# # which mutate not necessarily the first input. +# if isinstance(rs, torch.Tensor) and rs is input: +# unequal_size = rs.size() != input_size +# unequal_strides = rs.stride() != input_strides +# # resize_ should probably have inplace_view tag. 
Not adding the tag since it +# # breaks some codegen logic +# if unequal_size or unequal_strides: +# if isinstance(func, torch._ops.OpOverloadPacket): +# func = func.default +# # Reference: https://github.com/pytorch/pytorch/issues/78759 +# if func is not torch.ops.aten.resize_.default: +# # TODO: use self.assertIn when we have separate tests for each tag +# assert torch.Tag.inplace_view in func.tags + + +# # A mode that when enabled runs correctness checks to ensure +# # that operators have expected tags based on their input and +# # output tensor properties +# class TestTagsMode(TorchDispatchMode): +# def __torch_dispatch__(self, func, types, args=(), kwargs=None): +# if isinstance(args[0], torch.Tensor): +# old_size = args[0].size() +# old_stride = args[0].stride() +# rs = func(*args, **kwargs) +# check_inplace_view(func, args[0], rs, old_size, old_stride) +# else: +# rs = func(*args, **kwargs) +# return rs + + +# # Test to verify the correctness for tags in `tags.yaml`, also available for access through `torch.Tags` +# @unMarkDynamoStrictTest +# class TestTags(TestCase): +# @onlyCPU +# @ops(ops_and_refs, dtypes=OpDTypes.any_one) +# def test_tags(self, device, dtype, op): +# samples = op.sample_inputs(device, dtype, requires_grad=False) +# for sample in samples: +# # TODO: Test tags for ops that return a list of tensors +# input = sample.input +# if isinstance(input, torch.Tensor): +# old_size = input.size() +# old_stride = input.stride() +# with TestTagsMode(): +# rs = op(input, *sample.args, **sample.kwargs) +# # TODO: add test for aliases: https://github.com/pytorch/pytorch/issues/78761 +# aten_name = op.aten_name if op.aten_name is not None else op.name +# opoverloadpacket = getattr(torch.ops.aten, aten_name, None) +# check_inplace_view(opoverloadpacket, input, rs, old_size, old_stride) + + +# class TestSelfKwarg(TestCase): +# def test_self_kwargs(self): +# """Verify that we can call the aten ops with all kwargs even if the +# argument's name is "self" +# """ +# torch.ops.aten.reshape.default(self=torch.rand(1, 2), shape=[2]) +# torch.ops.aten.min.default(self=torch.rand(100)) + + +# @unMarkDynamoStrictTest +# class TestRefsOpsInfo(TestCase): +# import_paths = [ +# "_refs", +# "_refs.special", +# "_refs.nn.functional", +# "_refs.fft", +# "_refs._conversions", +# ] +# module_alls = [ +# (path, import_module(f"torch.{path}").__all__) for path in import_paths +# ] +# ref_ops_names = tuple( +# itertools.chain.from_iterable( +# [f"{path}.{op}" for op in module_all] for path, module_all in module_alls +# ) +# ) +# ref_db_names = {ref_op.name for ref_op in python_ref_db} + +# # TODO: References that do not have an entry in python_ref_db +# skip_ref_ops = { +# "_refs.alias", +# "_refs.bitwise_right_shift", +# "_refs.copy_to", +# "_refs.empty_permuted", +# "_refs.empty_strided", +# "_refs.equal", +# "_refs.full", +# "_refs.full_like", +# "_refs.is_complex", +# "_refs.to", +# "_refs.mvlgamma", +# "_refs.ones", +# "_refs.ones_like", +# "_refs.special.expit", +# "_refs.std_var", +# "_refs.swap_axes", +# "_refs.uniform", +# "_refs.scalar_tensor", +# "_refs.trunc_divide", +# "_refs.zero", +# "_refs.zeros", +# "_refs.zeros_like", +# "_refs.rfloordiv", +# "_refs.rtruediv", +# "_refs.rpow", +# # These should be tested with their out-of-place counterparts +# "_refs.index_add_", +# "_refs.index_copy_", +# "_refs.index_fill_", +# "_refs.native_group_norm", +# } + +# not_in_decomp_table = { +# # duplicated in _decomp and _refs +# "_refs.nn.functional.group_norm", +# "_refs.nn.functional.mse_loss", +# 
"_refs.floor_divide", +# # duplicated as refs do not have decent support for advanced indexing +# "_refs.index_copy", +# "_refs.index_copy_", +# "_refs.index_add", +# "_refs.index_add_", +# # these are not aten ops? +# "_refs._conversions.bfloat16", +# "_refs._conversions.bool", +# "_refs._conversions.byte", +# "_refs._conversions.char", +# "_refs._conversions.double", +# "_refs._conversions.float", +# "_refs._conversions.half", +# "_refs._conversions.int", +# "_refs._conversions.long", +# "_refs._conversions.short", +# "_refs._conversions.chalf", +# "_refs._conversions.cfloat", +# "_refs._conversions.cdouble", +# "_refs.broadcast_shapes", +# "_refs.broadcast_tensors", +# "_refs.mvlgamma", +# "_refs.nn.functional.layer_norm", +# "_refs.nn.functional.tanhshrink", +# "_refs.nn.functional.triplet_margin_loss", +# "_refs.rfloordiv", +# "_refs.rtruediv", +# "_refs.rpow", +# # CompositeImplicitAutograd +# "_refs.allclose", +# "_refs.atleast_1d", +# "_refs.atleast_2d", +# "_refs.atleast_3d", +# "_refs.broadcast_to", +# "_refs.chunk", +# "_refs.column_stack", +# "_refs.contiguous", +# "_refs.dsplit", +# "_refs.dstack", +# "_refs.fill", +# "_refs.fill_", +# "_refs.flatten", +# "_refs.fliplr", +# "_refs.flipud", +# "_refs.float_power", +# "_refs.hsplit", +# "_refs.hstack", +# "_refs.isclose", +# "_refs.isfinite", +# "_refs.isreal", +# "_refs.istft", +# "_refs.log_softmax", +# "_refs.movedim", +# "_refs.narrow", +# "_refs.nn.functional.dropout", +# "_refs.nn.functional.l1_loss", +# "_refs.nn.functional.smooth_l1_loss", +# "_refs.nn.functional.log_softmax", +# "_refs.nn.functional.poisson_nll_loss", +# "_refs.nn.functional.softmax", +# "_refs.nn.functional.softmin", +# "_refs.positive", +# "_refs.ravel", +# "_refs.reshape", +# "_refs.softmax", +# "_refs.special.expit", +# "_refs.special.log_softmax", +# "_refs.special.softmax", +# "_refs.square", +# "_refs.stft", +# "_refs.T", +# "_refs.take_along_dim", +# "_refs.tensor_split", +# "_refs.to", +# "_refs.true_divide", +# "_refs.trunc_divide", +# "_refs.vsplit", +# "_refs.vstack", +# "_refs.linalg.matrix_norm", +# "_refs.linalg.norm", +# "_refs.linalg.svd", +# "_refs.linalg.svdvals", +# "_refs.unflatten", +# "_refs.sum_to_size", +# # ref implementation missing kwargs +# "_refs.full_like", # missing "layout" +# "_refs.scalar_tensor", # missing "layout" +# # other +# "_refs.block_diag", # only refs._block_diag_iterable is in decomposition table +# "_refs.empty", # intentional; direct empty is faster and has less guards +# "_refs.empty_permuted", # intentional; direct empty is faster and has less guards +# "_refs.expand_as", +# "_refs.as_strided", # _prims._as_strided_meta: "reduce() of empty sequence with no initial value" +# "_refs.copy_to", # torch._C._jit_get_operation: No such operator aten::copy_to +# "_refs.equal", # 'bool' object has no attribute 'dtype' +# "_refs.conj", # Calls _prims.conj +# "_refs.real", +# "_refs.imag", +# "_refs.reshape_as", +# "_refs.view_as", +# "_refs.view_as_complex", # TorchInductor does not support complex at the moment. 
+# # the decompositions for these ops are slightly different +# # because of out handling +# "_refs.var_mean", +# "_refs.std_mean", +# "_refs.native_layer_norm", +# } + +# @parametrize("op", ref_ops_names) +# def test_refs_are_in_python_ref_db(self, op): +# inplace = op[-1] == "_" +# if op in self.skip_ref_ops: +# raise unittest.SkipTest(f"{op} does not have an entry in python_ref_db") +# elif inplace: +# self.assertNotIn( +# op, +# self.ref_db_names, +# msg=f"{op} is an in-place operation and should not have an OpInfo", +# ) +# else: +# # Intentionally don't use assertIn to avoid printing the +# # (very large) container +# self.assertTrue(op in self.ref_db_names, msg=f"{op} not in ref_db_names") + +# @parametrize("op", ref_ops_names) +# def test_refs_are_in_decomp_table(self, op): +# path = op.split(".") +# module_path = ".".join(path[:-1]) +# op_name = path[-1] +# op_impl = getattr(import_module(f"torch.{module_path}"), op_name) + +# if op in self.not_in_decomp_table: +# self.assertNotIn( +# op_impl, +# torch._decomp.decomposition_table.values(), +# f"Unexpectedly found {op} in torch._decomp.decomposition_table.values()", +# ) +# else: +# self.assertIn( +# op_impl, +# torch._decomp.decomposition_table.values(), +# f"Did not find {op} in torch._decomp.decomposition_table.values()", +# ) + + +# fake_skips = ( +# "aminmax", # failing input +# "cov", # aweights cannot be negtaive +# "istft", # window overlap add min: 0 +# "linalg.eigvals", # The tensor has a non-zero number of elements, but its data is not allocated yet +# "linalg.eigvalsh", # aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend +# "linalg.matrix_power", # Could not run 'aten::eye.m_out' with arguments from the 'Meta' backend +# # "linalg.pinv", # Could not run 'aten::pinv.out' with arguments from the 'Meta' backen +# "linalg.matrix_rank.hermitian", # Could not run 'aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend +# "linalg.pinv.hermitian", # tensor.mH is only supported on matrices or batches of matrices. 
Got 1-D tensor +# "linalg.solve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' backend +# "linalg.tensorsolve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' +# "lu_solve", # MALLOC ERROR: debug +# "multinomial", # Could not run 'aten::multinomial' with arguments from the 'Meta' backend +# "mvlgamma.mvlgamma_p_1", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend +# "mvlgamma.mvlgamma_p_3", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend +# "mvlgamma.mvlgamma_p_5", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend +# "nanmean", # logical_not() got an unexpected keyword argument 'out' +# "quantile", # quantile() q values must be in the range [0, 1] +# "nanquantile", # quantile() q values must be in the range [0, 1] +# "nn.functional.ctc_loss", # The tensor has a non-zero number of elements, but its data is not allocated yet +# "nn.functional.embedding_bag", # sometimes errors +# "nn.functional.nll_loss", # sometimes errors +# "nn.functional.max_pool1d", # The tensor has a non-zero number of elements +# "to_sparse", # Could not run 'aten::_to_sparse' with arguments from the 'Meta' backend +# "tensor_split", # The tensor has a non-zero number of elements, but its data is not allocated yet +# "repeat_interleave", # cannot repeat_interleave a meta tensor without output_size +# "sparse.sampled.addmm", # sparsity not supported +# # Can not infer total number of classes from meta. no way at present to throw DynamicOutputShapeException +# "nn.functional.one_hot", +# "narrow", # Fails only for one overload with DataDependentOutputException (hence skip). +# ) + +# fake_autocast_device_skips = defaultdict(dict) + +# # TODO: investigate/fix +# fake_autocast_device_skips["cpu"] = {"linalg.pinv"} + + +# dynamic_output_op_tests = ( +# "argwhere", +# "bincount", +# "combinations", +# "linalg.lstsq", +# "masked_select", +# "nonzero", +# "unique_consecutive", +# "unique", +# "linalg.lstsq.grad_oriented", +# ) + +# # Ops that have dynamic output shapes that we can handle when +# # allow_dynamic_shape_ops is True in fake tensor shape environment. 
+# supported_dynamic_output_op_tests = ( +# "nonzero", +# "unique", +# "repeat_interleave", +# "masked_select", +# ) + +# # some inputs invoke dynamic output shape operators, some do not +# sometimes_dynamic_output_op_test = ( +# "__getitem__", +# "index_select", +# ) + +# data_dependent_op_tests = ( +# "equal", +# "corrcoef", +# "nn.functional.gaussian_nll_loss", +# "allclose", +# ) + +# aliasing_failures = ("histogramdd",) + +# fake_backward_skips = { +# "linalg.cond", +# "linalg.matrix_norm", +# "linalg.norm", +# "linalg.svd", +# "linalg.svdvals", +# "pca_lowrank", +# "roll", +# "svd_lowrank", +# "sgn", +# } + +# fake_backward_xfails = {skip(s) for s in fake_backward_skips} | { +# xfail("fft.ihfftn"), # Mismatch in aten._conj_physical.default +# xfail("fft.ihfft2"), # Mismatch in aten._conj_physical.default +# skip("nn.functional.ctc_loss"), +# } + +# fake_autocast_backward_xfails = { +# skip("nn.functional.binary_cross_entropy"), +# skip("sparse.sampled_addmm"), +# skip("linalg.pinv"), +# skip("linalg.pinv", "hermitian"), +# skip("linalg.pinv", "singular"), +# skip("pinverse"), +# } + + +# @unMarkDynamoStrictTest +# class TestFakeTensor(TestCase): +# def setUp(self): +# # Turn on FakeTensor caching and cross-checking for these tests: +# cache_enabled = unittest.mock.patch( +# "torch._dynamo.config.fake_tensor_cache_enabled", True +# ) +# cache_enabled.start() +# self.addCleanup(cache_enabled.stop) + +# cache_crosscheck = unittest.mock.patch( +# "torch._dynamo.config.fake_tensor_cache_crosscheck_enabled", True +# ) +# cache_crosscheck.start() +# self.addCleanup(cache_crosscheck.stop) + +# def _test_fake_helper(self, device, dtype, op, context): +# name = op.name +# if op.variant_test_name: +# name += "." + op.variant_test_name +# if name in fake_skips or "sparse" in name or "jiterator" in name: +# self.skipTest("Skip failing test") + +# samples = op.sample_inputs(device, dtype, requires_grad=False) +# for sample in samples: +# mode = FakeTensorMode() + +# from torch.fx.experimental.symbolic_shapes import ShapeEnv + +# allow_dynamic_output_shape_shape_env = ShapeEnv( +# allow_dynamic_output_shape_ops=True +# ) + +# allow_dynamic_output_shape_mode = FakeTensorMode( +# shape_env=allow_dynamic_output_shape_shape_env +# ) + +# try: +# with context(): +# res = op(sample.input, *sample.args, **sample.kwargs) +# except Exception: +# continue + +# def run_with_fake_mode_and_verify(fake_mode, match_results=True): +# def map_to_fake(e): +# if isinstance(e, torch.Tensor): +# return fake_mode.from_tensor(e) +# else: +# return e + +# input = tree_map(map_to_fake, sample.input) +# args = tree_map(map_to_fake, sample.args) +# kwargs = tree_map(map_to_fake, sample.kwargs) + +# try: +# with context(): +# with fake_mode: +# res_fake = op(input, *args, **kwargs) + +# if not match_results: +# return + +# for fake_out, real_out in zip( +# pytree.tree_leaves(res_fake), pytree.tree_leaves(res) +# ): +# if not isinstance(fake_out, torch.Tensor): +# self.assertTrue(not isinstance(real_out, torch.Tensor)) +# self.assertEqual(fake_out, real_out) +# continue + +# self.assertTrue(isinstance(fake_out, FakeTensor)) +# # if you see a shape exception here, you may need to add +# # a `dynamic_output_shape` tag to an operator + +# # prims/decomps must correctly model strides, +# # see https://github.com/pytorch/pytorch/issues/78050#issuecomment-1253950325 +# prims.utils.compare_tensor_meta(fake_out, real_out, True) + +# if name not in aliasing_failures: +# fake_aliasing = outputs_alias_inputs( +# (input, args, kwargs), 
res_fake +# ) +# real_aliasing = outputs_alias_inputs( +# (sample.input, sample, args, sample.kwargs), res +# ) +# self.assertEqual(fake_aliasing, real_aliasing) + +# self.assertTrue( +# name not in dynamic_output_op_tests +# and name not in data_dependent_op_tests +# ) + +# except torch._subclasses.fake_tensor.UnsupportedFakeTensorException: +# pass +# except torch._subclasses.fake_tensor.UnsupportedOperatorException: +# pass +# except torch._subclasses.fake_tensor.DynamicOutputShapeException: +# self.assertTrue( +# name in dynamic_output_op_tests +# or name in sometimes_dynamic_output_op_test +# ) +# self.assertTrue( +# mode.shape_env is None +# or not mode.shape_env.allow_dynamic_output_shape_ops +# or name not in supported_dynamic_output_op_tests +# ) +# except torch._subclasses.fake_tensor.DataDependentOutputException: +# self.assertTrue(name in data_dependent_op_tests) + +# run_with_fake_mode_and_verify(mode) +# if name in supported_dynamic_output_op_tests: +# run_with_fake_mode_and_verify( +# allow_dynamic_output_shape_mode, match_results=False +# ) + +# @ops(op_db, dtypes=OpDTypes.any_one) +# def test_pointwise_ops(self, device, dtype, op): +# name = op.name +# if op.variant_test_name: +# name += "." + op.variant_test_name +# if name in fake_skips or "sparse" in name or "jiterator" in name: +# self.skipTest("Skip failing test") + +# test_self = self + +# class TestPointwiseMode(TorchDispatchMode): +# def __torch_dispatch__(self, func, types, args=(), kwargs=None): +# kwargs = kwargs or {} + +# out = func(*args, **kwargs) + +# if torch.Tag.pointwise in func.tags: +# shapes = [] +# for inp in pytree.arg_tree_leaves(*args, **kwargs): +# if isinstance(inp, torch.Tensor): +# shapes.append(inp.shape) + +# out_shape = torch._refs._broadcast_shapes(*shapes) + +# for out_elem in pytree.tree_leaves(out): +# if isinstance(out_elem, torch.Tensor): +# test_self.assertEqual(out_elem.shape, out_shape) + +# return out + +# samples = op.sample_inputs(device, dtype, requires_grad=False) +# for sample in samples: +# mode = FakeTensorMode() + +# def map_to_fake(e): +# if isinstance(e, torch.Tensor): +# return mode.from_tensor(e) +# else: +# return e + +# input = tree_map(map_to_fake, sample.input) +# args = tree_map(map_to_fake, sample.args) +# kwargs = tree_map(map_to_fake, sample.kwargs) + +# try: +# op(input, *args, **kwargs) +# except Exception as e: +# continue + +# with TestPointwiseMode(): +# with mode: +# op(input, *args, **kwargs) + +# @ops(op_db, dtypes=OpDTypes.any_one) +# def test_fake(self, device, dtype, op): +# self._test_fake_helper(device, dtype, op, contextlib.nullcontext) + +# @ops(op_db, dtypes=OpDTypes.any_one) +# def test_fake_autocast(self, device, dtype, op): +# if op.name in fake_autocast_device_skips[device]: +# self.skipTest("Skip failing test") +# context = ( +# torch.cuda.amp.autocast if device == "cuda" else torch.cpu.amp.autocast +# ) +# self._test_fake_helper(device, dtype, op, context) + +# def _test_fake_crossref_helper(self, device, dtype, op, context): +# samples = op.sample_inputs(device, dtype, requires_grad=True) + +# for iter, sample in enumerate(samples): +# args = [sample.input] + list(sample.args) +# kwargs = sample.kwargs + +# # skip these to speed up tests +# common_skip_ops = ( +# aten.detach.default, +# aten.empty_strided.default, +# aten.copy_.default, +# aten.is_same_size.default, +# ) + +# # TODO: enable check_aliasing, batch norm fails +# try: +# with torch._subclasses.CrossRefFakeMode( +# ignore_op_fn=lambda fn: fn in common_skip_ops, 
check_aliasing=True +# ): +# with warnings.catch_warnings(), context(), torch.autograd.set_multithreading_enabled( +# False +# ): +# composite_compliance.compute_expected_grads( +# op.get_op(), +# args, +# kwargs, +# sample.output_process_fn_grad, +# op.gradcheck_wrapper, +# ) +# except torch._subclasses.fake_tensor.UnsupportedOperatorException: +# pass + +# @onlyCUDA +# @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) +# @skipOps( +# "TestFakeTensor", "test_fake_crossref_backward_no_amp", fake_backward_xfails +# ) +# def test_fake_crossref_backward_no_amp(self, device, dtype, op): +# self._test_fake_crossref_helper(device, dtype, op, contextlib.nullcontext) + +# @onlyCUDA +# @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) +# @skipOps( +# "TestFakeTensor", +# "test_fake_crossref_backward_amp", +# fake_backward_xfails | fake_autocast_backward_xfails, +# ) +# def test_fake_crossref_backward_amp(self, device, dtype, op): +# self._test_fake_crossref_helper(device, dtype, op, torch.cuda.amp.autocast) + +# @ops([op for op in ops_and_refs if op.is_factory_function]) +# def test_strided_layout(self, device, dtype, op): +# samples = op.sample_inputs(device, dtype) +# for sample in samples: +# kwargs = sample.kwargs.copy() +# kwargs["layout"] = torch.strided +# strided_result = op(sample.input, *sample.args, **kwargs) +# self.assertEqual(strided_result.layout, torch.strided) + + +#instantiate_device_type_tests(TestCommon, globals(), only_for="xpu") +instantiate_device_type_tests(TestCompositeCompliance, globals(), only_for="xpu") +#instantiate_device_type_tests(TestMathBits, globals()) +#instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") +#instantiate_device_type_tests(TestFakeTensor, globals()) +#instantiate_device_type_tests(TestTags, globals()) if __name__ == "__main__": TestCase._default_dtype_check_enabled = True + #import pdb + #pdb.set_trace() run_tests() diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8c2a72390aca9..8d93fcc590b2c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -10807,6 +10807,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_rhs_python_scalar=False, supports_fwgrad_bwgrad=True, rhs_make_tensor_kwargs=dict(exclude_zero=False), + skipXPU=False, skips=( # RuntimeError: "max_elementwise_cuda" not implemented for 'ComplexFloat' DecorateInfo(unittest.expectedFailure, @@ -10825,6 +10826,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_rhs_python_scalar=False, supports_fwgrad_bwgrad=True, rhs_make_tensor_kwargs=dict(exclude_zero=False), + skipXPU=False, skips=( # RuntimeError: "min_elementwise_cuda" not implemented for 'ComplexFloat' DecorateInfo(unittest.expectedFailure, @@ -10848,7 +10850,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_sparse_csr_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_csr), sample_inputs_sparse_csc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_csc), sample_inputs_sparse_bsr_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsr), - sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsc)), + sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsc), + skipXPU=False), + BinaryUfuncInfo('sub', # NumPy has no builtin reference for the alpha kwarg, but 
it is easy enough to emulate ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)), @@ -10875,6 +10879,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): toleranceOverride({torch.chalf: tol(atol=5e-3, rtol=0)}), 'TestDecomp', 'test_quick', device_type='cpu'), ), + skipXPU=False, skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', @@ -11326,7 +11331,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.bitwise_not, dtypes=integral_types_and(torch.bool), operator_variant=operator.invert, - supports_autograd=False), + supports_autograd=False, + skipXPU=False), BinaryUfuncInfo('bitwise_left_shift', op=torch.bitwise_left_shift, dtypes=integral_types(), @@ -11456,6 +11462,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_out=False, + skipXPU=False, skips=( # TypeError: _copy_dispatcher() got an unexpected keyword argument 'memory_format' # (NumPy reference needs to be extended with memory_format) @@ -11497,6 +11504,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): assert_autodiffed=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, + skipXPU=False, skips=( # NNC appear to not handle boolean clamp DecorateInfo(unittest.expectedFailure, @@ -11567,6 +11575,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_view_as_real, test_conjugated_samples=False, + skipXPU=False, ), OpInfo('view_as_complex', dtypes=floating_types_and(torch.half), @@ -11575,6 +11584,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, test_neg_view=False, sample_inputs_func=sample_inputs_view_as_complex, + skipXPU=False, skips=( # RuntimeError: Tensor must have a last dimension with stride 1 DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"), @@ -11621,12 +11631,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.cos, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfXPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), assert_autodiffed=True, handles_large_floats=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, promotes_int_to_float=True, decorators=(precisionOverride({torch.bfloat16: 1e-2}),), + skipXPU=False, skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', dtypes=(torch.cfloat, torch.cdouble,), device_type='cpu', active_if=IS_WINDOWS), @@ -11715,6 +11727,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, + skipXPU=False, skips=( # cumsum does not handle correctly out= dtypes DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'), @@ -11783,6 +11796,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): variant_test_name='no_rounding_mode', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, supports_forward_ad=True, @@ -11790,7 +11804,9 @@ def 
reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, supports_two_python_scalars=True, assert_autodiffed=True, - rhs_make_tensor_kwargs=dict(exclude_zero=True),), + rhs_make_tensor_kwargs=dict(exclude_zero=True), + skipXPU=False,), + BinaryUfuncInfo('div', aliases=('divide',), variant_test_name='trunc_rounding', @@ -11807,6 +11823,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # See https://github.com/pytorch/pytorch/issues/111126 DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion'), ), + skipXPU=False, skips=( # RuntimeError: MALFORMED INPUT: Unhandled node kind (in computeValue): aten::div DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_working'), @@ -11827,6 +11844,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # See https://github.com/pytorch/pytorch/issues/111126 DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion'), ), + skipXPU=False, skips=( # RuntimeError: MALFORMED INPUT: Unhandled node kind (in computeValue): aten::div DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_working'), @@ -11930,6 +11948,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): always_returns_bool=True, supports_autograd=False, sample_inputs_func=sample_inputs_comparison_ops, + skipXPU=False, skips=( )), BinaryUfuncInfo('fmax', @@ -11956,12 +11975,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.fmod, dtypes=all_types_and(torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfXPU=all_types_and(torch.float16, torch.bfloat16), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, assert_autodiffed=None, rhs_make_tensor_kwargs={'exclude_zero': True}, + skipXPU=False, decorators=( DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', 'test_contig_vs_every_other', @@ -13008,6 +13029,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_operator_variant=operator.iand, supports_autograd=False, supports_one_python_scalar=True, + skipXPU=False, skips=( # RuntimeError: "bitwise_and_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', @@ -13020,6 +13042,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_operator_variant=operator.ior, supports_autograd=False, supports_one_python_scalar=True, + skipXPU=False, skips=( # TODO: FIXME: RuntimeError: "bitwise_or_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, @@ -13034,6 +13057,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_operator_variant=operator.ixor, supports_autograd=False, supports_one_python_scalar=True, + skipXPU=False, skips=( # TODO: FIXME: RuntimeError: "bitwise_xor_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, @@ -13070,6 +13094,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=integral_types_and(), supports_autograd=False, supports_rhs_python_scalar=False, + skipXPU=False, skips=( DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', @@ -15384,6 +15409,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, autodiff_nonfusible_nodes=["aten::gelu"], + skipXPU=False, skips=( # AssertionError: Tensor-likes are not close! 
# May not replicate in CI @@ -17231,6 +17257,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_sparse_csc_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_csc), sample_inputs_sparse_bsr_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_bsr), sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_bsc), + skipXPU=False, skips=( )), OpInfo('ones_like', @@ -17397,6 +17424,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), supports_out=True, sample_inputs_func=sample_inputs_ones_zeros, + skipXPU=False, skips=( # Tests that assume input is a tensor or sequence of tensors DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'), @@ -17539,6 +17567,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_empty, supports_autograd=False, + skipXPU=False, skips=( DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), # Empty tensor data is garbage so it's hard to make comparisons with it. @@ -17761,6 +17790,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_bernoulli, error_inputs_func=error_inputs_bernoulli, + skipXPU=False, skips=( # vmap: We do not yet support calling random operations inside of vmap DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'), @@ -18065,6 +18095,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): check_batched_forward_grad=False, dtypes=all_types_and_complex_and(torch.complex32, torch.bool, torch.float16, torch.bfloat16), supports_out=False, + skipXPU=False, skips=( # JIT has issue when op is passed as lambda # AssertionError: JIT Test does not execute any logic @@ -18082,6 +18113,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_out=False, supports_autograd=False, + skipXPU=False, skips=( # Cannot resize variables that require grad DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_dtypes'), @@ -18096,6 +18128,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_out=False, supports_autograd=False, + skipXPU=False, skips=( # Cannot resize variables that require grad DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_dtypes'), diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index b56d44c3904bb..6e6e1c596fd48 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -389,8 +389,7 @@ def composite_fn(test, generic_cls, device_cls, old_parametrize_fn=old_parametrize_fn, new_parametrize_fn=new_parametrize_fn): old_tests = list(old_parametrize_fn(test, generic_cls, device_cls)) - import pdb - pdb.set_trace() + for (old_test, old_test_name, old_param_kwargs, old_dec_fn) in old_tests: for (new_test, new_test_name, new_param_kwargs, new_dec_fn) in \ new_parametrize_fn(old_test, generic_cls, device_cls): @@ -405,8 +404,7 @@ def composite_fn(test, generic_cls, device_cls, old_test_name) def merged_decorator_fn(param_kwargs, old_dec_fn=old_dec_fn, new_dec_fn=new_dec_fn): - import pdb - pdb.set_trace() + 
return list(old_dec_fn(param_kwargs)) + list(new_dec_fn(param_kwargs)) yield (new_test, merged_test_name, full_param_kwargs, merged_decorator_fn) diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 02e430ed616c5..6ae58240ea75a 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -96,26 +96,20 @@ def __init__( self.dtypes = dtypes self.active_if = active_if - print("init decorators: {} {} {} {} {}".format(self.cls_name, self.test_name, self.device_type, self.dtypes, self.active_if)) - + # Validate dtypes if self.dtypes is not None: for dtype in self.dtypes: assert isinstance(dtype, torch.dtype) def is_active(self, cls_name, test_name, device_type, dtype, param_kwargs): - print("is_active: {} {} {} {} {} {}".format(self.decorators, self.active_if, - (self.cls_name is None or self.cls_name == cls_name), - (self.test_name is None or self.test_name == test_name), - (self.device_type is None or self.device_type == device_type), - (self.dtypes is None or dtype in self.dtypes))) - print("is_active details: {} {} {} {}".format(self.cls_name, cls_name, self.test_name, test_name)) + return ( self.active_if and (self.cls_name is None or self.cls_name == cls_name) and (self.test_name is None or self.test_name == test_name) and (self.device_type is None or self.device_type == device_type) - and (self.dtypes is None or G in self.dtypes) + and (self.dtypes is None or dtype in self.dtypes) # Support callables over kwargs to determine if the decorator is active. and ( self.active_if(param_kwargs) From f5cbd50068fae398c01c8e200e560118118fdbb7 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 12 May 2024 22:28:04 -0700 Subject: [PATCH 03/37] clean up code --- test/test_ops.py | 1699 ++++++++--------- .../_internal/common_methods_invocations.py | 1 - torch/testing/_internal/opinfo/core.py | 7 +- 3 files changed, 851 insertions(+), 856 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 4b665336c1a50..5a8d9bc461f8d 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -136,11 +136,11 @@ def reduction_dtype_filter(op): aten = torch.ops.aten -_xpu_computation_op_list = ["_refs.abs", "_refs.all", "item", "abs", "add", "_refs.fill"] -_xpu_computation_op_list = ["abs"] -_xpu_computation_ops = [ - op for op in ops_and_refs if op.name in _xpu_computation_op_list -] +# _xpu_computation_op_list = ["_refs.abs", "_refs.all", "item", "abs", "add", "_refs.fill"] +# _xpu_computation_op_list = ["abs"] +# _xpu_computation_ops = [ +# op for op in ops_and_refs if op.name in _xpu_computation_op_list +# ] # Tests that apply to all operators and aren't related to any particular # system @@ -309,8 +309,7 @@ def to_cpu(arg): return arg.to(device="cpu") return arg - #import pdb - #pdb.set_trace() + samples = op.reference_inputs(device, dtype) for sample in samples: @@ -1840,848 +1839,848 @@ def check_cow_input( allow_list=op.allow_cow_input_materialize_backward, ) -# @ops(op_db, allowed_dtypes=(torch.float,)) -# def test_view_replay(self, device, dtype, op): -# def _assert_match_metadata(a, b): -# self.assertEqual(a.size(), b.size()) -# self.assertEqual(a.stride(), b.stride()) -# self.assertEqual(a.storage_offset(), b.storage_offset()) -# self.assertEqual(a.device, b.device) -# self.assertEqual(a.dtype, b.dtype) - -# # ensure view replay is enabled -# with torch.autograd._force_original_view_tracking(True): -# for sample in op.sample_inputs(device, dtype, requires_grad=False): -# inp = sample.input -# outs = op(inp, 
*sample.args, **sample.kwargs) -# if not isinstance(outs, (tuple, List)): -# outs = [outs] - -# # for all outputs that are views of the input, we should be able to replay the -# # forward and reverse views via a functioning view_func() / rev_view_func(). -# for out in outs: -# if not ( -# isinstance(out, torch.Tensor) -# and out._is_view() -# and out._base is inp -# ): -# continue - -# # forward view_func -# new_inp = inp.clone() -# _assert_match_metadata(new_inp, inp) -# new_out = out._view_func_unsafe(new_inp) -# _assert_match_metadata(new_out, out) -# self.assertEqual(new_out, out) - -# # reverse view_func -# new_out = out.detach() -# new_inp = out._rev_view_func_unsafe(new_out) -# _assert_match_metadata(new_inp, inp) -# self.assertTrue(new_inp._is_view()) -# self.assertTrue(new_inp._base is new_out) - - -# @unMarkDynamoStrictTest -# class TestMathBits(TestCase): -# # Tests that -# # 1. The operator's output for physically conjugated/negated tensors and conjugate/negative view tensors -# # produces the same value -# # 2. The gradients are same in both cases mentioned in (1) -# # 3. If the operator's inplace variant is supported, tests that the inplace operation -# # produces the correct value when called on a conjugate/negative view tensor and that the output -# # has its conj/neg bit set to true -# # This test only runs for C -> R and C -> C functions -# # TODO: add tests for `R->C` functions -# # Note: This test runs for functions that take both tensors and tensorlists as input. -# def _test_math_view( -# self, -# device, -# dtype, -# op, -# samples, -# math_op_physical, -# math_op_view, -# is_bit_set, -# out_type, -# ): -# inplace_variant = op.inplace_variant - -# # helper function to clone and conjugate/negate the input if its a tensor -# # else clone the sequence and conjugate/negate the first element in the sequence -# # If a requires_grad argument is provided the tensor being conjugated/negated will -# # have its requires_grad set to that value. -# def clone_and_perform_view(input, **kwargs): -# if isinstance(input, torch.Tensor): -# requires_grad = kwargs.get("requires_grad", input.requires_grad) -# with torch.no_grad(): -# # Ensure view represents the original sample input -# input = math_op_physical(input) -# # Note: .conj() is not called under no_grad mode since it's not allowed to modify a -# # view created in no_grad mode. Here it's ok to do so, so as a workaround we call conj -# # before resetting the requires_grad field for input -# input = math_op_view(input) -# assert input.is_leaf -# return input.requires_grad_(requires_grad) - -# if isinstance(input, Sequence): -# out = list(map(clone_input_helper, input)) -# out[0] = clone_and_perform_view(out[0]) -# return tuple(out) - -# for sample in samples: -# tensor = ( -# sample.input -# if isinstance(sample.input, torch.Tensor) -# else sample.input[0] -# ) -# cloned1 = clone_and_perform_view(sample.input) - -# # Computes function forward value with a physically conjugated/negated tensor and -# # a conj/neg view tensor and verifies that the output in both case are equal. 
-# expected_forward = op(sample.input, *sample.args, **sample.kwargs) -# forward_with_mathview = op(cloned1, *sample.args, **sample.kwargs) -# self.assertEqual(expected_forward, forward_with_mathview) - -# # If the op has an inplace variant, and the input doesn't require broadcasting -# # and has the same dtype as output, verify that the inplace operation on a conjugated/negated -# # input produces correct output, and the output tensor has the conj/neg bit set to True -# if inplace_variant is not None and not sample.broadcasts_input: -# cloned2 = clone_and_perform_view(tensor, requires_grad=False) -# if ( -# isinstance(expected_forward, torch.Tensor) -# and expected_forward.dtype is tensor.dtype -# ): -# inplace_forward = inplace_variant( -# cloned2, *sample.args, **sample.kwargs -# ) -# self.assertTrue(is_bit_set(inplace_forward)) -# self.assertEqual(inplace_forward, expected_forward) - -# # TODO: backward consistency only supported for single tensor outputs -# # TODO: backward consistency only checked on sample.input, not all -# # tensor inputs -# # TODO: update to handle checking grads of all tensor inputs as -# # derived from each tensor output -# if ( -# isinstance(expected_forward, torch.Tensor) -# and expected_forward.requires_grad -# ): -# output_process_fn_grad = sample.output_process_fn_grad or (lambda x: x) -# expected_forward = output_process_fn_grad(expected_forward) -# forward_with_mathview = output_process_fn_grad(forward_with_mathview) - -# tensor = ( -# sample.input -# if isinstance(sample.input, torch.Tensor) -# else sample.input[0] -# ) -# expected_forward.sum().abs().backward(retain_graph=True) -# forward_with_mathview.sum().abs().backward(retain_graph=True) -# if tensor.grad is not None: -# cloned1_tensor = ( -# cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] -# ) -# self.assertEqual(tensor.grad, cloned1_tensor.grad) - -# tensor.grad, cloned1_tensor.grad = None, None - -# # a repeat of the above test if output is not complex valued -# if out_type(expected_forward): -# grad = torch.randn_like(expected_forward) -# expected_forward.backward(grad) -# forward_with_mathview.backward( -# math_op_view(math_op_physical(grad)) -# ) - -# self.assertEqual(tensor.grad, cloned1_tensor.grad) - -# @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) -# def test_conj_view(self, device, dtype, op): -# if not op.test_conjugated_samples: -# self.skipTest("Operation doesn't support conjugated inputs.") -# math_op_physical = torch.conj_physical -# math_op_view = torch.conj -# _requires_grad = torch.cfloat in op.supported_backward_dtypes( -# torch.device(device).type -# ) -# is_bit_set = torch.is_conj -# samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) -# self._test_math_view( -# device, -# dtype, -# op, -# samples, -# math_op_physical, -# math_op_view, -# is_bit_set, -# torch.is_complex, -# ) - -# @ops(ops_and_refs, allowed_dtypes=(torch.double,)) -# def test_neg_view(self, device, dtype, op): -# if not op.test_neg_view: -# self.skipTest("Operation not tested with tensors with negative bit.") -# math_op_physical = torch.neg -# math_op_view = torch._neg_view -# is_bit_set = torch.is_neg -# samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) -# self._test_math_view( -# device, -# dtype, -# op, -# samples, -# math_op_physical, -# math_op_view, -# is_bit_set, -# lambda x: True, -# ) - -# @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) -# def test_neg_conj_view(self, device, dtype, op): -# if not op.test_neg_view: -# 
self.skipTest("Operation not tested with tensors with negative bit.") -# if not op.test_conjugated_samples: -# self.skipTest("Operation doesn't support conjugated inputs.") - -# def math_op_physical(x): -# return -x.conj_physical() - -# def math_op_view(x): -# return torch._neg_view(x).conj() - -# def is_bit_set(x): -# return torch.is_neg(x) and torch.is_conj(x) - -# _requires_grad = dtype in op.supported_backward_dtypes( -# torch.device(device).type -# ) -# samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) -# # Only test one sample -# samples = itertools.islice(samples, 1) -# self._test_math_view( -# device, -# dtype, -# op, -# samples, -# math_op_physical, -# math_op_view, -# is_bit_set, -# torch.is_complex, -# ) - - -# # input strides and size may have been altered due to the result of an inplace op -# def check_inplace_view(func, input, rs, input_size, input_strides): -# if func is None: -# return -# # TODO: extend this test to test ops with multiple outputs and ops like native_batch_norm(_legit).out -# # which mutate not necessarily the first input. -# if isinstance(rs, torch.Tensor) and rs is input: -# unequal_size = rs.size() != input_size -# unequal_strides = rs.stride() != input_strides -# # resize_ should probably have inplace_view tag. Not adding the tag since it -# # breaks some codegen logic -# if unequal_size or unequal_strides: -# if isinstance(func, torch._ops.OpOverloadPacket): -# func = func.default -# # Reference: https://github.com/pytorch/pytorch/issues/78759 -# if func is not torch.ops.aten.resize_.default: -# # TODO: use self.assertIn when we have separate tests for each tag -# assert torch.Tag.inplace_view in func.tags - - -# # A mode that when enabled runs correctness checks to ensure -# # that operators have expected tags based on their input and -# # output tensor properties -# class TestTagsMode(TorchDispatchMode): -# def __torch_dispatch__(self, func, types, args=(), kwargs=None): -# if isinstance(args[0], torch.Tensor): -# old_size = args[0].size() -# old_stride = args[0].stride() -# rs = func(*args, **kwargs) -# check_inplace_view(func, args[0], rs, old_size, old_stride) -# else: -# rs = func(*args, **kwargs) -# return rs - - -# # Test to verify the correctness for tags in `tags.yaml`, also available for access through `torch.Tags` -# @unMarkDynamoStrictTest -# class TestTags(TestCase): -# @onlyCPU -# @ops(ops_and_refs, dtypes=OpDTypes.any_one) -# def test_tags(self, device, dtype, op): -# samples = op.sample_inputs(device, dtype, requires_grad=False) -# for sample in samples: -# # TODO: Test tags for ops that return a list of tensors -# input = sample.input -# if isinstance(input, torch.Tensor): -# old_size = input.size() -# old_stride = input.stride() -# with TestTagsMode(): -# rs = op(input, *sample.args, **sample.kwargs) -# # TODO: add test for aliases: https://github.com/pytorch/pytorch/issues/78761 -# aten_name = op.aten_name if op.aten_name is not None else op.name -# opoverloadpacket = getattr(torch.ops.aten, aten_name, None) -# check_inplace_view(opoverloadpacket, input, rs, old_size, old_stride) - - -# class TestSelfKwarg(TestCase): -# def test_self_kwargs(self): -# """Verify that we can call the aten ops with all kwargs even if the -# argument's name is "self" -# """ -# torch.ops.aten.reshape.default(self=torch.rand(1, 2), shape=[2]) -# torch.ops.aten.min.default(self=torch.rand(100)) - - -# @unMarkDynamoStrictTest -# class TestRefsOpsInfo(TestCase): -# import_paths = [ -# "_refs", -# "_refs.special", -# "_refs.nn.functional", 
-# "_refs.fft", -# "_refs._conversions", -# ] -# module_alls = [ -# (path, import_module(f"torch.{path}").__all__) for path in import_paths -# ] -# ref_ops_names = tuple( -# itertools.chain.from_iterable( -# [f"{path}.{op}" for op in module_all] for path, module_all in module_alls -# ) -# ) -# ref_db_names = {ref_op.name for ref_op in python_ref_db} - -# # TODO: References that do not have an entry in python_ref_db -# skip_ref_ops = { -# "_refs.alias", -# "_refs.bitwise_right_shift", -# "_refs.copy_to", -# "_refs.empty_permuted", -# "_refs.empty_strided", -# "_refs.equal", -# "_refs.full", -# "_refs.full_like", -# "_refs.is_complex", -# "_refs.to", -# "_refs.mvlgamma", -# "_refs.ones", -# "_refs.ones_like", -# "_refs.special.expit", -# "_refs.std_var", -# "_refs.swap_axes", -# "_refs.uniform", -# "_refs.scalar_tensor", -# "_refs.trunc_divide", -# "_refs.zero", -# "_refs.zeros", -# "_refs.zeros_like", -# "_refs.rfloordiv", -# "_refs.rtruediv", -# "_refs.rpow", -# # These should be tested with their out-of-place counterparts -# "_refs.index_add_", -# "_refs.index_copy_", -# "_refs.index_fill_", -# "_refs.native_group_norm", -# } - -# not_in_decomp_table = { -# # duplicated in _decomp and _refs -# "_refs.nn.functional.group_norm", -# "_refs.nn.functional.mse_loss", -# "_refs.floor_divide", -# # duplicated as refs do not have decent support for advanced indexing -# "_refs.index_copy", -# "_refs.index_copy_", -# "_refs.index_add", -# "_refs.index_add_", -# # these are not aten ops? -# "_refs._conversions.bfloat16", -# "_refs._conversions.bool", -# "_refs._conversions.byte", -# "_refs._conversions.char", -# "_refs._conversions.double", -# "_refs._conversions.float", -# "_refs._conversions.half", -# "_refs._conversions.int", -# "_refs._conversions.long", -# "_refs._conversions.short", -# "_refs._conversions.chalf", -# "_refs._conversions.cfloat", -# "_refs._conversions.cdouble", -# "_refs.broadcast_shapes", -# "_refs.broadcast_tensors", -# "_refs.mvlgamma", -# "_refs.nn.functional.layer_norm", -# "_refs.nn.functional.tanhshrink", -# "_refs.nn.functional.triplet_margin_loss", -# "_refs.rfloordiv", -# "_refs.rtruediv", -# "_refs.rpow", -# # CompositeImplicitAutograd -# "_refs.allclose", -# "_refs.atleast_1d", -# "_refs.atleast_2d", -# "_refs.atleast_3d", -# "_refs.broadcast_to", -# "_refs.chunk", -# "_refs.column_stack", -# "_refs.contiguous", -# "_refs.dsplit", -# "_refs.dstack", -# "_refs.fill", -# "_refs.fill_", -# "_refs.flatten", -# "_refs.fliplr", -# "_refs.flipud", -# "_refs.float_power", -# "_refs.hsplit", -# "_refs.hstack", -# "_refs.isclose", -# "_refs.isfinite", -# "_refs.isreal", -# "_refs.istft", -# "_refs.log_softmax", -# "_refs.movedim", -# "_refs.narrow", -# "_refs.nn.functional.dropout", -# "_refs.nn.functional.l1_loss", -# "_refs.nn.functional.smooth_l1_loss", -# "_refs.nn.functional.log_softmax", -# "_refs.nn.functional.poisson_nll_loss", -# "_refs.nn.functional.softmax", -# "_refs.nn.functional.softmin", -# "_refs.positive", -# "_refs.ravel", -# "_refs.reshape", -# "_refs.softmax", -# "_refs.special.expit", -# "_refs.special.log_softmax", -# "_refs.special.softmax", -# "_refs.square", -# "_refs.stft", -# "_refs.T", -# "_refs.take_along_dim", -# "_refs.tensor_split", -# "_refs.to", -# "_refs.true_divide", -# "_refs.trunc_divide", -# "_refs.vsplit", -# "_refs.vstack", -# "_refs.linalg.matrix_norm", -# "_refs.linalg.norm", -# "_refs.linalg.svd", -# "_refs.linalg.svdvals", -# "_refs.unflatten", -# "_refs.sum_to_size", -# # ref implementation missing kwargs -# "_refs.full_like", # 
missing "layout" -# "_refs.scalar_tensor", # missing "layout" -# # other -# "_refs.block_diag", # only refs._block_diag_iterable is in decomposition table -# "_refs.empty", # intentional; direct empty is faster and has less guards -# "_refs.empty_permuted", # intentional; direct empty is faster and has less guards -# "_refs.expand_as", -# "_refs.as_strided", # _prims._as_strided_meta: "reduce() of empty sequence with no initial value" -# "_refs.copy_to", # torch._C._jit_get_operation: No such operator aten::copy_to -# "_refs.equal", # 'bool' object has no attribute 'dtype' -# "_refs.conj", # Calls _prims.conj -# "_refs.real", -# "_refs.imag", -# "_refs.reshape_as", -# "_refs.view_as", -# "_refs.view_as_complex", # TorchInductor does not support complex at the moment. -# # the decompositions for these ops are slightly different -# # because of out handling -# "_refs.var_mean", -# "_refs.std_mean", -# "_refs.native_layer_norm", -# } - -# @parametrize("op", ref_ops_names) -# def test_refs_are_in_python_ref_db(self, op): -# inplace = op[-1] == "_" -# if op in self.skip_ref_ops: -# raise unittest.SkipTest(f"{op} does not have an entry in python_ref_db") -# elif inplace: -# self.assertNotIn( -# op, -# self.ref_db_names, -# msg=f"{op} is an in-place operation and should not have an OpInfo", -# ) -# else: -# # Intentionally don't use assertIn to avoid printing the -# # (very large) container -# self.assertTrue(op in self.ref_db_names, msg=f"{op} not in ref_db_names") - -# @parametrize("op", ref_ops_names) -# def test_refs_are_in_decomp_table(self, op): -# path = op.split(".") -# module_path = ".".join(path[:-1]) -# op_name = path[-1] -# op_impl = getattr(import_module(f"torch.{module_path}"), op_name) - -# if op in self.not_in_decomp_table: -# self.assertNotIn( -# op_impl, -# torch._decomp.decomposition_table.values(), -# f"Unexpectedly found {op} in torch._decomp.decomposition_table.values()", -# ) -# else: -# self.assertIn( -# op_impl, -# torch._decomp.decomposition_table.values(), -# f"Did not find {op} in torch._decomp.decomposition_table.values()", -# ) - - -# fake_skips = ( -# "aminmax", # failing input -# "cov", # aweights cannot be negtaive -# "istft", # window overlap add min: 0 -# "linalg.eigvals", # The tensor has a non-zero number of elements, but its data is not allocated yet -# "linalg.eigvalsh", # aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend -# "linalg.matrix_power", # Could not run 'aten::eye.m_out' with arguments from the 'Meta' backend -# # "linalg.pinv", # Could not run 'aten::pinv.out' with arguments from the 'Meta' backen -# "linalg.matrix_rank.hermitian", # Could not run 'aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend -# "linalg.pinv.hermitian", # tensor.mH is only supported on matrices or batches of matrices. 
Got 1-D tensor -# "linalg.solve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' backend -# "linalg.tensorsolve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' -# "lu_solve", # MALLOC ERROR: debug -# "multinomial", # Could not run 'aten::multinomial' with arguments from the 'Meta' backend -# "mvlgamma.mvlgamma_p_1", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend -# "mvlgamma.mvlgamma_p_3", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend -# "mvlgamma.mvlgamma_p_5", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend -# "nanmean", # logical_not() got an unexpected keyword argument 'out' -# "quantile", # quantile() q values must be in the range [0, 1] -# "nanquantile", # quantile() q values must be in the range [0, 1] -# "nn.functional.ctc_loss", # The tensor has a non-zero number of elements, but its data is not allocated yet -# "nn.functional.embedding_bag", # sometimes errors -# "nn.functional.nll_loss", # sometimes errors -# "nn.functional.max_pool1d", # The tensor has a non-zero number of elements -# "to_sparse", # Could not run 'aten::_to_sparse' with arguments from the 'Meta' backend -# "tensor_split", # The tensor has a non-zero number of elements, but its data is not allocated yet -# "repeat_interleave", # cannot repeat_interleave a meta tensor without output_size -# "sparse.sampled.addmm", # sparsity not supported -# # Can not infer total number of classes from meta. no way at present to throw DynamicOutputShapeException -# "nn.functional.one_hot", -# "narrow", # Fails only for one overload with DataDependentOutputException (hence skip). -# ) - -# fake_autocast_device_skips = defaultdict(dict) - -# # TODO: investigate/fix -# fake_autocast_device_skips["cpu"] = {"linalg.pinv"} - - -# dynamic_output_op_tests = ( -# "argwhere", -# "bincount", -# "combinations", -# "linalg.lstsq", -# "masked_select", -# "nonzero", -# "unique_consecutive", -# "unique", -# "linalg.lstsq.grad_oriented", -# ) - -# # Ops that have dynamic output shapes that we can handle when -# # allow_dynamic_shape_ops is True in fake tensor shape environment. 
-# supported_dynamic_output_op_tests = ( -# "nonzero", -# "unique", -# "repeat_interleave", -# "masked_select", -# ) - -# # some inputs invoke dynamic output shape operators, some do not -# sometimes_dynamic_output_op_test = ( -# "__getitem__", -# "index_select", -# ) - -# data_dependent_op_tests = ( -# "equal", -# "corrcoef", -# "nn.functional.gaussian_nll_loss", -# "allclose", -# ) - -# aliasing_failures = ("histogramdd",) - -# fake_backward_skips = { -# "linalg.cond", -# "linalg.matrix_norm", -# "linalg.norm", -# "linalg.svd", -# "linalg.svdvals", -# "pca_lowrank", -# "roll", -# "svd_lowrank", -# "sgn", -# } - -# fake_backward_xfails = {skip(s) for s in fake_backward_skips} | { -# xfail("fft.ihfftn"), # Mismatch in aten._conj_physical.default -# xfail("fft.ihfft2"), # Mismatch in aten._conj_physical.default -# skip("nn.functional.ctc_loss"), -# } - -# fake_autocast_backward_xfails = { -# skip("nn.functional.binary_cross_entropy"), -# skip("sparse.sampled_addmm"), -# skip("linalg.pinv"), -# skip("linalg.pinv", "hermitian"), -# skip("linalg.pinv", "singular"), -# skip("pinverse"), -# } - - -# @unMarkDynamoStrictTest -# class TestFakeTensor(TestCase): -# def setUp(self): -# # Turn on FakeTensor caching and cross-checking for these tests: -# cache_enabled = unittest.mock.patch( -# "torch._dynamo.config.fake_tensor_cache_enabled", True -# ) -# cache_enabled.start() -# self.addCleanup(cache_enabled.stop) - -# cache_crosscheck = unittest.mock.patch( -# "torch._dynamo.config.fake_tensor_cache_crosscheck_enabled", True -# ) -# cache_crosscheck.start() -# self.addCleanup(cache_crosscheck.stop) - -# def _test_fake_helper(self, device, dtype, op, context): -# name = op.name -# if op.variant_test_name: -# name += "." + op.variant_test_name -# if name in fake_skips or "sparse" in name or "jiterator" in name: -# self.skipTest("Skip failing test") - -# samples = op.sample_inputs(device, dtype, requires_grad=False) -# for sample in samples: -# mode = FakeTensorMode() - -# from torch.fx.experimental.symbolic_shapes import ShapeEnv - -# allow_dynamic_output_shape_shape_env = ShapeEnv( -# allow_dynamic_output_shape_ops=True -# ) - -# allow_dynamic_output_shape_mode = FakeTensorMode( -# shape_env=allow_dynamic_output_shape_shape_env -# ) - -# try: -# with context(): -# res = op(sample.input, *sample.args, **sample.kwargs) -# except Exception: -# continue - -# def run_with_fake_mode_and_verify(fake_mode, match_results=True): -# def map_to_fake(e): -# if isinstance(e, torch.Tensor): -# return fake_mode.from_tensor(e) -# else: -# return e - -# input = tree_map(map_to_fake, sample.input) -# args = tree_map(map_to_fake, sample.args) -# kwargs = tree_map(map_to_fake, sample.kwargs) - -# try: -# with context(): -# with fake_mode: -# res_fake = op(input, *args, **kwargs) - -# if not match_results: -# return - -# for fake_out, real_out in zip( -# pytree.tree_leaves(res_fake), pytree.tree_leaves(res) -# ): -# if not isinstance(fake_out, torch.Tensor): -# self.assertTrue(not isinstance(real_out, torch.Tensor)) -# self.assertEqual(fake_out, real_out) -# continue - -# self.assertTrue(isinstance(fake_out, FakeTensor)) -# # if you see a shape exception here, you may need to add -# # a `dynamic_output_shape` tag to an operator - -# # prims/decomps must correctly model strides, -# # see https://github.com/pytorch/pytorch/issues/78050#issuecomment-1253950325 -# prims.utils.compare_tensor_meta(fake_out, real_out, True) - -# if name not in aliasing_failures: -# fake_aliasing = outputs_alias_inputs( -# (input, args, kwargs), 
res_fake -# ) -# real_aliasing = outputs_alias_inputs( -# (sample.input, sample, args, sample.kwargs), res -# ) -# self.assertEqual(fake_aliasing, real_aliasing) - -# self.assertTrue( -# name not in dynamic_output_op_tests -# and name not in data_dependent_op_tests -# ) - -# except torch._subclasses.fake_tensor.UnsupportedFakeTensorException: -# pass -# except torch._subclasses.fake_tensor.UnsupportedOperatorException: -# pass -# except torch._subclasses.fake_tensor.DynamicOutputShapeException: -# self.assertTrue( -# name in dynamic_output_op_tests -# or name in sometimes_dynamic_output_op_test -# ) -# self.assertTrue( -# mode.shape_env is None -# or not mode.shape_env.allow_dynamic_output_shape_ops -# or name not in supported_dynamic_output_op_tests -# ) -# except torch._subclasses.fake_tensor.DataDependentOutputException: -# self.assertTrue(name in data_dependent_op_tests) - -# run_with_fake_mode_and_verify(mode) -# if name in supported_dynamic_output_op_tests: -# run_with_fake_mode_and_verify( -# allow_dynamic_output_shape_mode, match_results=False -# ) - -# @ops(op_db, dtypes=OpDTypes.any_one) -# def test_pointwise_ops(self, device, dtype, op): -# name = op.name -# if op.variant_test_name: -# name += "." + op.variant_test_name -# if name in fake_skips or "sparse" in name or "jiterator" in name: -# self.skipTest("Skip failing test") - -# test_self = self - -# class TestPointwiseMode(TorchDispatchMode): -# def __torch_dispatch__(self, func, types, args=(), kwargs=None): -# kwargs = kwargs or {} - -# out = func(*args, **kwargs) - -# if torch.Tag.pointwise in func.tags: -# shapes = [] -# for inp in pytree.arg_tree_leaves(*args, **kwargs): -# if isinstance(inp, torch.Tensor): -# shapes.append(inp.shape) - -# out_shape = torch._refs._broadcast_shapes(*shapes) - -# for out_elem in pytree.tree_leaves(out): -# if isinstance(out_elem, torch.Tensor): -# test_self.assertEqual(out_elem.shape, out_shape) - -# return out - -# samples = op.sample_inputs(device, dtype, requires_grad=False) -# for sample in samples: -# mode = FakeTensorMode() - -# def map_to_fake(e): -# if isinstance(e, torch.Tensor): -# return mode.from_tensor(e) -# else: -# return e - -# input = tree_map(map_to_fake, sample.input) -# args = tree_map(map_to_fake, sample.args) -# kwargs = tree_map(map_to_fake, sample.kwargs) - -# try: -# op(input, *args, **kwargs) -# except Exception as e: -# continue - -# with TestPointwiseMode(): -# with mode: -# op(input, *args, **kwargs) - -# @ops(op_db, dtypes=OpDTypes.any_one) -# def test_fake(self, device, dtype, op): -# self._test_fake_helper(device, dtype, op, contextlib.nullcontext) - -# @ops(op_db, dtypes=OpDTypes.any_one) -# def test_fake_autocast(self, device, dtype, op): -# if op.name in fake_autocast_device_skips[device]: -# self.skipTest("Skip failing test") -# context = ( -# torch.cuda.amp.autocast if device == "cuda" else torch.cpu.amp.autocast -# ) -# self._test_fake_helper(device, dtype, op, context) - -# def _test_fake_crossref_helper(self, device, dtype, op, context): -# samples = op.sample_inputs(device, dtype, requires_grad=True) - -# for iter, sample in enumerate(samples): -# args = [sample.input] + list(sample.args) -# kwargs = sample.kwargs - -# # skip these to speed up tests -# common_skip_ops = ( -# aten.detach.default, -# aten.empty_strided.default, -# aten.copy_.default, -# aten.is_same_size.default, -# ) - -# # TODO: enable check_aliasing, batch norm fails -# try: -# with torch._subclasses.CrossRefFakeMode( -# ignore_op_fn=lambda fn: fn in common_skip_ops, 
check_aliasing=True -# ): -# with warnings.catch_warnings(), context(), torch.autograd.set_multithreading_enabled( -# False -# ): -# composite_compliance.compute_expected_grads( -# op.get_op(), -# args, -# kwargs, -# sample.output_process_fn_grad, -# op.gradcheck_wrapper, -# ) -# except torch._subclasses.fake_tensor.UnsupportedOperatorException: -# pass - -# @onlyCUDA -# @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) -# @skipOps( -# "TestFakeTensor", "test_fake_crossref_backward_no_amp", fake_backward_xfails -# ) -# def test_fake_crossref_backward_no_amp(self, device, dtype, op): -# self._test_fake_crossref_helper(device, dtype, op, contextlib.nullcontext) - -# @onlyCUDA -# @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) -# @skipOps( -# "TestFakeTensor", -# "test_fake_crossref_backward_amp", -# fake_backward_xfails | fake_autocast_backward_xfails, -# ) -# def test_fake_crossref_backward_amp(self, device, dtype, op): -# self._test_fake_crossref_helper(device, dtype, op, torch.cuda.amp.autocast) - -# @ops([op for op in ops_and_refs if op.is_factory_function]) -# def test_strided_layout(self, device, dtype, op): -# samples = op.sample_inputs(device, dtype) -# for sample in samples: -# kwargs = sample.kwargs.copy() -# kwargs["layout"] = torch.strided -# strided_result = op(sample.input, *sample.args, **kwargs) -# self.assertEqual(strided_result.layout, torch.strided) + @ops(op_db, allowed_dtypes=(torch.float,)) + def test_view_replay(self, device, dtype, op): + def _assert_match_metadata(a, b): + self.assertEqual(a.size(), b.size()) + self.assertEqual(a.stride(), b.stride()) + self.assertEqual(a.storage_offset(), b.storage_offset()) + self.assertEqual(a.device, b.device) + self.assertEqual(a.dtype, b.dtype) + + # ensure view replay is enabled + with torch.autograd._force_original_view_tracking(True): + for sample in op.sample_inputs(device, dtype, requires_grad=False): + inp = sample.input + outs = op(inp, *sample.args, **sample.kwargs) + if not isinstance(outs, (tuple, List)): + outs = [outs] + + # for all outputs that are views of the input, we should be able to replay the + # forward and reverse views via a functioning view_func() / rev_view_func(). + for out in outs: + if not ( + isinstance(out, torch.Tensor) + and out._is_view() + and out._base is inp + ): + continue + + # forward view_func + new_inp = inp.clone() + _assert_match_metadata(new_inp, inp) + new_out = out._view_func_unsafe(new_inp) + _assert_match_metadata(new_out, out) + self.assertEqual(new_out, out) + + # reverse view_func + new_out = out.detach() + new_inp = out._rev_view_func_unsafe(new_out) + _assert_match_metadata(new_inp, inp) + self.assertTrue(new_inp._is_view()) + self.assertTrue(new_inp._base is new_out) + + +@unMarkDynamoStrictTest +class TestMathBits(TestCase): + # Tests that + # 1. The operator's output for physically conjugated/negated tensors and conjugate/negative view tensors + # produces the same value + # 2. The gradients are same in both cases mentioned in (1) + # 3. If the operator's inplace variant is supported, tests that the inplace operation + # produces the correct value when called on a conjugate/negative view tensor and that the output + # has its conj/neg bit set to true + # This test only runs for C -> R and C -> C functions + # TODO: add tests for `R->C` functions + # Note: This test runs for functions that take both tensors and tensorlists as input. 
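
A minimal standalone sketch of the conjugate/negative view bits that the math-view tests below rely on (illustrative only; it is not drawn from the patch itself):

    import torch

    z = torch.randn(3, dtype=torch.cfloat)
    lazy = z.conj()                  # lazy: sets the conj bit, shares storage with z
    eager = z.conj_physical()        # eager: materializes the conjugated values

    assert lazy.is_conj() and not eager.is_conj()
    assert torch.equal(lazy, eager)  # both resolve to the same values
    neg = torch._neg_view(z)         # analogous lazy negation bit
    assert neg.is_neg()
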
+ def _test_math_view( + self, + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + out_type, + ): + inplace_variant = op.inplace_variant + + # helper function to clone and conjugate/negate the input if its a tensor + # else clone the sequence and conjugate/negate the first element in the sequence + # If a requires_grad argument is provided the tensor being conjugated/negated will + # have its requires_grad set to that value. + def clone_and_perform_view(input, **kwargs): + if isinstance(input, torch.Tensor): + requires_grad = kwargs.get("requires_grad", input.requires_grad) + with torch.no_grad(): + # Ensure view represents the original sample input + input = math_op_physical(input) + # Note: .conj() is not called under no_grad mode since it's not allowed to modify a + # view created in no_grad mode. Here it's ok to do so, so as a workaround we call conj + # before resetting the requires_grad field for input + input = math_op_view(input) + assert input.is_leaf + return input.requires_grad_(requires_grad) + + if isinstance(input, Sequence): + out = list(map(clone_input_helper, input)) + out[0] = clone_and_perform_view(out[0]) + return tuple(out) + + for sample in samples: + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) + cloned1 = clone_and_perform_view(sample.input) + + # Computes function forward value with a physically conjugated/negated tensor and + # a conj/neg view tensor and verifies that the output in both case are equal. + expected_forward = op(sample.input, *sample.args, **sample.kwargs) + forward_with_mathview = op(cloned1, *sample.args, **sample.kwargs) + self.assertEqual(expected_forward, forward_with_mathview) + + # If the op has an inplace variant, and the input doesn't require broadcasting + # and has the same dtype as output, verify that the inplace operation on a conjugated/negated + # input produces correct output, and the output tensor has the conj/neg bit set to True + if inplace_variant is not None and not sample.broadcasts_input: + cloned2 = clone_and_perform_view(tensor, requires_grad=False) + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.dtype is tensor.dtype + ): + inplace_forward = inplace_variant( + cloned2, *sample.args, **sample.kwargs + ) + self.assertTrue(is_bit_set(inplace_forward)) + self.assertEqual(inplace_forward, expected_forward) + + # TODO: backward consistency only supported for single tensor outputs + # TODO: backward consistency only checked on sample.input, not all + # tensor inputs + # TODO: update to handle checking grads of all tensor inputs as + # derived from each tensor output + if ( + isinstance(expected_forward, torch.Tensor) + and expected_forward.requires_grad + ): + output_process_fn_grad = sample.output_process_fn_grad or (lambda x: x) + expected_forward = output_process_fn_grad(expected_forward) + forward_with_mathview = output_process_fn_grad(forward_with_mathview) + + tensor = ( + sample.input + if isinstance(sample.input, torch.Tensor) + else sample.input[0] + ) + expected_forward.sum().abs().backward(retain_graph=True) + forward_with_mathview.sum().abs().backward(retain_graph=True) + if tensor.grad is not None: + cloned1_tensor = ( + cloned1 if isinstance(cloned1, torch.Tensor) else cloned1[0] + ) + self.assertEqual(tensor.grad, cloned1_tensor.grad) + + tensor.grad, cloned1_tensor.grad = None, None + + # a repeat of the above test if output is not complex valued + if out_type(expected_forward): + grad = 
torch.randn_like(expected_forward) + expected_forward.backward(grad) + forward_with_mathview.backward( + math_op_view(math_op_physical(grad)) + ) + + self.assertEqual(tensor.grad, cloned1_tensor.grad) + + @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) + def test_conj_view(self, device, dtype, op): + if not op.test_conjugated_samples: + self.skipTest("Operation doesn't support conjugated inputs.") + math_op_physical = torch.conj_physical + math_op_view = torch.conj + _requires_grad = torch.cfloat in op.supported_backward_dtypes( + torch.device(device).type + ) + is_bit_set = torch.is_conj + samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + torch.is_complex, + ) + + @ops(ops_and_refs, allowed_dtypes=(torch.double,)) + def test_neg_view(self, device, dtype, op): + if not op.test_neg_view: + self.skipTest("Operation not tested with tensors with negative bit.") + math_op_physical = torch.neg + math_op_view = torch._neg_view + is_bit_set = torch.is_neg + samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + lambda x: True, + ) + + @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) + def test_neg_conj_view(self, device, dtype, op): + if not op.test_neg_view: + self.skipTest("Operation not tested with tensors with negative bit.") + if not op.test_conjugated_samples: + self.skipTest("Operation doesn't support conjugated inputs.") + + def math_op_physical(x): + return -x.conj_physical() + + def math_op_view(x): + return torch._neg_view(x).conj() + + def is_bit_set(x): + return torch.is_neg(x) and torch.is_conj(x) + + _requires_grad = dtype in op.supported_backward_dtypes( + torch.device(device).type + ) + samples = op.sample_inputs(device, dtype, requires_grad=_requires_grad) + # Only test one sample + samples = itertools.islice(samples, 1) + self._test_math_view( + device, + dtype, + op, + samples, + math_op_physical, + math_op_view, + is_bit_set, + torch.is_complex, + ) + + +# input strides and size may have been altered due to the result of an inplace op +def check_inplace_view(func, input, rs, input_size, input_strides): + if func is None: + return + # TODO: extend this test to test ops with multiple outputs and ops like native_batch_norm(_legit).out + # which mutate not necessarily the first input. + if isinstance(rs, torch.Tensor) and rs is input: + unequal_size = rs.size() != input_size + unequal_strides = rs.stride() != input_strides + # resize_ should probably have inplace_view tag. 
Not adding the tag since it + # breaks some codegen logic + if unequal_size or unequal_strides: + if isinstance(func, torch._ops.OpOverloadPacket): + func = func.default + # Reference: https://github.com/pytorch/pytorch/issues/78759 + if func is not torch.ops.aten.resize_.default: + # TODO: use self.assertIn when we have separate tests for each tag + assert torch.Tag.inplace_view in func.tags + + +# A mode that when enabled runs correctness checks to ensure +# that operators have expected tags based on their input and +# output tensor properties +class TestTagsMode(TorchDispatchMode): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if isinstance(args[0], torch.Tensor): + old_size = args[0].size() + old_stride = args[0].stride() + rs = func(*args, **kwargs) + check_inplace_view(func, args[0], rs, old_size, old_stride) + else: + rs = func(*args, **kwargs) + return rs + + +# Test to verify the correctness for tags in `tags.yaml`, also available for access through `torch.Tags` +@unMarkDynamoStrictTest +class TestTags(TestCase): + @onlyCPU + @ops(ops_and_refs, dtypes=OpDTypes.any_one) + def test_tags(self, device, dtype, op): + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample in samples: + # TODO: Test tags for ops that return a list of tensors + input = sample.input + if isinstance(input, torch.Tensor): + old_size = input.size() + old_stride = input.stride() + with TestTagsMode(): + rs = op(input, *sample.args, **sample.kwargs) + # TODO: add test for aliases: https://github.com/pytorch/pytorch/issues/78761 + aten_name = op.aten_name if op.aten_name is not None else op.name + opoverloadpacket = getattr(torch.ops.aten, aten_name, None) + check_inplace_view(opoverloadpacket, input, rs, old_size, old_stride) + + +class TestSelfKwarg(TestCase): + def test_self_kwargs(self): + """Verify that we can call the aten ops with all kwargs even if the + argument's name is "self" + """ + torch.ops.aten.reshape.default(self=torch.rand(1, 2), shape=[2]) + torch.ops.aten.min.default(self=torch.rand(100)) + + +@unMarkDynamoStrictTest +class TestRefsOpsInfo(TestCase): + import_paths = [ + "_refs", + "_refs.special", + "_refs.nn.functional", + "_refs.fft", + "_refs._conversions", + ] + module_alls = [ + (path, import_module(f"torch.{path}").__all__) for path in import_paths + ] + ref_ops_names = tuple( + itertools.chain.from_iterable( + [f"{path}.{op}" for op in module_all] for path, module_all in module_alls + ) + ) + ref_db_names = {ref_op.name for ref_op in python_ref_db} + + # TODO: References that do not have an entry in python_ref_db + skip_ref_ops = { + "_refs.alias", + "_refs.bitwise_right_shift", + "_refs.copy_to", + "_refs.empty_permuted", + "_refs.empty_strided", + "_refs.equal", + "_refs.full", + "_refs.full_like", + "_refs.is_complex", + "_refs.to", + "_refs.mvlgamma", + "_refs.ones", + "_refs.ones_like", + "_refs.special.expit", + "_refs.std_var", + "_refs.swap_axes", + "_refs.uniform", + "_refs.scalar_tensor", + "_refs.trunc_divide", + "_refs.zero", + "_refs.zeros", + "_refs.zeros_like", + "_refs.rfloordiv", + "_refs.rtruediv", + "_refs.rpow", + # These should be tested with their out-of-place counterparts + "_refs.index_add_", + "_refs.index_copy_", + "_refs.index_fill_", + "_refs.native_group_norm", + } + + not_in_decomp_table = { + # duplicated in _decomp and _refs + "_refs.nn.functional.group_norm", + "_refs.nn.functional.mse_loss", + "_refs.floor_divide", + # duplicated as refs do not have decent support for advanced indexing + "_refs.index_copy", 
+ "_refs.index_copy_", + "_refs.index_add", + "_refs.index_add_", + # these are not aten ops? + "_refs._conversions.bfloat16", + "_refs._conversions.bool", + "_refs._conversions.byte", + "_refs._conversions.char", + "_refs._conversions.double", + "_refs._conversions.float", + "_refs._conversions.half", + "_refs._conversions.int", + "_refs._conversions.long", + "_refs._conversions.short", + "_refs._conversions.chalf", + "_refs._conversions.cfloat", + "_refs._conversions.cdouble", + "_refs.broadcast_shapes", + "_refs.broadcast_tensors", + "_refs.mvlgamma", + "_refs.nn.functional.layer_norm", + "_refs.nn.functional.tanhshrink", + "_refs.nn.functional.triplet_margin_loss", + "_refs.rfloordiv", + "_refs.rtruediv", + "_refs.rpow", + # CompositeImplicitAutograd + "_refs.allclose", + "_refs.atleast_1d", + "_refs.atleast_2d", + "_refs.atleast_3d", + "_refs.broadcast_to", + "_refs.chunk", + "_refs.column_stack", + "_refs.contiguous", + "_refs.dsplit", + "_refs.dstack", + "_refs.fill", + "_refs.fill_", + "_refs.flatten", + "_refs.fliplr", + "_refs.flipud", + "_refs.float_power", + "_refs.hsplit", + "_refs.hstack", + "_refs.isclose", + "_refs.isfinite", + "_refs.isreal", + "_refs.istft", + "_refs.log_softmax", + "_refs.movedim", + "_refs.narrow", + "_refs.nn.functional.dropout", + "_refs.nn.functional.l1_loss", + "_refs.nn.functional.smooth_l1_loss", + "_refs.nn.functional.log_softmax", + "_refs.nn.functional.poisson_nll_loss", + "_refs.nn.functional.softmax", + "_refs.nn.functional.softmin", + "_refs.positive", + "_refs.ravel", + "_refs.reshape", + "_refs.softmax", + "_refs.special.expit", + "_refs.special.log_softmax", + "_refs.special.softmax", + "_refs.square", + "_refs.stft", + "_refs.T", + "_refs.take_along_dim", + "_refs.tensor_split", + "_refs.to", + "_refs.true_divide", + "_refs.trunc_divide", + "_refs.vsplit", + "_refs.vstack", + "_refs.linalg.matrix_norm", + "_refs.linalg.norm", + "_refs.linalg.svd", + "_refs.linalg.svdvals", + "_refs.unflatten", + "_refs.sum_to_size", + # ref implementation missing kwargs + "_refs.full_like", # missing "layout" + "_refs.scalar_tensor", # missing "layout" + # other + "_refs.block_diag", # only refs._block_diag_iterable is in decomposition table + "_refs.empty", # intentional; direct empty is faster and has less guards + "_refs.empty_permuted", # intentional; direct empty is faster and has less guards + "_refs.expand_as", + "_refs.as_strided", # _prims._as_strided_meta: "reduce() of empty sequence with no initial value" + "_refs.copy_to", # torch._C._jit_get_operation: No such operator aten::copy_to + "_refs.equal", # 'bool' object has no attribute 'dtype' + "_refs.conj", # Calls _prims.conj + "_refs.real", + "_refs.imag", + "_refs.reshape_as", + "_refs.view_as", + "_refs.view_as_complex", # TorchInductor does not support complex at the moment. 
+ # the decompositions for these ops are slightly different + # because of out handling + "_refs.var_mean", + "_refs.std_mean", + "_refs.native_layer_norm", + } + + @parametrize("op", ref_ops_names) + def test_refs_are_in_python_ref_db(self, op): + inplace = op[-1] == "_" + if op in self.skip_ref_ops: + raise unittest.SkipTest(f"{op} does not have an entry in python_ref_db") + elif inplace: + self.assertNotIn( + op, + self.ref_db_names, + msg=f"{op} is an in-place operation and should not have an OpInfo", + ) + else: + # Intentionally don't use assertIn to avoid printing the + # (very large) container + self.assertTrue(op in self.ref_db_names, msg=f"{op} not in ref_db_names") + + @parametrize("op", ref_ops_names) + def test_refs_are_in_decomp_table(self, op): + path = op.split(".") + module_path = ".".join(path[:-1]) + op_name = path[-1] + op_impl = getattr(import_module(f"torch.{module_path}"), op_name) + + if op in self.not_in_decomp_table: + self.assertNotIn( + op_impl, + torch._decomp.decomposition_table.values(), + f"Unexpectedly found {op} in torch._decomp.decomposition_table.values()", + ) + else: + self.assertIn( + op_impl, + torch._decomp.decomposition_table.values(), + f"Did not find {op} in torch._decomp.decomposition_table.values()", + ) + + +fake_skips = ( + "aminmax", # failing input + "cov", # aweights cannot be negtaive + "istft", # window overlap add min: 0 + "linalg.eigvals", # The tensor has a non-zero number of elements, but its data is not allocated yet + "linalg.eigvalsh", # aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend + "linalg.matrix_power", # Could not run 'aten::eye.m_out' with arguments from the 'Meta' backend + # "linalg.pinv", # Could not run 'aten::pinv.out' with arguments from the 'Meta' backen + "linalg.matrix_rank.hermitian", # Could not run 'aten::linalg_eigvalsh.out' with arguments from the 'Meta' backend + "linalg.pinv.hermitian", # tensor.mH is only supported on matrices or batches of matrices. Got 1-D tensor + "linalg.solve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' backend + "linalg.tensorsolve", # Could not run 'aten::linalg_solve' with arguments from the 'Meta' + "lu_solve", # MALLOC ERROR: debug + "multinomial", # Could not run 'aten::multinomial' with arguments from the 'Meta' backend + "mvlgamma.mvlgamma_p_1", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend + "mvlgamma.mvlgamma_p_3", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend + "mvlgamma.mvlgamma_p_5", # Could not run 'aten::_local_scalar_dense' with arguments from the 'Meta' backend + "nanmean", # logical_not() got an unexpected keyword argument 'out' + "quantile", # quantile() q values must be in the range [0, 1] + "nanquantile", # quantile() q values must be in the range [0, 1] + "nn.functional.ctc_loss", # The tensor has a non-zero number of elements, but its data is not allocated yet + "nn.functional.embedding_bag", # sometimes errors + "nn.functional.nll_loss", # sometimes errors + "nn.functional.max_pool1d", # The tensor has a non-zero number of elements + "to_sparse", # Could not run 'aten::_to_sparse' with arguments from the 'Meta' backend + "tensor_split", # The tensor has a non-zero number of elements, but its data is not allocated yet + "repeat_interleave", # cannot repeat_interleave a meta tensor without output_size + "sparse.sampled.addmm", # sparsity not supported + # Can not infer total number of classes from meta. 
no way at present to throw DynamicOutputShapeException + "nn.functional.one_hot", + "narrow", # Fails only for one overload with DataDependentOutputException (hence skip). +) + +fake_autocast_device_skips = defaultdict(dict) + +# TODO: investigate/fix +fake_autocast_device_skips["cpu"] = {"linalg.pinv"} + + +dynamic_output_op_tests = ( + "argwhere", + "bincount", + "combinations", + "linalg.lstsq", + "masked_select", + "nonzero", + "unique_consecutive", + "unique", + "linalg.lstsq.grad_oriented", +) + +# Ops that have dynamic output shapes that we can handle when +# allow_dynamic_shape_ops is True in fake tensor shape environment. +supported_dynamic_output_op_tests = ( + "nonzero", + "unique", + "repeat_interleave", + "masked_select", +) + +# some inputs invoke dynamic output shape operators, some do not +sometimes_dynamic_output_op_test = ( + "__getitem__", + "index_select", +) + +data_dependent_op_tests = ( + "equal", + "corrcoef", + "nn.functional.gaussian_nll_loss", + "allclose", +) + +aliasing_failures = ("histogramdd",) + +fake_backward_skips = { + "linalg.cond", + "linalg.matrix_norm", + "linalg.norm", + "linalg.svd", + "linalg.svdvals", + "pca_lowrank", + "roll", + "svd_lowrank", + "sgn", +} + +fake_backward_xfails = {skip(s) for s in fake_backward_skips} | { + xfail("fft.ihfftn"), # Mismatch in aten._conj_physical.default + xfail("fft.ihfft2"), # Mismatch in aten._conj_physical.default + skip("nn.functional.ctc_loss"), +} + +fake_autocast_backward_xfails = { + skip("nn.functional.binary_cross_entropy"), + skip("sparse.sampled_addmm"), + skip("linalg.pinv"), + skip("linalg.pinv", "hermitian"), + skip("linalg.pinv", "singular"), + skip("pinverse"), +} + + +@unMarkDynamoStrictTest +class TestFakeTensor(TestCase): + def setUp(self): + # Turn on FakeTensor caching and cross-checking for these tests: + cache_enabled = unittest.mock.patch( + "torch._dynamo.config.fake_tensor_cache_enabled", True + ) + cache_enabled.start() + self.addCleanup(cache_enabled.stop) + + cache_crosscheck = unittest.mock.patch( + "torch._dynamo.config.fake_tensor_cache_crosscheck_enabled", True + ) + cache_crosscheck.start() + self.addCleanup(cache_crosscheck.stop) + + def _test_fake_helper(self, device, dtype, op, context): + name = op.name + if op.variant_test_name: + name += "." 
+ op.variant_test_name + if name in fake_skips or "sparse" in name or "jiterator" in name: + self.skipTest("Skip failing test") + + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample in samples: + mode = FakeTensorMode() + + from torch.fx.experimental.symbolic_shapes import ShapeEnv + + allow_dynamic_output_shape_shape_env = ShapeEnv( + allow_dynamic_output_shape_ops=True + ) + + allow_dynamic_output_shape_mode = FakeTensorMode( + shape_env=allow_dynamic_output_shape_shape_env + ) + + try: + with context(): + res = op(sample.input, *sample.args, **sample.kwargs) + except Exception: + continue + + def run_with_fake_mode_and_verify(fake_mode, match_results=True): + def map_to_fake(e): + if isinstance(e, torch.Tensor): + return fake_mode.from_tensor(e) + else: + return e + + input = tree_map(map_to_fake, sample.input) + args = tree_map(map_to_fake, sample.args) + kwargs = tree_map(map_to_fake, sample.kwargs) + + try: + with context(): + with fake_mode: + res_fake = op(input, *args, **kwargs) + + if not match_results: + return + + for fake_out, real_out in zip( + pytree.tree_leaves(res_fake), pytree.tree_leaves(res) + ): + if not isinstance(fake_out, torch.Tensor): + self.assertTrue(not isinstance(real_out, torch.Tensor)) + self.assertEqual(fake_out, real_out) + continue + + self.assertTrue(isinstance(fake_out, FakeTensor)) + # if you see a shape exception here, you may need to add + # a `dynamic_output_shape` tag to an operator + + # prims/decomps must correctly model strides, + # see https://github.com/pytorch/pytorch/issues/78050#issuecomment-1253950325 + prims.utils.compare_tensor_meta(fake_out, real_out, True) + + if name not in aliasing_failures: + fake_aliasing = outputs_alias_inputs( + (input, args, kwargs), res_fake + ) + real_aliasing = outputs_alias_inputs( + (sample.input, sample, args, sample.kwargs), res + ) + self.assertEqual(fake_aliasing, real_aliasing) + + self.assertTrue( + name not in dynamic_output_op_tests + and name not in data_dependent_op_tests + ) + + except torch._subclasses.fake_tensor.UnsupportedFakeTensorException: + pass + except torch._subclasses.fake_tensor.UnsupportedOperatorException: + pass + except torch._subclasses.fake_tensor.DynamicOutputShapeException: + self.assertTrue( + name in dynamic_output_op_tests + or name in sometimes_dynamic_output_op_test + ) + self.assertTrue( + mode.shape_env is None + or not mode.shape_env.allow_dynamic_output_shape_ops + or name not in supported_dynamic_output_op_tests + ) + except torch._subclasses.fake_tensor.DataDependentOutputException: + self.assertTrue(name in data_dependent_op_tests) + + run_with_fake_mode_and_verify(mode) + if name in supported_dynamic_output_op_tests: + run_with_fake_mode_and_verify( + allow_dynamic_output_shape_mode, match_results=False + ) + + @ops(op_db, dtypes=OpDTypes.any_one) + def test_pointwise_ops(self, device, dtype, op): + name = op.name + if op.variant_test_name: + name += "." 
+ op.variant_test_name + if name in fake_skips or "sparse" in name or "jiterator" in name: + self.skipTest("Skip failing test") + + test_self = self + + class TestPointwiseMode(TorchDispatchMode): + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + out = func(*args, **kwargs) + + if torch.Tag.pointwise in func.tags: + shapes = [] + for inp in pytree.arg_tree_leaves(*args, **kwargs): + if isinstance(inp, torch.Tensor): + shapes.append(inp.shape) + + out_shape = torch._refs._broadcast_shapes(*shapes) + + for out_elem in pytree.tree_leaves(out): + if isinstance(out_elem, torch.Tensor): + test_self.assertEqual(out_elem.shape, out_shape) + + return out + + samples = op.sample_inputs(device, dtype, requires_grad=False) + for sample in samples: + mode = FakeTensorMode() + + def map_to_fake(e): + if isinstance(e, torch.Tensor): + return mode.from_tensor(e) + else: + return e + + input = tree_map(map_to_fake, sample.input) + args = tree_map(map_to_fake, sample.args) + kwargs = tree_map(map_to_fake, sample.kwargs) + + try: + op(input, *args, **kwargs) + except Exception as e: + continue + + with TestPointwiseMode(): + with mode: + op(input, *args, **kwargs) + + @ops(op_db, dtypes=OpDTypes.any_one) + def test_fake(self, device, dtype, op): + self._test_fake_helper(device, dtype, op, contextlib.nullcontext) + + @ops(op_db, dtypes=OpDTypes.any_one) + def test_fake_autocast(self, device, dtype, op): + if op.name in fake_autocast_device_skips[device]: + self.skipTest("Skip failing test") + context = ( + torch.cuda.amp.autocast if device == "cuda" else torch.cpu.amp.autocast + ) + self._test_fake_helper(device, dtype, op, context) + + def _test_fake_crossref_helper(self, device, dtype, op, context): + samples = op.sample_inputs(device, dtype, requires_grad=True) + + for iter, sample in enumerate(samples): + args = [sample.input] + list(sample.args) + kwargs = sample.kwargs + + # skip these to speed up tests + common_skip_ops = ( + aten.detach.default, + aten.empty_strided.default, + aten.copy_.default, + aten.is_same_size.default, + ) + + # TODO: enable check_aliasing, batch norm fails + try: + with torch._subclasses.CrossRefFakeMode( + ignore_op_fn=lambda fn: fn in common_skip_ops, check_aliasing=True + ): + with warnings.catch_warnings(), context(), torch.autograd.set_multithreading_enabled( + False + ): + composite_compliance.compute_expected_grads( + op.get_op(), + args, + kwargs, + sample.output_process_fn_grad, + op.gradcheck_wrapper, + ) + except torch._subclasses.fake_tensor.UnsupportedOperatorException: + pass + + @onlyCUDA + @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) + @skipOps( + "TestFakeTensor", "test_fake_crossref_backward_no_amp", fake_backward_xfails + ) + def test_fake_crossref_backward_no_amp(self, device, dtype, op): + self._test_fake_crossref_helper(device, dtype, op, contextlib.nullcontext) + + @onlyCUDA + @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) + @skipOps( + "TestFakeTensor", + "test_fake_crossref_backward_amp", + fake_backward_xfails | fake_autocast_backward_xfails, + ) + def test_fake_crossref_backward_amp(self, device, dtype, op): + self._test_fake_crossref_helper(device, dtype, op, torch.cuda.amp.autocast) + + @ops([op for op in ops_and_refs if op.is_factory_function]) + def test_strided_layout(self, device, dtype, op): + samples = op.sample_inputs(device, dtype) + for sample in samples: + kwargs = sample.kwargs.copy() + kwargs["layout"] = torch.strided + 
strided_result = op(sample.input, *sample.args, **kwargs) + self.assertEqual(strided_result.layout, torch.strided) #instantiate_device_type_tests(TestCommon, globals(), only_for="xpu") @@ -2693,6 +2692,4 @@ def check_cow_input( if __name__ == "__main__": TestCase._default_dtype_check_enabled = True - #import pdb - #pdb.set_trace() run_tests() diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8d93fcc590b2c..322ead8db2f3c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -10588,7 +10588,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake_autocast'), # Booleans mismatch: AssertionError: False is not true DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake'), - #DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=None), ) ), OpInfo('arange', diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 6ae58240ea75a..a949574326323 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -1030,9 +1030,9 @@ def __post_init__(self): else: #self.skips = (DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=skip_dtypes)) self.skips = (DecorateInfo(unittest.skip, device_type="xpu", dtypes=None)) - print("#### skipXPU on {} {} {}".format(self.name, skip_dtypes, self.skips)) + #print("Skip XPU backend on {} with {} and {}".format(self.name, skip_dtypes, self.skips)) else: - print("#### Don't skipXPU on {}".format(self.name)) + print("Won't skip XPU backend on op {}".format(self.name)) self.decorators = (*self.decorators, *self.skips) @@ -1375,8 +1375,7 @@ def sample_inputs_sparse_bsc(self, device, dtype, requires_grad=False, **kwargs) def get_decorators(self, test_class, test_name, device, dtype, param_kwargs): """Returns the decorators targeting the given test.""" result = [] - #import pdb - #pdb.set_trace() + for decorator in self.decorators: if isinstance(decorator, DecorateInfo): if decorator.is_active( From fa6c8ae1da4bd242d1319ba431abe474a593da03 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 21 May 2024 20:35:49 -0700 Subject: [PATCH 04/37] refine the xpu ops switch method --- test/test_ops.py | 25 ++-- torch/testing/_internal/common_device_type.py | 11 +- .../_internal/common_methods_invocations.py | 128 +++++++++--------- torch/testing/_internal/opinfo/core.py | 37 ++--- 4 files changed, 92 insertions(+), 109 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 5a8d9bc461f8d..70c2eb41eb26e 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -108,10 +108,6 @@ ) ) -if TEST_XPU: - any_common_cpu_device_one = OpDTypes.any_common_cpu_xpu_one -else: - any_common_cpu_device_one = OpDTypes.any_common_cpu_cuda_one @@ -136,11 +132,13 @@ def reduction_dtype_filter(op): aten = torch.ops.aten -# _xpu_computation_op_list = ["_refs.abs", "_refs.all", "item", "abs", "add", "_refs.fill"] -# _xpu_computation_op_list = ["abs"] -# _xpu_computation_ops = [ -# op for op in ops_and_refs if op.name in _xpu_computation_op_list -# ] +def any_common_cpu_device_one(): + # import pdb + # pdb.set_trace() + return OpDTypes.any_common_cpu_xpu_one if TEST_XPU else OpDTypes.any_common_cpu_cuda_one + +def has_gpu_device(devices: List[str]): + return "cuda" in devices or "xpu" in devices # Tests that apply 
to all operators and aren't related to any particular # system @@ -286,7 +284,7 @@ def test_numpy_ref(self, device, dtype, op): and op.formatted_name in ("signal_windows_exponential", "signal_windows_bartlett") and dtype == torch.float64 - and ("cuda" in device or "xpu" in device) + and has_gpu_device(device) ): # noqa: E121 raise unittest.SkipTest("XXX: raises tensor-likes are not close.") @@ -301,8 +299,7 @@ def test_numpy_ref(self, device, dtype, op): @onlyCUDAAndXPU @suppress_warnings @slowTest - @ops(_ops_and_refs_with_no_numpy_ref, dtypes=any_common_cpu_device_one) - #@ops(_xpu_computation_ops, dtypes=any_common_cpu_device_one) + @ops(_ops_and_refs_with_no_numpy_ref, dtypes=any_common_cpu_device_one()) def test_compare_cpu(self, device, dtype, op): def to_cpu(arg): if isinstance(arg, torch.Tensor): @@ -2683,8 +2680,8 @@ def test_strided_layout(self, device, dtype, op): self.assertEqual(strided_result.layout, torch.strided) -#instantiate_device_type_tests(TestCommon, globals(), only_for="xpu") -instantiate_device_type_tests(TestCompositeCompliance, globals(), only_for="xpu") +instantiate_device_type_tests(TestCommon, globals(), only_for="xpu") +#instantiate_device_type_tests(TestCompositeCompliance, globals(), only_for="xpu") #instantiate_device_type_tests(TestMathBits, globals()) #instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") #instantiate_device_type_tests(TestFakeTensor, globals()) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 17bd8357f15b0..86940f6c95cdc 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -397,19 +397,13 @@ def instantiate_test_helper(cls, name, *, test, param_kwargs=None, decorator_fn= # Add the device param kwarg if the test needs device or devices. param_kwargs = {} if param_kwargs is None else param_kwargs test_sig_params = inspect.signature(test).parameters - #import pdb - #pdb.set_trace() if 'device' in test_sig_params or 'devices' in test_sig_params: device_arg: str = cls._init_and_get_primary_device() if hasattr(test, 'num_required_devices'): device_arg = cls.get_all_devices() _update_param_kwargs(param_kwargs, 'device', device_arg) - #import pdb - #pdb.set_trace() # Apply decorators based on param kwargs. for decorator in decorator_fn(param_kwargs): - #import pdb - #pdb.set_trace() test = decorator(test) # Constructs the test @@ -845,6 +839,7 @@ class OpDTypes(Enum): any_common_cpu_xpu_one = 7 # Test precisely one supported dtype that is common to both xpu and cpu + # Arbitrary order ANY_DTYPE_ORDER = ( torch.float32, @@ -920,8 +915,6 @@ def _parametrize_test(self, test, generic_cls, device_cls): 'instantiate_parametrized_tests()') op = check_exhausted_iterator = object() - #import pdb - #pdb.set_trace() for op in self.op_list: # Determine the set of dtypes to use. dtypes: Union[Set[torch.dtype], Set[None]] @@ -1615,3 +1608,5 @@ def skipPRIVATEUSE1(fn): # This should probably enumerate all available device type test base classes. 
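
The OpInfo entries in the hunks below opt back into XPU testing through enable_skipped_device=('xpu',), while the opinfo/core.py change further down skips that device type by default in __post_init__. A rough sketch of how such a skip-by-default switch can work; MiniOpInfo and FakeDecorateInfo are hypothetical stand-ins used only for illustration:

    import unittest
    from dataclasses import dataclass
    from typing import Optional, Tuple

    @dataclass
    class FakeDecorateInfo:          # hypothetical stand-in for DecorateInfo
        decorator: object
        device_type: Optional[str] = None

    @dataclass
    class MiniOpInfo:                # hypothetical stand-in for OpInfo
        name: str
        skip_device: Tuple[str, ...] = ("xpu",)       # skipped by default
        enable_skipped_device: Tuple[str, ...] = ()   # ops opt back in here
        skips: Tuple = ()

        def __post_init__(self):
            # Every device in skip_device that the op did not re-enable gets an
            # unconditional skip decorator appended to its skip list.
            for device in set(self.skip_device) - set(self.enable_skipped_device):
                self.skips = (*self.skips,
                              FakeDecorateInfo(unittest.skip("disabled device"),
                                               device_type=device))

    print(MiniOpInfo("abs", enable_skipped_device=("xpu",)).skips)  # () -> runs on xpu
    print(MiniOpInfo("cauchy").skips)  # contains an xpu skip -> stays skipped
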
def get_all_device_types() -> List[str]: return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + + diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 322ead8db2f3c..872fca9514c8f 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -10422,7 +10422,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), - skipXPU=False, skips=( DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestBwdGradients', 'test_inplace_grad', dtypes=(torch.cdouble,)), @@ -10456,7 +10455,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_sparse_csc=True, supports_sparse_bsr=True, supports_sparse_bsc=True, - supports_forward_ad=True), + supports_forward_ad=True, + enable_skipped_device=('xpu',)), # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) UnaryUfuncInfo('acos', aliases=('arccos', ), @@ -10545,7 +10545,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, supports_forward_ad=True, supports_two_python_scalars=True, - skipXPU=False, decorators=( DecorateInfo( toleranceOverride({torch.chalf: tol(atol=1e-2, rtol=0)}), @@ -10565,7 +10564,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestBinaryUfuncs', 'test_reference_numerics_extremal_values', dtypes=(torch.complex64, torch.complex128)), - )), + ), + enable_skipped_device=('xpu',), + ), OpInfo('item', op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.item, inp, *args, **kwargs), ref=np.ndarray.item, @@ -10575,7 +10576,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_autograd=False, error_inputs_func=error_inputs_item, sample_inputs_func=sample_inputs_item, - skipXPU=False, skips=( # Error testing item function variant DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', @@ -10588,7 +10588,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake_autocast'), # Booleans mismatch: AssertionError: False is not true DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake'), - ) + ), + enable_skipped_device=('xpu',), ), OpInfo('arange', dtypes=all_types_and(torch.bfloat16, torch.float16), @@ -10597,7 +10598,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): is_factory_function=True, error_inputs_func=error_inputs_arange, sample_inputs_func=sample_inputs_arange, - skipXPU=False, skips=( # https://github.com/pytorch/pytorch/issues/81774 DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'), @@ -10626,7 +10626,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # UserWarning not triggered : Resized a non-empty tensor but did not warn about it. 
DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), - )), + ), + enable_skipped_device=('xpu',)), OpInfo('cauchy', op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.cauchy_, inp, *args, **kwargs), inplace_variant=torch.Tensor.cauchy_, @@ -10806,7 +10807,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_rhs_python_scalar=False, supports_fwgrad_bwgrad=True, rhs_make_tensor_kwargs=dict(exclude_zero=False), - skipXPU=False, skips=( # RuntimeError: "max_elementwise_cuda" not implemented for 'ComplexFloat' DecorateInfo(unittest.expectedFailure, @@ -10817,7 +10817,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_dispatched_to_lazy'), # test error disabled since rhs non-tensor python scalar is supported DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_errors'), - )), + ), + enable_skipped_device=('xpu',),), BinaryUfuncInfo('clamp_min', ref=_clamp_min_numpy, dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), @@ -10825,7 +10826,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_rhs_python_scalar=False, supports_fwgrad_bwgrad=True, rhs_make_tensor_kwargs=dict(exclude_zero=False), - skipXPU=False, skips=( # RuntimeError: "min_elementwise_cuda" not implemented for 'ComplexFloat' DecorateInfo(unittest.expectedFailure, @@ -10836,7 +10836,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_dispatched_to_lazy'), # test error disabled since rhs non-tensor python scalar is supported DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_errors'), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('mul', aliases=('multiply',), dtypes=all_types_and_complex_and(torch.chalf, torch.float16, torch.bfloat16, torch.bool), @@ -10850,7 +10851,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_sparse_csc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_csc), sample_inputs_sparse_bsr_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsr), sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsc), - skipXPU=False), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('sub', # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate @@ -10878,7 +10879,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): toleranceOverride({torch.chalf: tol(atol=5e-3, rtol=0)}), 'TestDecomp', 'test_quick', device_type='cpu'), ), - skipXPU=False, skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', @@ -10888,7 +10888,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestBinaryUfuncs', 'test_reference_numerics_small_values', dtypes=(torch.uint8,)), - )), + ), + enable_skipped_device=('xpu',)), OpInfo('addmm', # This addmm OpInfo is for when alpha and beta are not both equal to 1. 
# alpha=beta=1 is tested in the following opinfo, because that special case will @@ -11331,7 +11332,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=integral_types_and(torch.bool), operator_variant=operator.invert, supports_autograd=False, - skipXPU=False), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('bitwise_left_shift', op=torch.bitwise_left_shift, dtypes=integral_types(), @@ -11461,13 +11462,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_out=False, - skipXPU=False, skips=( # TypeError: _copy_dispatcher() got an unexpected keyword argument 'memory_format' # (NumPy reference needs to be extended with memory_format) DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref'), DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref_mps'), - ),), + ), + enable_skipped_device=('xpu',)), OpInfo('contiguous', op=lambda x, *args, **kwargs: x.contiguous(*args, **kwargs), dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16, torch.chalf), @@ -11503,14 +11504,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): assert_autodiffed=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - skipXPU=False, skips=( # NNC appear to not handle boolean clamp DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness', dtypes=(torch.bool,)), - )), + ), + enable_skipped_device=('xpu',)), UnaryUfuncInfo('positive', ref=np.positive, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), @@ -11574,7 +11575,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_view_as_real, test_conjugated_samples=False, - skipXPU=False, + enable_skipped_device=('xpu',), ), OpInfo('view_as_complex', dtypes=floating_types_and(torch.half), @@ -11583,7 +11584,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, test_neg_view=False, sample_inputs_func=sample_inputs_view_as_complex, - skipXUP=False, skips=( # RuntimeError: Tensor must have a last dimension with stride 1 DecorateInfo(unittest.expectedFailure, "TestCommon", "test_noncontiguous_samples"), @@ -11591,7 +11591,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.skip("Skipped!"), 'TestNNCOpInfo', 'test_nnc_correctness', dtypes=(torch.half,)), # RuntimeError: view size is not compatible with input tensor's size and stride DecorateInfo(unittest.expectedFailure, "TestMeta", "test_dispatch_symbolic_meta_outplace_all_strides"), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('complex', dtypes=floating_types_and(torch.half), supports_forward_ad=True, @@ -11637,7 +11638,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, promotes_int_to_float=True, decorators=(precisionOverride({torch.bfloat16: 1e-2}),), - skipXPU=False, skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', dtypes=(torch.cfloat, torch.cdouble,), device_type='cpu', active_if=IS_WINDOWS), @@ -11655,7 +11655,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs', 'test_reference_numerics_large', device_type='cuda', dtypes=(torch.chalf,), active_if=IS_WINDOWS), - )), + ), + enable_skipped_device=('xpu',)), UnaryUfuncInfo('cosh', ref=np_unary_ufunc_integer_promotion_wrapper(np.cosh), dtypes=all_types_and_complex_and(torch.bool, 
torch.half, torch.bfloat16), @@ -11726,12 +11727,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, - skipXPU=False, skips=( # cumsum does not handle correctly out= dtypes DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'), ), - sample_inputs_func=sample_inputs_cumulative_ops), + sample_inputs_func=sample_inputs_cumulative_ops, + enable_skipped_device=('xpu',)), OpInfo('cumprod', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), supports_forward_ad=True, @@ -11804,7 +11805,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_two_python_scalars=True, assert_autodiffed=True, rhs_make_tensor_kwargs=dict(exclude_zero=True), - skipXPU=False,), + enable_skipped_device=('xpu',),), BinaryUfuncInfo('div', aliases=('divide',), @@ -11822,11 +11823,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # See https://github.com/pytorch/pytorch/issues/111126 DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion'), ), - skipXPU=False, skips=( # RuntimeError: MALFORMED INPUT: Unhandled node kind (in computeValue): aten::div DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_working'), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('div', aliases=('divide',), variant_test_name='floor_rounding', @@ -11843,11 +11844,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # See https://github.com/pytorch/pytorch/issues/111126 DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion'), ), - skipXPU=False, skips=( # RuntimeError: MALFORMED INPUT: Unhandled node kind (in computeValue): aten::div DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_working'), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('true_divide', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), @@ -11947,9 +11948,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): always_returns_bool=True, supports_autograd=False, sample_inputs_func=sample_inputs_comparison_ops, - skipXPU=False, skips=( - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('fmax', op=torch.fmax, dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), @@ -11981,7 +11982,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, assert_autodiffed=None, rhs_make_tensor_kwargs={'exclude_zero': True}, - skipXPU=False, decorators=( DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', 'test_contig_vs_every_other', @@ -11995,7 +11995,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', 'test_reference_numerics_small_values', dtypes=(torch.uint8,)), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('remainder', ref=np.remainder, dtypes=all_types_and(torch.float16, torch.bfloat16), @@ -13028,12 +13029,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_operator_variant=operator.iand, supports_autograd=False, supports_one_python_scalar=True, - skipXPU=False, skips=( # RuntimeError: "bitwise_and_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('bitwise_or', ref=np.bitwise_or, 
dtypes=integral_types_and(torch.bool), @@ -13041,14 +13042,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_operator_variant=operator.ior, supports_autograd=False, supports_one_python_scalar=True, - skipXPU=False, skips=( # TODO: FIXME: RuntimeError: "bitwise_or_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('bitwise_xor', ref=np.bitwise_xor, dtypes=integral_types_and(torch.bool), @@ -13056,14 +13057,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_operator_variant=operator.ixor, supports_autograd=False, supports_one_python_scalar=True, - skipXPU=False, skips=( # TODO: FIXME: RuntimeError: "bitwise_xor_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - )), + ), + enable_skipped_device=('xpu',)), BinaryUfuncInfo('heaviside', ref=lambda a, b: ( # necessary because np.heaviside incorrectly returns float64 when passed args of dtype int64 @@ -13093,12 +13094,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=integral_types_and(), supports_autograd=False, supports_rhs_python_scalar=False, - skipXPU=False, skips=( DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_reference_numerics_small_values', - dtypes=(torch.int8,)),)), + dtypes=(torch.int8,)),), + enable_skipped_device=('xpu',) + ), BinaryUfuncInfo('isclose', ref=np.isclose, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), @@ -15408,13 +15410,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, autodiff_nonfusible_nodes=["aten::gelu"], - skipXPU=False, skips=( # AssertionError: Tensor-likes are not close! # May not replicate in CI DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out'), DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), - )), + ), + enable_skipped_device=('xpu',)), UnaryUfuncInfo('nn.functional.relu6', aten_name="relu6", dtypes=all_types_and(torch.half, torch.bfloat16), @@ -17256,9 +17258,9 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_sparse_csc_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_csc), sample_inputs_sparse_bsr_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_bsr), sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_bsc), - skipXPU=False, skips=( - )), + ), + enable_skipped_device=('xpu',)), OpInfo('ones_like', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), supports_out=False, @@ -17423,7 +17425,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), supports_out=True, sample_inputs_func=sample_inputs_ones_zeros, - skipXPU=False, skips=( # Tests that assume input is a tensor or sequence of tensors DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_variant_consistency_eager'), @@ -17436,7 +17437,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # UserWarning not triggered : Resized a non-empty tensor but did not warn about it. 
DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), - )), + ), + enable_skipped_device=('xpu',)), OpInfo('full', op=torch.full, supports_autograd=False, @@ -17566,7 +17568,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_empty, supports_autograd=False, - skipXPU=False, skips=( DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), # Empty tensor data is garbage so it's hard to make comparisons with it. @@ -17602,7 +17603,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.skip("Expected: empty is not comparable"), 'TestCommon', 'test_complex_half_reference_testing'), DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'), - )), + ), + enable_skipped_device=('xpu',)), OpInfo('eye', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_eye, @@ -17789,7 +17791,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_bernoulli, error_inputs_func=error_inputs_bernoulli, - skipXPU=False, skips=( # vmap: We do not yet support calling random operations inside of vmap DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD'), @@ -17801,7 +17802,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'), # UserWarning not triggered : Resized a non-empty tensor but did not warn about it. DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), - DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'))), + DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu')), + enable_skipped_device=('xpu',)), OpInfo('scatter_add', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_scatter_add, @@ -18048,10 +18050,10 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_repeat_tile, - skipXPU=True, skips=( DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), - )), + ), + enable_skipped_device=('xpu',)), OpInfo('squeeze', ref=_squeeze_ref, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), @@ -18094,14 +18096,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): check_batched_forward_grad=False, dtypes=all_types_and_complex_and(torch.complex32, torch.bool, torch.float16, torch.bfloat16), supports_out=False, - skipXPU=False, skips=( # JIT has issue when op is passed as lambda # AssertionError: JIT Test does not execute any logic DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'), DecorateInfo(unittest.skip("No fill_ op"), 'TestCudaFuserOpInfo'), DecorateInfo(unittest.skip("No fill_ op"), 'TestNNCOpInfo'), - )), + ), + enable_skipped_device=('xpu',)), OpInfo('resize_', op=lambda x, shape: x.clone().resize_(shape), method_variant=None, @@ -18112,14 +18114,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_out=False, supports_autograd=False, - skipXPU=False, skips=( # Cannot resize variables that 
require grad DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_dtypes'), DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), DecorateInfo(unittest.skip("Allowed exception"), 'TestCompositeCompliance', 'test_operator'), ), - sample_inputs_func=sample_inputs_resize_ops), + sample_inputs_func=sample_inputs_resize_ops, + enable_skipped_device=('xpu',)), OpInfo('resize_as_', op=lambda x, other: torch.resize_as_(x.clone(), other), method_variant=None, @@ -18127,13 +18129,13 @@ def reference_flatten(input, start_dim=0, end_dim=-1): dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_out=False, supports_autograd=False, - skipXPU=False, skips=( # Cannot resize variables that require grad DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_dtypes'), DecorateInfo(unittest.skip('Allowed exemption'), 'TestCompositeCompliance', 'test_operator'), ), - sample_inputs_func=sample_inputs_resize_ops), + sample_inputs_func=sample_inputs_resize_ops, + enable_skipped_device=('xpu',)), OpInfo('take_along_dim', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), @@ -19238,11 +19240,11 @@ def reference_flatten(input, start_dim=0, end_dim=-1): result_dtype=torch.bool, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), ref=reference_reduction_numpy(np.all), - skipXPU=False, skips=( # FIXME: uint8 input returns uint8 instead of bool DecorateInfo(unittest.expectedFailure, 'TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), ), + enable_skipped_device=('xpu',), ), ReductionOpInfo( 'any', diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index a949574326323..5d1ac019c8b4b 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -5,7 +5,8 @@ import math import operator import unittest -from dataclasses import asdict, dataclass +from dataclasses import InitVar, asdict, dataclass +from typing import Dict, Optional from enum import Enum from functools import partial from itertools import product @@ -25,7 +26,6 @@ floating_and_complex_types, floating_and_complex_types_and, floating_types, - empty_types, ) from torch.testing._internal.common_utils import ( is_iterable_of_tensors, @@ -683,8 +683,11 @@ class OpInfo: # information about which tests to skip skips: Tuple = tuple() - # skip xpu by default - skipXPU: bool = True + # skip the test for a device + skip_device: Tuple = tuple() + + # enable the test for a device + enable_skipped_device: Tuple = tuple() # decorators to apply to generated tests decorators: Tuple = tuple() @@ -1021,18 +1024,13 @@ def __post_init__(self): else: self.inplace_operator_variant = None - if self.skipXPU == True: - skip_dtypes= self.dtypesIfXPU - + # Skip XPU test by default + self.skip_device = ('xpu',) + for device in (set(self.skip_device).difference(set(self.enable_skipped_device))): if self.skips is not None: - #self.skips = (*self.skips, DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=skip_dtypes)) - self.skips = (*self.skips, DecorateInfo(unittest.skip, device_type="xpu", dtypes=None)) + self.skips = (*self.skips, DecorateInfo(unittest.skip, device_type=device, dtypes=None)) else: - #self.skips = (DecorateInfo(unittest.skip, 'TestCommon', 'test_compare_cpu', device_type="xpu", dtypes=skip_dtypes)) - self.skips = 
(DecorateInfo(unittest.skip, device_type="xpu", dtypes=None)) - #print("Skip XPU backend on {} with {} and {}".format(self.name, skip_dtypes, self.skips)) - else: - print("Won't skip XPU backend on op {}".format(self.name)) + self.skips = (DecorateInfo(unittest.skip, device_type=device, dtypes=None)) self.decorators = (*self.decorators, *self.skips) @@ -1567,7 +1565,6 @@ def __init__( yield tuple(), {}, ), - skipXPU: bool = True, # Options from the OpInfo base class **kwargs, ): @@ -1591,7 +1588,7 @@ def sample_inputs_func(*args, **kwargs): # Override OpInfo defaults and call base class __init__ kwargs.setdefault("inplace_variant", None) kwargs.setdefault("sample_inputs_func", sample_inputs_func) - super().__init__(name, promotes_int_to_float=promotes_int_to_float, skipXPU = skipXPU, **kwargs) + super().__init__(name, promotes_int_to_float=promotes_int_to_float, **kwargs) self.identity = identity self.nan_policy = nan_policy @@ -2156,7 +2153,6 @@ def __init__( supports_rhs_python_scalar=True, # Whether the operator allows Tensor x scalar inputs supports_one_python_scalar=False, # Whether the operator allows scalar x tensor and tensor x scalar inputs supports_two_python_scalars=False, # Whether the operator allows scalar x scalar inputs - skipXPU=True, **kwargs, ): self._original_binary_ufunc_args = locals().copy() @@ -2177,7 +2173,6 @@ def __init__( sample_inputs_func=sample_inputs_func, reference_inputs_func=reference_inputs_func, error_inputs_func=make_error_inputs_elementwise_binary(error_inputs_func), - skipXPU=skipXPU, **kwargs, ) @@ -2506,7 +2501,6 @@ def __init__( reference_inputs_func=reference_inputs_elementwise_unary, sample_kwargs=lambda device, dtype, input: ({}, {}), reference_numerics_filter=None, # Filters values in the range of the domain specified above but that should not be tested - skipXPU=True, **kwargs, ): self._original_unary_ufunc_args = locals().copy() @@ -2516,7 +2510,6 @@ def __init__( dtypes=dtypes, sample_inputs_func=sample_inputs_func, reference_inputs_func=reference_inputs_func, - skipXPU=skipXPU, **kwargs, ) @@ -2651,7 +2644,6 @@ def __init__( ndimensional: SpectralFuncType, sample_inputs_func=sample_inputs_spectral_ops, decorators=None, - skipXPU=True, **kwargs, ): self._original_spectral_func_args = dict(locals()).copy() @@ -2672,7 +2664,6 @@ def __init__( dtypes=dtypes, decorators=decorators, sample_inputs_func=sample_inputs_func, - skipXPU=skipXPU, **kwargs, ) self.ref = ref @@ -2691,7 +2682,6 @@ def __init__( dtypesIfCUDA=None, dtypesIfROCM=None, sample_inputs_func=None, - skipXPU=True, **kwargs, ): super().__init__( @@ -2700,7 +2690,6 @@ def __init__( dtypesIfCUDA=dtypesIfCUDA, dtypesIfROCM=dtypesIfROCM, sample_inputs_func=sample_inputs_func, - skipXPU=skipXPU, **kwargs, ) self.ref = ref From 22ad9f82a0a1979d64313021d2ee2ef9eec25aac Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 21 May 2024 20:38:37 -0700 Subject: [PATCH 05/37] remove unused comments --- test/test_ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 70c2eb41eb26e..a6be62fe3cef6 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -133,8 +133,6 @@ def reduction_dtype_filter(op): aten = torch.ops.aten def any_common_cpu_device_one(): - # import pdb - # pdb.set_trace() return OpDTypes.any_common_cpu_xpu_one if TEST_XPU else OpDTypes.any_common_cpu_cuda_one def has_gpu_device(devices: List[str]): From d87908360b0280f068787c6756dbb7190f008d50 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 26 May 2024 02:32:31 -0700 Subject: 
[PATCH 06/37] added xpu_op_db.yaml to enable xpu op UT. Refined onlyOn interface to support multiple devices.Support PYTORCH_TESTING_DEVICE_ONLY_FOR='xpu' to enable xpu test --- test/test_ops.py | 20 +- test/xpu/xpu_op_db.yaml | 87 ++++++++ torch/testing/_internal/common_device_type.py | 27 ++- .../_internal/common_methods_invocations.py | 185 ++++++++++-------- torch/testing/_internal/common_utils.py | 14 ++ torch/testing/_internal/opinfo/core.py | 22 ++- 6 files changed, 241 insertions(+), 114 deletions(-) create mode 100644 test/xpu/xpu_op_db.yaml diff --git a/test/test_ops.py b/test/test_ops.py index a6be62fe3cef6..ecefb53195788 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -810,7 +810,7 @@ def _extract_strides(out): # NOTE: only extracts on the CPU and CUDA device types since some # device types don't have storage def _extract_data_ptrs(out): - if self.device_type != "cpu" and self.device_type != "cuda": + if self.device_type != "cpu" and self.device_type != "cuda" and self.device_type != "xpu": return () if isinstance(out, torch.Tensor): @@ -938,7 +938,7 @@ def _extract_strides(out): # NOTE: only extracts on the CPU and CUDA device types since some # device types don't have storage def _extract_data_ptrs(out): - if self.device_type != "cpu" and self.device_type != "cuda": + if self.device_type != "cpu" and self.device_type != "cuda" and self.device_type != "xpu": return () if isinstance(out, torch.Tensor): @@ -1016,7 +1016,6 @@ def _case_two_transform(t): elif torch.cuda.is_available(): wrong_device = "cuda" elif torch.xpu.is_available(): - # Daisy ???? wrong_device = "xpu" factory_fn_msg = ( @@ -1674,7 +1673,8 @@ def test_forward_ad(self, device, dtype, op): composite_compliance.check_forward_ad_formula( op.get_op(), args, kwargs, op.gradcheck_wrapper, self.assertEqual ) - + + @skipXPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_cow_input(self, device, dtype, op): samples = op.sample_inputs(device, dtype, requires_grad=op.supports_autograd) @@ -2678,12 +2678,12 @@ def test_strided_layout(self, device, dtype, op): self.assertEqual(strided_result.layout, torch.strided) -instantiate_device_type_tests(TestCommon, globals(), only_for="xpu") -#instantiate_device_type_tests(TestCompositeCompliance, globals(), only_for="xpu") -#instantiate_device_type_tests(TestMathBits, globals()) -#instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") -#instantiate_device_type_tests(TestFakeTensor, globals()) -#instantiate_device_type_tests(TestTags, globals()) +instantiate_device_type_tests(TestCommon, globals()) +instantiate_device_type_tests(TestCompositeCompliance, globals()) +instantiate_device_type_tests(TestMathBits, globals()) +instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") +instantiate_device_type_tests(TestFakeTensor, globals()) +instantiate_device_type_tests(TestTags, globals()) if __name__ == "__main__": TestCase._default_dtype_check_enabled = True diff --git a/test/xpu/xpu_op_db.yaml b/test/xpu/xpu_op_db.yaml new file mode 100644 index 0000000000000..c0833b0cd30ea --- /dev/null +++ b/test/xpu/xpu_op_db.yaml @@ -0,0 +1,87 @@ +backend: XPU +supported: + - fill + - zeros + - zeros_like + - clone + - view_as_real + - view_as_complex + - view + - resize_ + - resize_as_ + - add + - sub + - mul + - div + - abs + - bernoulli + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - clamp + - clamp_max + - clamp_min + - clone + - copy + - cos + - cumsum + - empty + - eq + - fill + - fmod + - gcd + - ge + - gelu + - gt + - 
index_add + - index_put + - index_select + - isnan + - le + - log + - lt + - masked_fill + - maximum + - minimum + - mul + - native_dropout_backward + - ne + - neg + - nn.functional.adaptive_avg_pool2d + - nn.functional.threshold + - nonzero + - normal + - pow + - reciprocal + - rsub + - relu + - remainder + - reshape + - rsqrt + - sin + - sqrt + - sum + - tanh + - unfold + - uniform + - view + - where + - zero + - add + - any + - arange + - as_strided + - flip + - tril + - triu + - cat + - log_softmax + - softmax + - scatter + - gather + - max_pool2d_with_indices_backward + - nn.functional.embedding + - nn.functional.unfold + + diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 86940f6c95cdc..e4455a0b912ed 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -702,8 +702,8 @@ def get_desired_device_type_test_bases(except_for=None, only_for=None, include_l test_bases = device_type_test_bases.copy() if allow_mps and TEST_MPS and MPSTestBase not in test_bases: test_bases.append(MPSTestBase) - if only_for == 'xpu' and TEST_XPU and XPUTestBase not in test_bases: - test_bases.append(XPUTestBase) + if (only_for == 'xpu' or 'xpu' in os.getenv(PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY)) and TEST_XPU and XPUTestBase not in test_bases: + test_bases.append(XPUTestBase) # Filter out the device types based on user inputs desired_device_type_test_bases = filter_desired_device_types(test_bases, except_for, only_for) if include_lazy: @@ -1170,15 +1170,20 @@ def efail_fn(slf, *args, **kwargs): class onlyOn: - def __init__(self, device_type): - self.device_type = device_type + def __init__(self, device_type: Union[str, List[str]]): + self.device_types = [] + if isinstance(device_type, str): + self.device_types.append(device_type) + else: + assert isinstance(device_type, list) + self.device_types = device_type def __call__(self, fn): @wraps(fn) def only_fn(slf, *args, **kwargs): - if self.device_type != slf.device_type: - reason = f"Only runs on {self.device_type}" + if slf.device_type not in self.device_types: + reason = f"Only runs on {self.device_types}" raise unittest.SkipTest(reason) return fn(slf, *args, **kwargs) @@ -1375,15 +1380,7 @@ def only_fn(self, *args, **kwargs): return only_fn def onlyCUDAAndXPU(fn): - @wraps(fn) - def only_fn(self, *args, **kwargs): - if self.device_type not in ('cuda', 'xpu'): - reason = f"onlyCUDAAndXPU: doesn't run on {self.device_type}" - raise unittest.SkipTest(reason) - - return fn(self, *args, **kwargs) - - return only_fn + return onlyOn(['cuda', 'xpu'])(fn) def disablecuDNN(fn): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 872fca9514c8f..8c4db5c3b33cd 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -37,7 +37,7 @@ TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, freeze_rng_state, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR + TEST_WITH_TORCHINDUCTOR, TEST_XPU ) import torch._refs as refs # noqa: F401 @@ -9515,6 +9515,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), + 
dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9533,6 +9534,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), + dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9584,6 +9586,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): backward_requires_result=True, dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), + dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9603,6 +9606,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), + dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9629,6 +9633,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types_and(torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half), + dtypesIfXPU=floating_and_complex_types_and(torch.half), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9712,6 +9717,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_types_and(torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.half), + dtypesIfXPU=floating_types_and(torch.half), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9741,6 +9747,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=all_types_and_complex_and(torch.bfloat16, torch.half), dtypesIfCUDA=all_types_and_complex_and(torch.bfloat16, torch.half, torch.bool), + dtypesIfXPU=all_types_and_complex_and(torch.bfloat16, torch.half, torch.bool), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9756,6 +9763,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=all_types_and_complex_and(torch.bfloat16, torch.half), dtypesIfCUDA=all_types_and_complex_and(torch.bfloat16, torch.half), + dtypesIfXPU=all_types_and_complex_and(torch.bfloat16, torch.half), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -10455,8 +10463,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_sparse_csc=True, supports_sparse_bsr=True, supports_sparse_bsc=True, - supports_forward_ad=True, - enable_skipped_device=('xpu',)), + supports_forward_ad=True), # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) UnaryUfuncInfo('acos', aliases=('arccos', ), @@ -10564,9 +10571,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestBinaryUfuncs', 'test_reference_numerics_extremal_values', dtypes=(torch.complex64, torch.complex128)), - ), - 
enable_skipped_device=('xpu',), - ), + )), OpInfo('item', op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.item, inp, *args, **kwargs), ref=np.ndarray.item, @@ -10588,9 +10593,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake_autocast'), # Booleans mismatch: AssertionError: False is not true DecorateInfo(unittest.expectedFailure, 'TestFakeTensor', 'test_fake'), - ), - enable_skipped_device=('xpu',), - ), + )), OpInfo('arange', dtypes=all_types_and(torch.bfloat16, torch.float16), supports_out=True, @@ -10626,8 +10629,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # UserWarning not triggered : Resized a non-empty tensor but did not warn about it. DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), - ), - enable_skipped_device=('xpu',)), + )), OpInfo('cauchy', op=lambda inp, *args, **kwargs: wrapper_set_seed(torch.Tensor.cauchy_, inp, *args, **kwargs), inplace_variant=torch.Tensor.cauchy_, @@ -10817,8 +10819,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_dispatched_to_lazy'), # test error disabled since rhs non-tensor python scalar is supported DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_errors'), - ), - enable_skipped_device=('xpu',),), + )), BinaryUfuncInfo('clamp_min', ref=_clamp_min_numpy, dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), @@ -10836,8 +10837,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_dispatched_to_lazy'), # test error disabled since rhs non-tensor python scalar is supported DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_errors'), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('mul', aliases=('multiply',), dtypes=all_types_and_complex_and(torch.chalf, torch.float16, torch.bfloat16, torch.bool), @@ -10850,9 +10850,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_sparse_csr_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_csr), sample_inputs_sparse_csc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_csc), sample_inputs_sparse_bsr_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsr), - sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsc), - enable_skipped_device=('xpu',)), - + sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_mul, layout=torch.sparse_bsc)), BinaryUfuncInfo('sub', # NumPy has no builtin reference for the alpha kwarg, but it is easy enough to emulate ref=lambda input, other, *, alpha=1: np.subtract(input, np.multiply(alpha, other)), @@ -10888,8 +10886,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestBinaryUfuncs', 'test_reference_numerics_small_values', dtypes=(torch.uint8,)), - ), - enable_skipped_device=('xpu',)), + )), OpInfo('addmm', # This addmm OpInfo is for when alpha and beta are not both equal to 1. 
# alpha=beta=1 is tested in the following opinfo, because that special case will @@ -11331,8 +11328,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.bitwise_not, dtypes=integral_types_and(torch.bool), operator_variant=operator.invert, - supports_autograd=False, - enable_skipped_device=('xpu',)), + supports_autograd=False), BinaryUfuncInfo('bitwise_left_shift', op=torch.bitwise_left_shift, dtypes=integral_types(), @@ -11467,8 +11463,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # (NumPy reference needs to be extended with memory_format) DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref'), DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_numpy_ref_mps'), - ), - enable_skipped_device=('xpu',)), + ),), OpInfo('contiguous', op=lambda x, *args, **kwargs: x.contiguous(*args, **kwargs), dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16, torch.chalf), @@ -11510,8 +11505,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestNNCOpInfo', 'test_nnc_correctness', dtypes=(torch.bool,)), - ), - enable_skipped_device=('xpu',)), + )), UnaryUfuncInfo('positive', ref=np.positive, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), @@ -11575,7 +11569,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_view_as_real, test_conjugated_samples=False, - enable_skipped_device=('xpu',), ), OpInfo('view_as_complex', dtypes=floating_types_and(torch.half), @@ -11591,8 +11584,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.skip("Skipped!"), 'TestNNCOpInfo', 'test_nnc_correctness', dtypes=(torch.half,)), # RuntimeError: view size is not compatible with input tensor's size and stride DecorateInfo(unittest.expectedFailure, "TestMeta", "test_dispatch_symbolic_meta_outplace_all_strides"), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('complex', dtypes=floating_types_and(torch.half), supports_forward_ad=True, @@ -11631,7 +11623,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.cos, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), - dtypesIfXPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), assert_autodiffed=True, handles_large_floats=False, supports_forward_ad=True, @@ -11655,8 +11646,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs', 'test_reference_numerics_large', device_type='cuda', dtypes=(torch.chalf,), active_if=IS_WINDOWS), - ), - enable_skipped_device=('xpu',)), + )), UnaryUfuncInfo('cosh', ref=np_unary_ufunc_integer_promotion_wrapper(np.cosh), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), @@ -11731,8 +11721,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # cumsum does not handle correctly out= dtypes DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'), ), - sample_inputs_func=sample_inputs_cumulative_ops, - enable_skipped_device=('xpu',)), + sample_inputs_func=sample_inputs_cumulative_ops), OpInfo('cumprod', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), supports_forward_ad=True, @@ -11804,9 +11793,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_fwgrad_bwgrad=True, supports_two_python_scalars=True, assert_autodiffed=True, - 
rhs_make_tensor_kwargs=dict(exclude_zero=True), - enable_skipped_device=('xpu',),), - + rhs_make_tensor_kwargs=dict(exclude_zero=True),), BinaryUfuncInfo('div', aliases=('divide',), variant_test_name='trunc_rounding', @@ -11826,8 +11813,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): skips=( # RuntimeError: MALFORMED INPUT: Unhandled node kind (in computeValue): aten::div DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_working'), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('div', aliases=('divide',), variant_test_name='floor_rounding', @@ -11847,8 +11833,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): skips=( # RuntimeError: MALFORMED INPUT: Unhandled node kind (in computeValue): aten::div DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_working'), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('true_divide', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), @@ -11949,8 +11934,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): supports_autograd=False, sample_inputs_func=sample_inputs_comparison_ops, skips=( - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('fmax', op=torch.fmax, dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), @@ -11995,12 +11979,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.skip("Skipped!"), 'TestBinaryUfuncs', 'test_reference_numerics_small_values', dtypes=(torch.uint8,)), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('remainder', ref=np.remainder, dtypes=all_types_and(torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfXPU=all_types_and(torch.float16, torch.bfloat16), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_forward_ad=True, @@ -13033,8 +13017,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # RuntimeError: "bitwise_and_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('bitwise_or', ref=np.bitwise_or, dtypes=integral_types_and(torch.bool), @@ -13048,8 +13031,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('bitwise_xor', ref=np.bitwise_xor, dtypes=integral_types_and(torch.bool), @@ -13063,8 +13045,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - ), - enable_skipped_device=('xpu',)), + )), BinaryUfuncInfo('heaviside', ref=lambda a, b: ( # necessary because np.heaviside incorrectly returns float64 when passed args of dtype int64 @@ -13098,9 +13079,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_reference_numerics_small_values', - dtypes=(torch.int8,)),), - enable_skipped_device=('xpu',) - ), + dtypes=(torch.int8,)),)), BinaryUfuncInfo('isclose', ref=np.isclose, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), @@ -13289,6 +13268,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_inplace'), DecorateInfo(unittest.expectedFailure, 'TestMeta', 
'test_dispatch_symbolic_meta_inplace'), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_inplace_all_strides'), + DecorateInfo(unittest.skip("Not support XPU"), 'TestCompositeCompliance', 'test_operator', device_type='xpu', dtypes=None), )), OpInfo('as_strided_scatter', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), @@ -13886,8 +13866,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # RuntimeError: Cannot insert a Tensor that requires grad as a constant. # Consider making it a parameter or input, or detaching the gradient DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)), - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', - active_if=TEST_WITH_ROCM) ], sample_inputs_func=sample_inputs_instance_norm, supports_expanded_weight=True,), @@ -14036,6 +14014,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): aten_name='im2col', dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypesIfXPU=floating_and_complex_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_nn_unfold, # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, @@ -15208,10 +15187,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'), DecorateInfo(unittest.skip('Skipped!'), 'TestNNCOpInfo', 'test_nnc_correctness', device_type='cpu', dtypes=(torch.bfloat16, torch.float16)), - # Trying to use forward AD with miopen_batch_norm that does not support it - # because it has not been implemented yet. - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', - device_type="cuda", active_if=TEST_WITH_ROCM), DecorateInfo(toleranceOverride({torch.float32: tol(atol=5e-05, rtol=1e-05)}), 'TestCompositeCompliance', 'test_forward_ad', device_type="cpu"), )), @@ -15415,8 +15390,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # May not replicate in CI DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out'), DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), - ), - enable_skipped_device=('xpu',)), + )), UnaryUfuncInfo('nn.functional.relu6', aten_name="relu6", dtypes=all_types_and(torch.half, torch.bfloat16), @@ -15586,12 +15560,14 @@ def reference_flatten(input, start_dim=0, end_dim=-1): BinaryUfuncInfo('pow', dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), + dtypesIfXPU=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), ref=np.power, # Due to AVX2 currently not being fully supported for Float16, log_vml_cpu can't be enabled # for Float16, causing this test to fail. pow's autograd for Float16 is thus currently # unsupported on CPU. 
backward_dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), backward_dtypesIfCUDA=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), + backward_dtypesIfXPU=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_inplace_autograd=False, @@ -16393,6 +16369,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): domain=(0, None), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfXPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), decorators=(precisionOverride({torch.half: 5e-2}),), assert_autodiffed=True, supports_forward_ad=True, @@ -16871,6 +16848,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): OpInfo('gather', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_gather, gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, supports_forward_ad=True, @@ -16905,6 +16883,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): OpInfo('index_select', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), + backward_dtypesIfXPU=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_index, reference_inputs_func=partial(sample_inputs_index, reference=True), error_inputs_func=error_inputs_index_select, @@ -17259,8 +17238,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_sparse_bsr_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_bsr), sample_inputs_sparse_bsc_func=partial(sample_inputs_sparse_like_fns, layout=torch.sparse_bsc), skips=( - ), - enable_skipped_device=('xpu',)), + )), OpInfo('ones_like', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), supports_out=False, @@ -17437,8 +17415,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # UserWarning not triggered : Resized a non-empty tensor but did not warn about it. 
DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), - ), - enable_skipped_device=('xpu',)), + )), OpInfo('full', op=torch.full, supports_autograd=False, @@ -17603,8 +17580,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.skip("Expected: empty is not comparable"), 'TestCommon', 'test_complex_half_reference_testing'), DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'), - ), - enable_skipped_device=('xpu',)), + )), OpInfo('eye', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_eye, @@ -17725,6 +17701,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_variant=None, dtypes=floating_types_and(torch.bfloat16, torch.half), dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.half), + dtypesIfXPU=floating_types_and(torch.bfloat16, torch.half), supports_out=True, sample_inputs_func=sample_inputs_normal_tensor_first, skips=( @@ -17754,6 +17731,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_variant=None, dtypes=floating_types_and(torch.bfloat16, torch.half), dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.half), + dtypesIfXPU=floating_types_and(torch.bfloat16, torch.half), supports_out=True, sample_inputs_func=sample_inputs_normal_tensor_second, skips=( @@ -17802,8 +17780,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out'), # UserWarning not triggered : Resized a non-empty tensor but did not warn about it. DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), - DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu')), - enable_skipped_device=('xpu',)), + DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu'))), OpInfo('scatter_add', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_scatter_add, @@ -18052,8 +18029,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): sample_inputs_func=sample_repeat_tile, skips=( DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), - ), - enable_skipped_device=('xpu',)), + )), OpInfo('squeeze', ref=_squeeze_ref, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), @@ -18102,8 +18078,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'), DecorateInfo(unittest.skip("No fill_ op"), 'TestCudaFuserOpInfo'), DecorateInfo(unittest.skip("No fill_ op"), 'TestNNCOpInfo'), - ), - enable_skipped_device=('xpu',)), + )), OpInfo('resize_', op=lambda x, shape: x.clone().resize_(shape), method_variant=None, @@ -18120,8 +18095,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), DecorateInfo(unittest.skip("Allowed exception"), 'TestCompositeCompliance', 'test_operator'), ), - sample_inputs_func=sample_inputs_resize_ops, - enable_skipped_device=('xpu',)), + sample_inputs_func=sample_inputs_resize_ops), OpInfo('resize_as_', op=lambda x, other: torch.resize_as_(x.clone(), other), method_variant=None, @@ -18134,8 +18108,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_dtypes'), DecorateInfo(unittest.skip('Allowed 
exemption'), 'TestCompositeCompliance', 'test_operator'), ), - sample_inputs_func=sample_inputs_resize_ops, - enable_skipped_device=('xpu',)), + sample_inputs_func=sample_inputs_resize_ops), OpInfo('take_along_dim', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), @@ -18949,6 +18922,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): aten_name="native_dropout_backward", dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfXPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, sample_inputs_func=sample_inputs_dropout_backward, skips=( @@ -19244,7 +19218,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # FIXME: uint8 input returns uint8 instead of bool DecorateInfo(unittest.expectedFailure, 'TestReductions', 'test_result_dtype', dtypes=[torch.uint8]), ), - enable_skipped_device=('xpu',), ), ReductionOpInfo( 'any', @@ -19523,6 +19496,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): promotes_int_to_int64=True, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), + dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), ref=reference_reduction_numpy(np.sum), error_inputs_sparse_func=error_inputs_sparse_reduction_sum, sample_inputs_sparse_coo_func=partial(sample_inputs_sparse_reduction_sum, layout=torch.sparse_coo), @@ -19934,6 +19908,58 @@ def reference_flatten(input, start_dim=0, end_dim=-1): op_db += opinfo.definitions.op_db +# def enable_skipped_device(op_db_list: List[OpInfo]): +# if TEST_XPU: +# import os, yaml +# device = 'xpu' +# op_db_dict = {} + +# xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" +# if os.path.exists(xpu_op_db): +# with open(xpu_op_db) as stream: +# try: +# op_db_dict = yaml.safe_load(stream) +# except yaml.YAMLError: +# print("Error in loading xpu_op_db.yaml.") + +# for op in op_db_list: +# if not op_db_dict or op.name not in op_db_dict['supported']: +# if hasattr(op, "torch_opinfo"): +# # import pdb +# # pdb.set_trace() +# torch_opinfo = getattr(op, "torch_opinfo") +# if hasattr(torch_opinfo, 'name') and torch_opinfo.name in op_db_dict['supported']: +# continue + +# if op.skips is not None: +# op.skips = (*op.skips, DecorateInfo(unittest.skip, device_type=device, dtypes=None)) +# op.decorators = (*op.decorators, DecorateInfo(unittest.skip, device_type=device, dtypes=None)) +# else: +# op.skips = (DecorateInfo(unittest.skip, device_type=device, dtypes=None)) +# op.decorators = (DecorateInfo(unittest.skip, device_type=device, dtypes=None)) +# return op_db_list + +#op_db = enable_skipped_device(op_db) + +# import os, yaml +# xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" +# if os.path.exists(xpu_op_db): +# with open(xpu_op_db) as stream: +# try: +# op_db_dict = yaml.safe_load(stream) +# except yaml.YAMLError: +# print("Error in loading xpu_op_db.yaml.") + +# for op in op_db: +# if op.name not in op_db_dict['supported']: +# if op.name == "__rpow__": +# import pdb +# pdb.set_trace() +# print("***", op.name, op.decorators) + + + + # Separate registry for experimental Python 
Reference OpInfos. python_ref_db = [ # @@ -22906,6 +22932,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ), ] python_ref_db += opinfo.definitions.python_ref_db +#python_ref_db = enable_skipped_device(python_ref_db) # Common operator groupings ops_and_refs = op_db + python_ref_db diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 6e6e1c596fd48..20c1dde4dbbff 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -35,6 +35,7 @@ import types import unittest import warnings +import yaml from collections.abc import Mapping, Sequence from contextlib import closing, contextmanager from copy import deepcopy @@ -5051,3 +5052,16 @@ def repl_frame(m): s = re.sub(r"Cannot export model.+\n\n", "", s) s = re.sub(r" +$", "", s, flags=re.M) return s + +def enable_skipped_op_dict(): + enable_op_dict = {} + if TEST_XPU: + device = 'xpu' + xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" + if os.path.exists(xpu_op_db): + with open(xpu_op_db) as stream: + try: + enable_op_dict = yaml.safe_load(stream) + except yaml.YAMLError: + print("Error in loading xpu_op_db.yaml.") + return enable_op_dict \ No newline at end of file diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 5d1ac019c8b4b..419c338611ca6 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -33,6 +33,8 @@ TEST_WITH_ROCM, torch_to_numpy_dtype_dict, TrackedInputIter, + TEST_XPU, + enable_skipped_op_dict, ) from torch.testing._internal.opinfo import utils @@ -686,9 +688,6 @@ class OpInfo: # skip the test for a device skip_device: Tuple = tuple() - # enable the test for a device - enable_skipped_device: Tuple = tuple() - # decorators to apply to generated tests decorators: Tuple = tuple() @@ -900,6 +899,15 @@ class OpInfo: is_factory_function: bool = False + def enable_skipped_device(self): + op_db_dict = enable_skipped_op_dict() + if TEST_XPU and (not op_db_dict or self.name not in op_db_dict['supported']): + if self.skips is not None: + self.skips = (*self.skips, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + else: + self.skips = (DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + + def __post_init__(self): self._original_opinfo_args = asdict(self).copy() @@ -1024,13 +1032,7 @@ def __post_init__(self): else: self.inplace_operator_variant = None - # Skip XPU test by default - self.skip_device = ('xpu',) - for device in (set(self.skip_device).difference(set(self.enable_skipped_device))): - if self.skips is not None: - self.skips = (*self.skips, DecorateInfo(unittest.skip, device_type=device, dtypes=None)) - else: - self.skips = (DecorateInfo(unittest.skip, device_type=device, dtypes=None)) + self.enable_skipped_device() self.decorators = (*self.decorators, *self.skips) From d7af50b8db1a80df966b89caf8d38995e95362ef Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 26 May 2024 02:36:21 -0700 Subject: [PATCH 07/37] remove unused skip_device field in opInfo --- torch/testing/_internal/opinfo/core.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index 419c338611ca6..d760feac3ed10 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -685,9 +685,6 @@ class OpInfo: # information about which tests to 
skip skips: Tuple = tuple() - # skip the test for a device - skip_device: Tuple = tuple() - # decorators to apply to generated tests decorators: Tuple = tuple() From d99c6a9f7a1d08cd1fb800f60c5ff5fba7580a89 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 28 May 2024 23:32:58 -0700 Subject: [PATCH 08/37] code clean up --- test/test_ops.py | 1 - torch/testing/_internal/common_device_type.py | 11 +--- .../_internal/common_methods_invocations.py | 59 ++----------------- torch/testing/_internal/common_utils.py | 2 - torch/testing/_internal/opinfo/core.py | 6 -- 5 files changed, 8 insertions(+), 71 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index ecefb53195788..3f4684bf85610 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -166,7 +166,6 @@ def tearDownClass(cls): @onlyCUDAAndXPU @deviceCountAtLeast(2) @ops(op_db, allowed_dtypes=(torch.float32, torch.long)) - #@ops(_xpu_computation_ops, dtypes=any_common_cpu_device_one) def test_multiple_devices(self, devices, dtype, op): for cuda_device_str in devices: cuda_device = torch.device(cuda_device_str) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index e4455a0b912ed..1ed3ccd672ecd 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -402,6 +402,7 @@ def instantiate_test_helper(cls, name, *, test, param_kwargs=None, decorator_fn= if hasattr(test, 'num_required_devices'): device_arg = cls.get_all_devices() _update_param_kwargs(param_kwargs, 'device', device_arg) + # Apply decorators based on param kwargs. for decorator in decorator_fn(param_kwargs): test = decorator(test) @@ -436,8 +437,7 @@ def instantiated_test(self, param_kwargs=param_kwargs): return result assert not hasattr(cls, name), f"Redefinition of test {name}" - #import pdb - #pdb.set_trace() + setattr(cls, name, instantiated_test) def default_parametrize_fn(test, generic_cls, device_cls): @@ -449,8 +449,7 @@ def default_parametrize_fn(test, generic_cls, device_cls): # If one of the @dtypes* decorators is present, also parametrize over the dtypes set by it. dtypes = cls._get_dtypes(test) - #import pdb - #pdb.set_trace() + if dtypes is not None: def dtype_parametrize_fn(test, generic_cls, device_cls, dtypes=dtypes): @@ -476,7 +475,6 @@ def dtype_parametrize_fn(test, generic_cls, device_cls, dtypes=dtypes): dtype_kwarg = param_kwargs['dtypes'] if 'dtypes' in param_kwargs else param_kwargs['dtype'] test_name = f'{name}{test_suffix}{device_suffix}{_dtype_test_suffix(dtype_kwarg)}' - print(test_name) instantiate_test_helper(cls=cls, name=test_name, test=test, param_kwargs=param_kwargs, decorator_fn=decorator_fn) @@ -1004,7 +1002,6 @@ def test_wrapper(*args, **kwargs): decorator_fn = partial(op.get_decorators, generic_cls.__name__, test.__name__, device_cls.device_type, dtype) - #print("create test {} op={} dtype={} param_kwargs={} decorator_fn={}".format(test_name, op, dtype, param_kwargs, decorator_fn)) yield (test_wrapper, test_name, param_kwargs, decorator_fn) except Exception as ex: # Provides an error message for debugging before rethrowing the exception @@ -1605,5 +1602,3 @@ def skipPRIVATEUSE1(fn): # This should probably enumerate all available device type test base classes. 
def get_all_device_types() -> List[str]: return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] - - diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8c4db5c3b33cd..9bded15ad0167 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -13268,7 +13268,7 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_inplace'), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_inplace'), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_inplace_all_strides'), - DecorateInfo(unittest.skip("Not support XPU"), 'TestCompositeCompliance', 'test_operator', device_type='xpu', dtypes=None), + DecorateInfo(unittest.skip("No XPU backend support in this operation"), 'TestCompositeCompliance', 'test_operator', device_type='xpu', dtypes=None), )), OpInfo('as_strided_scatter', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), @@ -13866,6 +13866,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): # RuntimeError: Cannot insert a Tensor that requires grad as a constant. # Consider making it a parameter or input, or detaching the gradient DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.float32,)), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', + active_if=TEST_WITH_ROCM) ], sample_inputs_func=sample_inputs_instance_norm, supports_expanded_weight=True,), @@ -15187,6 +15189,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'), DecorateInfo(unittest.skip('Skipped!'), 'TestNNCOpInfo', 'test_nnc_correctness', device_type='cpu', dtypes=(torch.bfloat16, torch.float16)), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', + device_type="cuda", active_if=TEST_WITH_ROCM), DecorateInfo(toleranceOverride({torch.float32: tol(atol=5e-05, rtol=1e-05)}), 'TestCompositeCompliance', 'test_forward_ad', device_type="cpu"), )), @@ -19908,58 +19912,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): op_db += opinfo.definitions.op_db -# def enable_skipped_device(op_db_list: List[OpInfo]): -# if TEST_XPU: -# import os, yaml -# device = 'xpu' -# op_db_dict = {} - -# xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" -# if os.path.exists(xpu_op_db): -# with open(xpu_op_db) as stream: -# try: -# op_db_dict = yaml.safe_load(stream) -# except yaml.YAMLError: -# print("Error in loading xpu_op_db.yaml.") - -# for op in op_db_list: -# if not op_db_dict or op.name not in op_db_dict['supported']: -# if hasattr(op, "torch_opinfo"): -# # import pdb -# # pdb.set_trace() -# torch_opinfo = getattr(op, "torch_opinfo") -# if hasattr(torch_opinfo, 'name') and torch_opinfo.name in op_db_dict['supported']: -# continue - -# if op.skips is not None: -# op.skips = (*op.skips, DecorateInfo(unittest.skip, device_type=device, dtypes=None)) -# op.decorators = (*op.decorators, DecorateInfo(unittest.skip, device_type=device, dtypes=None)) -# else: -# op.skips = (DecorateInfo(unittest.skip, device_type=device, dtypes=None)) -# op.decorators = (DecorateInfo(unittest.skip, device_type=device, dtypes=None)) -# return 
op_db_list - -#op_db = enable_skipped_device(op_db) - -# import os, yaml -# xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" -# if os.path.exists(xpu_op_db): -# with open(xpu_op_db) as stream: -# try: -# op_db_dict = yaml.safe_load(stream) -# except yaml.YAMLError: -# print("Error in loading xpu_op_db.yaml.") - -# for op in op_db: -# if op.name not in op_db_dict['supported']: -# if op.name == "__rpow__": -# import pdb -# pdb.set_trace() -# print("***", op.name, op.decorators) - - - - # Separate registry for experimental Python Reference OpInfos. python_ref_db = [ # @@ -22932,7 +22884,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ), ] python_ref_db += opinfo.definitions.python_ref_db -#python_ref_db = enable_skipped_device(python_ref_db) # Common operator groupings ops_and_refs = op_db + python_ref_db diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 20c1dde4dbbff..c44b883aaf0c1 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -390,7 +390,6 @@ def composite_fn(test, generic_cls, device_cls, old_parametrize_fn=old_parametrize_fn, new_parametrize_fn=new_parametrize_fn): old_tests = list(old_parametrize_fn(test, generic_cls, device_cls)) - for (old_test, old_test_name, old_param_kwargs, old_dec_fn) in old_tests: for (new_test, new_test_name, new_param_kwargs, new_dec_fn) in \ new_parametrize_fn(old_test, generic_cls, device_cls): @@ -405,7 +404,6 @@ def composite_fn(test, generic_cls, device_cls, old_test_name) def merged_decorator_fn(param_kwargs, old_dec_fn=old_dec_fn, new_dec_fn=new_dec_fn): - return list(old_dec_fn(param_kwargs)) + list(new_dec_fn(param_kwargs)) yield (new_test, merged_test_name, full_param_kwargs, merged_decorator_fn) diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index d760feac3ed10..c9ceec66df913 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -98,14 +98,12 @@ def __init__( self.dtypes = dtypes self.active_if = active_if - # Validate dtypes if self.dtypes is not None: for dtype in self.dtypes: assert isinstance(dtype, torch.dtype) def is_active(self, cls_name, test_name, device_type, dtype, param_kwargs): - return ( self.active_if and (self.cls_name is None or self.cls_name == cls_name) @@ -1165,9 +1163,6 @@ def __post_init__(self): self.aliases = tuple(AliasInfo(a) for a in self.aliases) # type: ignore[assignment] else: self.aliases = () - - - def __call__(self, *args, **kwargs): """Calls the function variant of the operator.""" @@ -1372,7 +1367,6 @@ def sample_inputs_sparse_bsc(self, device, dtype, requires_grad=False, **kwargs) def get_decorators(self, test_class, test_name, device, dtype, param_kwargs): """Returns the decorators targeting the given test.""" result = [] - for decorator in self.decorators: if isinstance(decorator, DecorateInfo): if decorator.is_active( From 0a227700fb36aebba5b3735fa76fc429017c83a2 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 28 May 2024 23:39:30 -0700 Subject: [PATCH 09/37] further cleanup --- torch/testing/_internal/common_methods_invocations.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 9bded15ad0167..9b764dd02dafe 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ 
b/torch/testing/_internal/common_methods_invocations.py @@ -15189,6 +15189,8 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestNNCOpInfo', 'test_nnc_correctness'), DecorateInfo(unittest.skip('Skipped!'), 'TestNNCOpInfo', 'test_nnc_correctness', device_type='cpu', dtypes=(torch.bfloat16, torch.float16)), + # Trying to use forward AD with miopen_batch_norm that does not support it + # because it has not been implemented yet. DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_forward_ad', device_type="cuda", active_if=TEST_WITH_ROCM), DecorateInfo(toleranceOverride({torch.float32: tol(atol=5e-05, rtol=1e-05)}), From 47055814b3353a5ae974cf86a5fff1c60a835d41 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Thu, 30 May 2024 19:54:15 -0700 Subject: [PATCH 10/37] instead of do unittest.skip for xpu unsupported op in OpInfo, move it to individual test file. added allow_xpu for xpu supported tests. --- test/test_ops.py | 18 +++-- test/xpu/xpu_op_db.yaml | 48 ++++++++++--- torch/testing/_internal/common_device_type.py | 11 +-- .../_internal/common_methods_invocations.py | 69 ++++++++++++------- torch/testing/_internal/common_utils.py | 2 +- torch/testing/_internal/opinfo/core.py | 37 +++++----- 6 files changed, 124 insertions(+), 61 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 3f4684bf85610..9430977d0001c 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -56,6 +56,7 @@ SpectralFuncInfo, UnaryUfuncInfo, xfail, + enable_skipped_device, ) from torch.testing._internal.common_utils import ( @@ -87,6 +88,10 @@ assert torch.get_default_dtype() == torch.float32 + +enable_skipped_device(op_db) +enable_skipped_device(python_ref_db) + # variant testing is only done with torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio _variant_ops = partial( @@ -109,7 +114,7 @@ ) - +my_op_list = [op for op in python_ref_db if op.name in ['_refs.cos',]] def reduction_dtype_filter(op): if ( @@ -2677,12 +2682,13 @@ def test_strided_layout(self, device, dtype, op): self.assertEqual(strided_result.layout, torch.strided) -instantiate_device_type_tests(TestCommon, globals()) -instantiate_device_type_tests(TestCompositeCompliance, globals()) -instantiate_device_type_tests(TestMathBits, globals()) + +instantiate_device_type_tests(TestCommon, globals(), allow_xpu=True) +instantiate_device_type_tests(TestCompositeCompliance, globals(), allow_xpu=True) +instantiate_device_type_tests(TestMathBits, globals(), allow_xpu=True) instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") -instantiate_device_type_tests(TestFakeTensor, globals()) -instantiate_device_type_tests(TestTags, globals()) +instantiate_device_type_tests(TestFakeTensor, globals(), allow_xpu=True) +instantiate_device_type_tests(TestTags, globals(), allow_xpu=True) if __name__ == "__main__": TestCase._default_dtype_check_enabled = True diff --git a/test/xpu/xpu_op_db.yaml b/test/xpu/xpu_op_db.yaml index c0833b0cd30ea..a9fddfeab1a89 100644 --- a/test/xpu/xpu_op_db.yaml +++ b/test/xpu/xpu_op_db.yaml @@ -24,7 +24,11 @@ supported: - clamp_min - clone - copy - - cos + - cos: + supported: + - complex32 + supported_backward: + - complex32 - cumsum - empty - eq @@ -39,7 +43,11 @@ supported: - index_select - isnan - le - - log + - log: + supported: + - complex32 + supported_backward: + - complex32 - lt - masked_fill - maximum @@ -52,17 +60,41 @@ supported: - nn.functional.threshold - nonzero - normal - - pow + - pow: + 
supported: + - complex32 + supported_backward: + - complex32 - reciprocal - rsub - relu - remainder - reshape - - rsqrt - - sin - - sqrt - - sum - - tanh + - rsqrt: + supported: + - complex32 + supported_backward: + - complex32 + - sin: + supported: + - complex32 + supported_backward: + - complex32 + - sqrt: + supported: + - complex32 + supported_backward: + - complex32 + - sum: + supported: + - complex32 + supported_backward: + - complex32 + - tanh: + supported: + - complex32 + supported_backward: + - complex32 - unfold - uniform - view diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 1ed3ccd672ecd..b7e669143bf83 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -695,12 +695,12 @@ def filter_desired_device_types(device_type_test_bases, except_for=None, only_fo PYTORCH_TESTING_DEVICE_FOR_CUSTOM_KEY = 'PYTORCH_TESTING_DEVICE_FOR_CUSTOM' -def get_desired_device_type_test_bases(except_for=None, only_for=None, include_lazy=False, allow_mps=False): +def get_desired_device_type_test_bases(except_for=None, only_for=None, include_lazy=False, allow_mps=False, allow_xpu=False): # allow callers to specifically opt tests into being tested on MPS, similar to `include_lazy` test_bases = device_type_test_bases.copy() if allow_mps and TEST_MPS and MPSTestBase not in test_bases: test_bases.append(MPSTestBase) - if (only_for == 'xpu' or 'xpu' in os.getenv(PYTORCH_TESTING_DEVICE_ONLY_FOR_KEY)) and TEST_XPU and XPUTestBase not in test_bases: + if (allow_xpu or only_for == 'xpu') and TEST_XPU and XPUTestBase not in test_bases: test_bases.append(XPUTestBase) # Filter out the device types based on user inputs desired_device_type_test_bases = filter_desired_device_types(test_bases, except_for, only_for) @@ -745,7 +745,7 @@ def split_if_not_empty(x: str): # device-specific tests (NB: this supports additional @parametrize usage). # # See note "Writing Test Templates" -def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False, allow_mps=False): +def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False, allow_mps=False, allow_xpu=False): # Removes the generic test class from its enclosing scope so its tests # are not discoverable. 
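# A minimal usage sketch of the allow_xpu flag added above; TestFoo/test_add are
# made-up names, not part of this series. The real opt-ins are the allow_xpu=True
# calls added to test/test_ops.py.
import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, run_tests

class TestFoo(TestCase):
    def test_add(self, device):
        # 'device' is supplied per instantiated class (cpu, cuda, xpu, ...).
        x = torch.ones(2, device=device)
        self.assertEqual((x + x).sum().item(), 4.0)

# With TEST_XPU set this also generates TestFooXPU, since the instantiated class name
# is generic_test_class.__name__ + base.device_type.upper(); the generic TestFoo is
# removed from the module scope so it is not collected directly.
instantiate_device_type_tests(TestFoo, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()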
del scope[generic_test_class.__name__] @@ -765,7 +765,7 @@ def instantiate_device_type_tests(generic_test_class, scope, except_for=None, on generic_tests = [x for x in generic_members if x.startswith('test')] # Creates device-specific test cases - for base in get_desired_device_type_test_bases(except_for, only_for, include_lazy, allow_mps): + for base in get_desired_device_type_test_bases(except_for, only_for, include_lazy, allow_mps, allow_xpu): class_name = generic_test_class.__name__ + base.device_type.upper() # type set to Any and suppressed due to unsupport runtime class: @@ -1097,6 +1097,9 @@ def _has_sufficient_memory(device, size): device = 'cuda:0' return torch.cuda.memory.mem_get_info(device)[0] >= size + if device == 'xpu': + raise unittest.SkipTest('TODO: Memory availability checks for XPU?') + if device == 'xla': raise unittest.SkipTest('TODO: Memory availability checks for XLA?') diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 9b764dd02dafe..2673316526537 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -37,7 +37,7 @@ TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, freeze_rng_state, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR, TEST_XPU + TEST_WITH_TORCHINDUCTOR, TEST_XPU, enable_skipped_op_dict, ) import torch._refs as refs # noqa: F401 @@ -9515,7 +9515,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), - dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9534,7 +9533,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), - dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9586,7 +9584,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): backward_requires_result=True, dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), - dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9606,7 +9603,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types(), dtypesIfCUDA=floating_and_complex_types_and(torch.half,), - dtypesIfXPU=floating_and_complex_types_and(torch.half,), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9633,7 +9629,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_and_complex_types_and(torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half), - dtypesIfXPU=floating_and_complex_types_and(torch.half), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9717,7 +9712,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): 
sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=floating_types_and(torch.bfloat16), dtypesIfCUDA=floating_types_and(torch.half), - dtypesIfXPU=floating_types_and(torch.half), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9747,7 +9741,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=all_types_and_complex_and(torch.bfloat16, torch.half), dtypesIfCUDA=all_types_and_complex_and(torch.bfloat16, torch.half, torch.bool), - dtypesIfXPU=all_types_and_complex_and(torch.bfloat16, torch.half, torch.bool), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -9763,7 +9756,6 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): sample_inputs_func=foreach_inputs_sample_func(1, False, False), dtypes=all_types_and_complex_and(torch.bfloat16, torch.half), dtypesIfCUDA=all_types_and_complex_and(torch.bfloat16, torch.half), - dtypesIfXPU=all_types_and_complex_and(torch.bfloat16, torch.half), supports_autograd=True, supports_inplace_autograd=True, supports_forward_ad=True, @@ -10429,7 +10421,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.abs, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), - dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), skips=( DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestBwdGradients', 'test_inplace_grad', dtypes=(torch.cdouble,)), @@ -11785,7 +11776,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): variant_test_name='no_rounding_mode', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), - dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, supports_forward_ad=True, @@ -11959,7 +11949,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.fmod, dtypes=all_types_and(torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), - dtypesIfXPU=all_types_and(torch.float16, torch.bfloat16), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_forward_ad=True, @@ -11984,7 +11973,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): ref=np.remainder, dtypes=all_types_and(torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), - dtypesIfXPU=all_types_and(torch.float16, torch.bfloat16), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_forward_ad=True, @@ -14016,7 +14004,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): aten_name='im2col', dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), - dtypesIfXPU=floating_and_complex_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_nn_unfold, # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, @@ -15566,14 +15553,12 @@ def reference_flatten(input, start_dim=0, end_dim=-1): BinaryUfuncInfo('pow', dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), 
dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), - dtypesIfXPU=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), ref=np.power, # Due to AVX2 currently not being fully supported for Float16, log_vml_cpu can't be enabled # for Float16, causing this test to fail. pow's autograd for Float16 is thus currently # unsupported on CPU. backward_dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), backward_dtypesIfCUDA=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), - backward_dtypesIfXPU=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_inplace_autograd=False, @@ -16375,7 +16360,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): domain=(0, None), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), - dtypesIfXPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), decorators=(precisionOverride({torch.half: 5e-2}),), assert_autodiffed=True, supports_forward_ad=True, @@ -16854,7 +16838,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): OpInfo('gather', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_gather, gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, supports_forward_ad=True, @@ -16889,7 +16872,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): OpInfo('index_select', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), - backward_dtypesIfXPU=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_index, reference_inputs_func=partial(sample_inputs_index, reference=True), error_inputs_func=error_inputs_index_select, @@ -17707,7 +17689,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_variant=None, dtypes=floating_types_and(torch.bfloat16, torch.half), dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.half), - dtypesIfXPU=floating_types_and(torch.bfloat16, torch.half), supports_out=True, sample_inputs_func=sample_inputs_normal_tensor_first, skips=( @@ -17737,7 +17718,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): inplace_variant=None, dtypes=floating_types_and(torch.bfloat16, torch.half), dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.half), - dtypesIfXPU=floating_types_and(torch.bfloat16, torch.half), supports_out=True, sample_inputs_func=sample_inputs_normal_tensor_second, skips=( @@ -18928,7 +18908,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): aten_name="native_dropout_backward", dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), - dtypesIfXPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, sample_inputs_func=sample_inputs_dropout_backward, skips=( @@ -19502,7 +19481,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): promotes_int_to_int64=True, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and_complex_and(torch.bool, 
torch.float16, torch.bfloat16, torch.chalf), - dtypesIfXPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), ref=reference_reduction_numpy(np.sum), error_inputs_sparse_func=error_inputs_sparse_reduction_sum, sample_inputs_sparse_coo_func=partial(sample_inputs_sparse_reduction_sum, layout=torch.sparse_coo), @@ -22965,3 +22943,48 @@ def skipOps(test_case_name, base_test_name, to_skip): def wrapped(fn): return fn return wrapped + +def enable_skipped_device(op_db_list: List[OpInfo]): + if TEST_XPU: + # Get the supported op and dtypes from yaml file. + op_db_dict = enable_skipped_op_dict() + supported_op_list = [list(op_dict.keys())[0] if type(op_dict) is dict else op_dict for op_dict in op_db_dict['supported']] + + for op in op_db_list: + # For refs ops get the name of the related torch_opinfo. + torch_opinfo = getattr(op, "torch_opinfo") if hasattr(op, "torch_opinfo") else None + name = torch_opinfo.name if torch_opinfo is not None else op.name + + if name not in supported_op_list: + # If the op is not supported add unittest.skip decorators. + if op.skips is not None: + op.skips = (*op.skips, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + op.decorators = (*op.decorators, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + else: + op.skips = (DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + op.decorators = (DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + else: + ind = supported_op_list.index(name) + + if type(op_db_dict['supported'][ind]) is dict and op_db_dict['supported'][ind][name] != None: + # If the op is supported check whether the supported dtypes is different with cuda + for _key in op_db_dict['supported'][ind][name]: + # Get the dtypes with difference + _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None for _dtype in op_db_dict['supported'][ind][name][_key]] + match _key: + case "unsupported": + op.dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) \ + if type(op.dtypesIfXPU) is set else _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) + case "unsupported_backward": + op.backward_dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) \ + if type(op.backward_dtypesIfXPU) is set else _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) + case "supported": + if type(op.dtypesIfXPU) is set: + op.dtypesIfXPU.update(_dtypes) + else: + op.dtypesIfXPU = _dispatch_dtypes((*op.dtypesIfXPU, *_dtypes)) + case "supported_backward": + if type(op.backward_dtypesIfXPU) is set: + op.backward_dtypesIfXPU.update(_dtypes) + else: + op.backward_dtypesIfXPU = _dispatch_dtypes((*op.backward_dtypesIfXPU, *_dtypes)) \ No newline at end of file diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index c44b883aaf0c1..95e6ddc513810 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -228,7 +228,7 @@ def maybe_load_json(filename): if os.getenv("DISABLED_TESTS_FILE", ""): disabled_tests_dict = maybe_load_json(os.getenv("DISABLED_TESTS_FILE", "")) -NATIVE_DEVICES = ('cpu', 'cuda', 'meta', 'xpu', torch._C._get_privateuse1_backend_name()) +NATIVE_DEVICES = ('cpu', 'cuda', 'xpu', 'meta', torch._C._get_privateuse1_backend_name()) check_names = ['orin', 'concord', 'galen', 'xavier', 'nano', 'jetson', 'tegra'] IS_JETSON = any(name in platform.platform() for name in check_names) diff --git a/torch/testing/_internal/opinfo/core.py 
b/torch/testing/_internal/opinfo/core.py index c9ceec66df913..24158b13ea102 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -34,7 +34,6 @@ torch_to_numpy_dtype_dict, TrackedInputIter, TEST_XPU, - enable_skipped_op_dict, ) from torch.testing._internal.opinfo import utils @@ -726,12 +725,12 @@ class OpInfo: # dtypes this function is expected to work with on CUDA dtypesIfCUDA: _dispatch_dtypes = None - # dtypes this function is expected to work with on XPU - dtypesIfXPU: _dispatch_dtypes = None - # dtypes this function is expected to work with on ROCM dtypesIfROCM: _dispatch_dtypes = None + # dtypes this function is expected to work with on XPU + dtypesIfXPU: _dispatch_dtypes = None + # backward dtypes this function is expected to work with backward_dtypes: _dispatch_dtypes = None @@ -894,13 +893,7 @@ class OpInfo: is_factory_function: bool = False - def enable_skipped_device(self): - op_db_dict = enable_skipped_op_dict() - if TEST_XPU and (not op_db_dict or self.name not in op_db_dict['supported']): - if self.skips is not None: - self.skips = (*self.skips, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) - else: - self.skips = (DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + def __post_init__(self): @@ -908,7 +901,12 @@ def __post_init__(self): assert self.dtypes is not None, f"OpInfo for {self.name} has no dtypes!" - dtypes_args = (self.dtypes, self.dtypesIfCUDA, self.dtypesIfROCM) + dtypes_args = ( + self.dtypes, + self.dtypesIfCUDA, + self.dtypesIfROCM, + self.dtypesIfXPU, + ) # Validates the dtypes are generated from the dispatch-related functions for dtype_list in dtypes_args: @@ -972,7 +970,8 @@ def __post_init__(self): if self.backward_dtypes is not None else self.dtypesIfXPU if self.dtypesIfXPU is not None - else self.dtypes + #else self.dtypes + else self.backward_dtypesIfCUDA ) ) @@ -986,16 +985,16 @@ def __post_init__(self): set(self.dtypesIfCUDA) if self.dtypesIfCUDA is not None else self.dtypes ) - self.dtypesIfXPU = ( - set(self.dtypesIfXPU) if self.dtypesIfXPU is not None else self.dtypes - ) - self.dtypesIfROCM = ( set(self.dtypesIfROCM) if self.dtypesIfROCM is not None else self.dtypesIfCUDA ) + self.dtypesIfXPU = ( + set(self.dtypesIfXPU) if self.dtypesIfXPU is not None else self.dtypesIfCUDA + ) + # NOTE: if the op is unspecified it is assumed to be under the torch namespace if not self.op: self.op = _getattr_qual(torch, self.name) @@ -1027,8 +1026,6 @@ def __post_init__(self): else: self.inplace_operator_variant = None - self.enable_skipped_device() - self.decorators = (*self.decorators, *self.skips) # Specifying sample inputs function without specifying the @@ -2674,6 +2671,7 @@ def __init__( dtypes=floating_types(), dtypesIfCUDA=None, dtypesIfROCM=None, + dtypesIfXPU=None, sample_inputs_func=None, **kwargs, ): @@ -2682,6 +2680,7 @@ def __init__( dtypes=dtypes, dtypesIfCUDA=dtypesIfCUDA, dtypesIfROCM=dtypesIfROCM, + dtypesIfXPU=dtypesIfXPU, sample_inputs_func=sample_inputs_func, **kwargs, ) From d9513ecfcfee7231922b329d0a7832673904871e Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Fri, 31 May 2024 00:31:38 -0700 Subject: [PATCH 11/37] refine function naming --- test/test_ops.py | 8 +++----- torch/testing/_internal/common_device_type.py | 2 -- .../_internal/common_methods_invocations.py | 15 +++++++-------- torch/testing/_internal/common_utils.py | 8 ++++---- torch/testing/_internal/opinfo/core.py | 7 +------ 5 files changed, 15 insertions(+), 25 deletions(-) diff --git a/test/test_ops.py 
b/test/test_ops.py index 9430977d0001c..96a624f9f13b0 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -56,7 +56,7 @@ SpectralFuncInfo, UnaryUfuncInfo, xfail, - enable_skipped_device, + enable_backend_test, ) from torch.testing._internal.common_utils import ( @@ -89,8 +89,8 @@ assert torch.get_default_dtype() == torch.float32 -enable_skipped_device(op_db) -enable_skipped_device(python_ref_db) +enable_backend_test(op_db) +enable_backend_test(python_ref_db) # variant testing is only done with torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio @@ -114,8 +114,6 @@ ) -my_op_list = [op for op in python_ref_db if op.name in ['_refs.cos',]] - def reduction_dtype_filter(op): if ( not isinstance(op, ReductionPythonRefInfo) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 5de4900723043..c4c13f9a6cb46 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -437,7 +437,6 @@ def instantiated_test(self, param_kwargs=param_kwargs): return result assert not hasattr(cls, name), f"Redefinition of test {name}" - setattr(cls, name, instantiated_test) def default_parametrize_fn(test, generic_cls, device_cls): @@ -449,7 +448,6 @@ def default_parametrize_fn(test, generic_cls, device_cls): # If one of the @dtypes* decorators is present, also parametrize over the dtypes set by it. dtypes = cls._get_dtypes(test) - if dtypes is not None: def dtype_parametrize_fn(test, generic_cls, device_cls, dtypes=dtypes): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 01331a14ba0db..71471148e017c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -37,7 +37,7 @@ TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR, TEST_XPU, enable_skipped_op_dict, + TEST_WITH_TORCHINDUCTOR, TEST_XPU, get_backend_op_dict, ) from torch.testing._utils import wrapper_set_seed @@ -14406,7 +14406,6 @@ def reference_flatten(input, start_dim=0, end_dim=-1): DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_inplace'), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_inplace'), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_inplace_all_strides'), - DecorateInfo(unittest.skip("No XPU backend support in this operation"), 'TestCompositeCompliance', 'test_operator', device_type='xpu', dtypes=None), )), OpInfo('as_strided_scatter', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), @@ -24116,11 +24115,11 @@ def wrapped(fn): return fn return wrapped -def enable_skipped_device(op_db_list: List[OpInfo]): +def enable_backend_test(op_db_list: List[OpInfo]): if TEST_XPU: # Get the supported op and dtypes from yaml file. - op_db_dict = enable_skipped_op_dict() - supported_op_list = [list(op_dict.keys())[0] if type(op_dict) is dict else op_dict for op_dict in op_db_dict['supported']] + backend_op_dict = get_backend_op_dict() + supported_op_list = [list(op_dict.keys())[0] if type(op_dict) is dict else op_dict for op_dict in backend_op_dict['supported']] for op in op_db_list: # For refs ops get the name of the related torch_opinfo. 
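# A standalone sketch of the normalization done by the list comprehension above:
# entries under 'supported' are either bare op names or single-key mappings carrying
# per-dtype overrides, and only the key is needed for the membership check. The YAML
# literal is a made-up excerpt mirroring test/xpu/xpu_op_db.yaml.
import yaml

doc = yaml.safe_load("""
supported:
  - add
  - div:
      unsupported:
        - float16
        - bfloat16
  - cos:
      supported:
        - complex32
""")
supported_op_list = [
    next(iter(entry)) if isinstance(entry, dict) else entry
    for entry in doc["supported"]
]
assert supported_op_list == ["add", "div", "cos"]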
@@ -24138,11 +24137,11 @@ def enable_skipped_device(op_db_list: List[OpInfo]): else: ind = supported_op_list.index(name) - if type(op_db_dict['supported'][ind]) is dict and op_db_dict['supported'][ind][name] != None: + if type(backend_op_dict['supported'][ind]) is dict and backend_op_dict['supported'][ind][name] != None: # If the op is supported check whether the supported dtypes is different with cuda - for _key in op_db_dict['supported'][ind][name]: + for _key in backend_op_dict['supported'][ind][name]: # Get the dtypes with difference - _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None for _dtype in op_db_dict['supported'][ind][name][_key]] + _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None for _dtype in backend_op_dict['supported'][ind][name][_key]] match _key: case "unsupported": op.dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) \ diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 1243cc33ab88d..6a4f33b4baa47 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -4998,15 +4998,15 @@ def repl_frame(m): s = re.sub(r" +$", "", s, flags=re.M) return s -def enable_skipped_op_dict(): - enable_op_dict = {} +def get_backend_op_dict(): + backend_op_dict = {} if TEST_XPU: device = 'xpu' xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" if os.path.exists(xpu_op_db): with open(xpu_op_db) as stream: try: - enable_op_dict = yaml.safe_load(stream) + backend_op_dict = yaml.safe_load(stream) except yaml.YAMLError: print("Error in loading xpu_op_db.yaml.") - return enable_op_dict \ No newline at end of file + return backend_op_dict \ No newline at end of file diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index eba7496367398..e745e1be1c67b 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -5,8 +5,7 @@ import math import operator import unittest -from dataclasses import InitVar, asdict, dataclass -from typing import Dict, Optional +from dataclasses import asdict, dataclass from enum import Enum from functools import partial from itertools import product @@ -894,9 +893,6 @@ class OpInfo: is_factory_function: bool = False - - - def __post_init__(self): self._original_opinfo_args = asdict(self).copy() @@ -2503,7 +2499,6 @@ def __init__( reference_inputs_func=reference_inputs_func, **kwargs, ) - self.domain = domain self.handles_complex_extremal_values = handles_complex_extremal_values self.handles_large_floats = handles_large_floats From 70283c52f17ffbfa9f72ca5a81b34d67fdae9786 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sat, 1 Jun 2024 20:56:46 -0700 Subject: [PATCH 12/37] skip unsupported xpu test by two means: define the unsupported dtypes in xpu_op_db.yaml or define @skipOps in test case --- test/test_ops.py | 8 ++++++++ test/xpu/xpu_op_db.yaml | 12 ++++++++---- .../_internal/common_methods_invocations.py | 14 +++++++------- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 96a624f9f13b0..0d4939707c30c 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -563,6 +563,12 @@ def test_python_ref_torch_fallback(self, device, dtype, op): ], ) @skipIfTorchInductor("Takes too long for inductor") + @skipOps( + "TestCommon", "test_python_ref_executor", (('_refs.mul', '', 'xpu', (torch.complex32,), 
False),), all_opinfos=python_ref_db + ) + @skipOps( + "TestCommon", "test_python_ref_executor", (('_refs.pow', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db + ) def test_python_ref_executor(self, device, dtype, op, executor): if ( TEST_WITH_ROCM @@ -647,6 +653,8 @@ def _to_tensormeta(x): error_inputs = op.error_inputs(device) for ei in error_inputs: + import pdb + pdb.set_trace() si = ei.sample_input meta_sample = si.transform(_to_tensormeta) with self.assertRaisesRegex(ei.error_type, ei.error_regex): diff --git a/test/xpu/xpu_op_db.yaml b/test/xpu/xpu_op_db.yaml index a9fddfeab1a89..9639b0a36126b 100644 --- a/test/xpu/xpu_op_db.yaml +++ b/test/xpu/xpu_op_db.yaml @@ -12,7 +12,13 @@ supported: - add - sub - mul - - div + - div: + unsupported: + - float16 + - bfloat16 + unsupported_backward: + - float16 + - bfloat16 - abs - bernoulli - bitwise_and @@ -114,6 +120,4 @@ supported: - gather - max_pool2d_with_indices_backward - nn.functional.embedding - - nn.functional.unfold - - + - nn.functional.unfold \ No newline at end of file diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 71471148e017c..fcb4111c37f0b 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -24089,8 +24089,9 @@ def skip(op_name, variant_name='', *, device_type=None, dtypes=None): return (op_name, variant_name, device_type, dtypes, False) -def skipOps(test_case_name, base_test_name, to_skip): - all_opinfos = op_db +def skipOps(test_case_name, base_test_name, to_skip, all_opinfos=op_db): + import pdb + pdb.set_trace() for xfail in to_skip: op_name, variant_name, device_type, dtypes, expected_failure = xfail matching_opinfos = [o for o in all_opinfos @@ -24142,19 +24143,18 @@ def enable_backend_test(op_db_list: List[OpInfo]): for _key in backend_op_dict['supported'][ind][name]: # Get the dtypes with difference _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None for _dtype in backend_op_dict['supported'][ind][name][_key]] - match _key: - case "unsupported": + if _key == "unsupported": op.dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) \ if type(op.dtypesIfXPU) is set else _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) - case "unsupported_backward": + if _key == "unsupported_backward": op.backward_dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) \ if type(op.backward_dtypesIfXPU) is set else _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) - case "supported": + if _key == "supported": if type(op.dtypesIfXPU) is set: op.dtypesIfXPU.update(_dtypes) else: op.dtypesIfXPU = _dispatch_dtypes((*op.dtypesIfXPU, *_dtypes)) - case "supported_backward": + if _key == "supported_backward": if type(op.backward_dtypesIfXPU) is set: op.backward_dtypesIfXPU.update(_dtypes) else: From 9c3b81d970a8f0ca1d156c1dfe0d7725c478477c Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 2 Jun 2024 19:24:47 -0700 Subject: [PATCH 13/37] update skipOps decorators for XPU --- test/test_ops.py | 28 ++++++++++++++----- .../_internal/common_methods_invocations.py | 3 -- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 0d4939707c30c..9e3bc874315a5 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -564,10 +564,12 @@ def test_python_ref_torch_fallback(self, device, dtype, op): ) 
@skipIfTorchInductor("Takes too long for inductor") @skipOps( - "TestCommon", "test_python_ref_executor", (('_refs.mul', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db + "TestCommon", "test_python_ref_executor", \ + (('_refs.mul', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db ) @skipOps( - "TestCommon", "test_python_ref_executor", (('_refs.pow', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db + "TestCommon", "test_python_ref_executor", \ + (('_refs.pow', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db ) def test_python_ref_executor(self, device, dtype, op, executor): if ( @@ -641,6 +643,10 @@ def test_errors_sparse(self, device, op, layout): dtypes=OpDTypes.none, ) @skipIfTorchInductor("Takes too long for inductor") + @skipOps( + "TestCommon", "test_python_ref_errors", \ + (('_refs.where', '', 'xpu', None, False),), all_opinfos=python_ref_db + ) def test_python_ref_errors(self, device, op): mode = FakeTensorMode() with mode: @@ -653,8 +659,6 @@ def _to_tensormeta(x): error_inputs = op.error_inputs(device) for ei in error_inputs: - import pdb - pdb.set_trace() si = ei.sample_input meta_sample = si.transform(_to_tensormeta) with self.assertRaisesRegex(ei.error_type, ei.error_regex): @@ -1025,9 +1029,7 @@ def _case_two_transform(t): wrong_device = "cpu" elif torch.cuda.is_available(): wrong_device = "cuda" - elif torch.xpu.is_available(): - wrong_device = "xpu" - + factory_fn_msg = ( "\n\nNOTE: If your op is a factory function (i.e., it accepts TensorOptions) you should mark its " "OpInfo with `is_factory_function=True`." @@ -1413,6 +1415,18 @@ def convert_boolean_tensors(x): @skipMeta @onlyNativeDeviceTypes @ops(ops_and_refs, dtypes=OpDTypes.none) + @skipOps( + "TestCommon", "test_dtypes", \ + (('div', 'floor_rounding', 'xpu', None, False),), all_opinfos=ops_and_refs + ) + @skipOps( + "TestCommon", "test_dtypes", \ + (('div', 'no_rounding_mode', 'xpu', None, False),), all_opinfos=ops_and_refs + ) + @skipOps( + "TestCommon", "test_dtypes", \ + (('div', 'trunc_rounding', 'xpu', None, False),), all_opinfos=ops_and_refs + ) def test_dtypes(self, device, op): # Check complex32 support only if the op claims. # TODO: Once the complex32 support is better, we should add check for complex32 unconditionally. diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index fcb4111c37f0b..3731e64b76b76 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -24090,8 +24090,6 @@ def skip(op_name, variant_name='', *, device_type=None, dtypes=None): def skipOps(test_case_name, base_test_name, to_skip, all_opinfos=op_db): - import pdb - pdb.set_trace() for xfail in to_skip: op_name, variant_name, device_type, dtypes, expected_failure = xfail matching_opinfos = [o for o in all_opinfos @@ -24126,7 +24124,6 @@ def enable_backend_test(op_db_list: List[OpInfo]): # For refs ops get the name of the related torch_opinfo. torch_opinfo = getattr(op, "torch_opinfo") if hasattr(op, "torch_opinfo") else None name = torch_opinfo.name if torch_opinfo is not None else op.name - if name not in supported_op_list: # If the op is not supported add unittest.skip decorators. 
if op.skips is not None: From fd3d73b86f0a0cc17c15536ac89b986b8e336a2e Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 3 Jun 2024 00:18:57 -0700 Subject: [PATCH 14/37] update according to comments --- torch/testing/_internal/common_device_type.py | 6 ++++++ torch/testing/_internal/common_methods_invocations.py | 4 ++-- torch/testing/_internal/common_utils.py | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index c4c13f9a6cb46..81c96e8de04a3 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -1612,3 +1612,9 @@ def skipPRIVATEUSE1(fn): # This should probably enumerate all available device type test base classes. def get_all_device_types() -> List[str]: return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + +def any_common_cpu_device_one(): + return OpDTypes.any_common_cpu_xpu_one if TEST_XPU else OpDTypes.any_common_cpu_cuda_one + +def has_gpu_device(devices: List[str]): + return "cuda" in devices or "xpu" in devices diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 3731e64b76b76..02c47ab0e189c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -24114,8 +24114,8 @@ def wrapped(fn): return fn return wrapped -def enable_backend_test(op_db_list: List[OpInfo]): - if TEST_XPU: +def apply_op_db_for(op_db_list: List[OpInfo], device='xpu'): + if TEST_XPU and device == 'xpu': # Get the supported op and dtypes from yaml file. backend_op_dict = get_backend_op_dict() supported_op_list = [list(op_dict.keys())[0] if type(op_dict) is dict else op_dict for op_dict in backend_op_dict['supported']] diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 6a4f33b4baa47..3e955f9a420ce 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -4998,9 +4998,9 @@ def repl_frame(m): s = re.sub(r" +$", "", s, flags=re.M) return s -def get_backend_op_dict(): +def get_backend_op_dict(device='xpu'): backend_op_dict = {} - if TEST_XPU: + if TEST_XPU and device == 'xpu': device = 'xpu' xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" if os.path.exists(xpu_op_db): From f53f4d3b57c6a2cbcbcb1186e732a39dd42de319 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 3 Jun 2024 19:12:28 -0700 Subject: [PATCH 15/37] pass lintrunner --- test/test_ops.py | 69 +++++++++++-------- torch/testing/_internal/common_device_type.py | 13 ++-- .../_internal/common_methods_invocations.py | 43 +++++++----- torch/testing/_internal/common_utils.py | 7 +- 4 files changed, 78 insertions(+), 54 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 9e3bc874315a5..e96b3386ce4b1 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -28,7 +28,9 @@ from torch.testing._internal import composite_compliance, opinfo from torch.testing._internal.common_device_type import ( + any_common_cpu_device_one, deviceCountAtLeast, + has_gpu_device, instantiate_device_type_tests, onlyCPU, onlyCUDA, @@ -45,6 +47,7 @@ integral_types_and, ) from torch.testing._internal.common_methods_invocations import ( + apply_op_db_for, BinaryUfuncInfo, op_db, ops_and_refs, @@ -56,7 +59,6 @@ SpectralFuncInfo, UnaryUfuncInfo, 
xfail, - enable_backend_test, ) from torch.testing._internal.common_utils import ( @@ -88,9 +90,9 @@ assert torch.get_default_dtype() == torch.float32 - -enable_backend_test(op_db) -enable_backend_test(python_ref_db) +if TEST_XPU: + apply_op_db_for(op_db, device="xpu") + apply_op_db_for(python_ref_db, device="xpu") # variant testing is only done with torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio @@ -135,11 +137,6 @@ def reduction_dtype_filter(op): aten = torch.ops.aten -def any_common_cpu_device_one(): - return OpDTypes.any_common_cpu_xpu_one if TEST_XPU else OpDTypes.any_common_cpu_cuda_one - -def has_gpu_device(devices: List[str]): - return "cuda" in devices or "xpu" in devices # Tests that apply to all operators and aren't related to any particular # system @@ -306,7 +303,6 @@ def to_cpu(arg): return arg.to(device="cpu") return arg - samples = op.reference_inputs(device, dtype) for sample in samples: @@ -564,12 +560,16 @@ def test_python_ref_torch_fallback(self, device, dtype, op): ) @skipIfTorchInductor("Takes too long for inductor") @skipOps( - "TestCommon", "test_python_ref_executor", \ - (('_refs.mul', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db + "TestCommon", + "test_python_ref_executor", + (("_refs.mul", "", "xpu", (torch.complex32,), False),), + all_opinfos=python_ref_db, ) @skipOps( - "TestCommon", "test_python_ref_executor", \ - (('_refs.pow', '', 'xpu', (torch.complex32,), False),), all_opinfos=python_ref_db + "TestCommon", + "test_python_ref_executor", + (("_refs.pow", "", "xpu", (torch.complex32,), False),), + all_opinfos=python_ref_db, ) def test_python_ref_executor(self, device, dtype, op, executor): if ( @@ -644,8 +644,10 @@ def test_errors_sparse(self, device, op, layout): ) @skipIfTorchInductor("Takes too long for inductor") @skipOps( - "TestCommon", "test_python_ref_errors", \ - (('_refs.where', '', 'xpu', None, False),), all_opinfos=python_ref_db + "TestCommon", + "test_python_ref_errors", + (("_refs.where", "", "xpu", None, False),), + all_opinfos=python_ref_db, ) def test_python_ref_errors(self, device, op): mode = FakeTensorMode() @@ -824,7 +826,11 @@ def _extract_strides(out): # NOTE: only extracts on the CPU and CUDA device types since some # device types don't have storage def _extract_data_ptrs(out): - if self.device_type != "cpu" and self.device_type != "cuda" and self.device_type != "xpu": + if ( + self.device_type != "cpu" + and self.device_type != "cuda" + and self.device_type != "xpu" + ): return () if isinstance(out, torch.Tensor): @@ -952,7 +958,11 @@ def _extract_strides(out): # NOTE: only extracts on the CPU and CUDA device types since some # device types don't have storage def _extract_data_ptrs(out): - if self.device_type != "cpu" and self.device_type != "cuda" and self.device_type != "xpu": + if ( + self.device_type != "cpu" + and self.device_type != "cuda" + and self.device_type != "xpu" + ): return () if isinstance(out, torch.Tensor): @@ -1029,7 +1039,7 @@ def _case_two_transform(t): wrong_device = "cpu" elif torch.cuda.is_available(): wrong_device = "cuda" - + factory_fn_msg = ( "\n\nNOTE: If your op is a factory function (i.e., it accepts TensorOptions) you should mark its " "OpInfo with `is_factory_function=True`." 
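# A sketch of the entry layout consumed by skipOps in the hunks below: each to_skip
# element is (op_name, variant_name, device_type, dtypes, expected_failure), and the
# skip() helper shown earlier in this series builds exactly that tuple with
# expected_failure=False. The op names are taken from the decorators in this patch.
import torch
from torch.testing._internal.common_methods_invocations import skip

assert skip("div", "floor_rounding", device_type="xpu") == \
    ("div", "floor_rounding", "xpu", None, False)
assert skip("_refs.mul", device_type="xpu", dtypes=(torch.complex32,)) == \
    ("_refs.mul", "", "xpu", (torch.complex32,), False)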
@@ -1416,16 +1426,22 @@ def convert_boolean_tensors(x): @onlyNativeDeviceTypes @ops(ops_and_refs, dtypes=OpDTypes.none) @skipOps( - "TestCommon", "test_dtypes", \ - (('div', 'floor_rounding', 'xpu', None, False),), all_opinfos=ops_and_refs + "TestCommon", + "test_dtypes", + (("div", "floor_rounding", "xpu", None, False),), + all_opinfos=ops_and_refs, ) @skipOps( - "TestCommon", "test_dtypes", \ - (('div', 'no_rounding_mode', 'xpu', None, False),), all_opinfos=ops_and_refs + "TestCommon", + "test_dtypes", + (("div", "no_rounding_mode", "xpu", None, False),), + all_opinfos=ops_and_refs, ) @skipOps( - "TestCommon", "test_dtypes", \ - (('div', 'trunc_rounding', 'xpu', None, False),), all_opinfos=ops_and_refs + "TestCommon", + "test_dtypes", + (("div", "trunc_rounding", "xpu", None, False),), + all_opinfos=ops_and_refs, ) def test_dtypes(self, device, op): # Check complex32 support only if the op claims. @@ -1697,7 +1713,7 @@ def test_forward_ad(self, device, dtype, op): composite_compliance.check_forward_ad_formula( op.get_op(), args, kwargs, op.gradcheck_wrapper, self.assertEqual ) - + @skipXPU @ops(op_db, allowed_dtypes=(torch.float,)) def test_cow_input(self, device, dtype, op): @@ -2702,7 +2718,6 @@ def test_strided_layout(self, device, dtype, op): self.assertEqual(strided_result.layout, torch.strided) - instantiate_device_type_tests(TestCommon, globals(), allow_xpu=True) instantiate_device_type_tests(TestCompositeCompliance, globals(), allow_xpu=True) instantiate_device_type_tests(TestMathBits, globals(), allow_xpu=True) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 81c96e8de04a3..a18842c98a4eb 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -700,7 +700,7 @@ def get_desired_device_type_test_bases(except_for=None, only_for=None, include_l if allow_mps and TEST_MPS and MPSTestBase not in test_bases: test_bases.append(MPSTestBase) if (allow_xpu or only_for == 'xpu') and TEST_XPU and XPUTestBase not in test_bases: - test_bases.append(XPUTestBase) + test_bases.append(XPUTestBase) # Filter out the device types based on user inputs desired_device_type_test_bases = filter_desired_device_types(test_bases, except_for, only_for) if include_lazy: @@ -744,7 +744,8 @@ def split_if_not_empty(x: str): # device-specific tests (NB: this supports additional @parametrize usage). # # See note "Writing Test Templates" -def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False, allow_mps=False, allow_xpu=False): +def instantiate_device_type_tests(generic_test_class, scope, except_for=None, only_for=None, include_lazy=False, + allow_mps=False, allow_xpu=False): # Removes the generic test class from its enclosing scope so its tests # are not discoverable. 
del scope[generic_test_class.__name__] @@ -833,7 +834,7 @@ class OpDTypes(Enum): any_one = 4 # Test precisely one supported dtype none = 5 # Instantiate no dtype variants (no dtype kwarg needed) any_common_cpu_cuda_one = 6 # Test precisely one supported dtype that is common to both cuda and cpu - any_common_cpu_xpu_one = 7 # Test precisely one supported dtype that is common to both xpu and cpu + any_common_cpu_xpu_one = 7 # Test precisely one supported dtype that is common to both xpu and cpu @@ -1052,8 +1053,8 @@ def __init__(self, dep, reason): class skipXPUIf(skipIf): - def __init__(self, dep, reason): - super().__init__(dep, reason, device_type='xpu') + def __init__(self, dep, reason): + super().__init__(dep, reason, device_type='xpu') # Skips a test on Lazy if the condition is true. class skipLazyIf(skipIf): @@ -1098,7 +1099,7 @@ def _has_sufficient_memory(device, size): if device == 'xpu': raise unittest.SkipTest('TODO: Memory availability checks for XPU?') - + if device == 'xla': raise unittest.SkipTest('TODO: Memory availability checks for XLA?') diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 02c47ab0e189c..8510b238a418e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -24118,11 +24118,12 @@ def apply_op_db_for(op_db_list: List[OpInfo], device='xpu'): if TEST_XPU and device == 'xpu': # Get the supported op and dtypes from yaml file. backend_op_dict = get_backend_op_dict() - supported_op_list = [list(op_dict.keys())[0] if type(op_dict) is dict else op_dict for op_dict in backend_op_dict['supported']] + supported_op_list = [next(iter(op_dict.keys())) if type(op_dict) is dict else op_dict + for op_dict in backend_op_dict['supported']] for op in op_db_list: # For refs ops get the name of the related torch_opinfo. - torch_opinfo = getattr(op, "torch_opinfo") if hasattr(op, "torch_opinfo") else None + torch_opinfo = op.torch_opinfo if hasattr(op, "torch_opinfo") else None name = torch_opinfo.name if torch_opinfo is not None else op.name if name not in supported_op_list: # If the op is not supported add unittest.skip decorators. 
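# A standalone sketch of the per-key dtype handling in the next hunk, reduced to the
# set() branch; the _dispatch_dtypes branch filters/extends a tuple the same way. The
# sample dtypes and the override dict are made up.
import torch

def apply_dtype_overrides(dtypes, overrides):
    # "unsupported" drops dtypes the XPU backend cannot run; "supported" adds extras
    # (e.g. complex32) that the CUDA-derived defaults do not include.
    dtypes = set(dtypes)
    dtypes -= set(overrides.get("unsupported", ()))
    dtypes |= set(overrides.get("supported", ()))
    return dtypes

assert apply_dtype_overrides(
    {torch.float32, torch.float16, torch.bfloat16},
    {"unsupported": [torch.float16, torch.bfloat16]},
) == {torch.float32}
assert torch.complex32 in apply_dtype_overrides(
    {torch.float32}, {"supported": [torch.complex32]}
)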
@@ -24135,24 +24136,30 @@ def apply_op_db_for(op_db_list: List[OpInfo], device='xpu'): else: ind = supported_op_list.index(name) - if type(backend_op_dict['supported'][ind]) is dict and backend_op_dict['supported'][ind][name] != None: + if type(backend_op_dict['supported'][ind]) is dict and backend_op_dict['supported'][ind][name] is not None: # If the op is supported check whether the supported dtypes is different with cuda for _key in backend_op_dict['supported'][ind][name]: # Get the dtypes with difference - _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None for _dtype in backend_op_dict['supported'][ind][name][_key]] + _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None + for _dtype in backend_op_dict['supported'][ind][name][_key]] if _key == "unsupported": - op.dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) \ - if type(op.dtypesIfXPU) is set else _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) - if _key == "unsupported_backward": - op.backward_dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) \ - if type(op.backward_dtypesIfXPU) is set else _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) - if _key == "supported": - if type(op.dtypesIfXPU) is set: - op.dtypesIfXPU.update(_dtypes) - else: - op.dtypesIfXPU = _dispatch_dtypes((*op.dtypesIfXPU, *_dtypes)) + if type(op.dtypesIfXPU) is set: + op.dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) + else: + _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) + if _key == "unsupported_backward": + if type(op.backward_dtypesIfXPU) is set: + op.backward_dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) + else: + op.backward_dtypesIfXPU = _dispatch_dtypes(filter(lambda x: (x not in _dtypes), + op.backward_dtypesIfXPU)) + if _key == "supported": + if type(op.dtypesIfXPU) is set: + op.dtypesIfXPU.update(_dtypes) + else: + op.dtypesIfXPU = _dispatch_dtypes((*op.dtypesIfXPU, *_dtypes)) if _key == "supported_backward": - if type(op.backward_dtypesIfXPU) is set: - op.backward_dtypesIfXPU.update(_dtypes) - else: - op.backward_dtypesIfXPU = _dispatch_dtypes((*op.backward_dtypesIfXPU, *_dtypes)) + if type(op.backward_dtypesIfXPU) is set: + op.backward_dtypesIfXPU.update(_dtypes) + else: + op.backward_dtypesIfXPU = _dispatch_dtypes((*op.backward_dtypesIfXPU, *_dtypes)) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 3e955f9a420ce..abc4c3ff946f2 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5001,12 +5001,13 @@ def repl_frame(m): def get_backend_op_dict(device='xpu'): backend_op_dict = {} if TEST_XPU and device == 'xpu': - device = 'xpu' - xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") else os.getcwd() + "../xpu/xpu_op_db.yaml" + device = 'xpu' + xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") \ + else os.getcwd() + "../xpu/xpu_op_db.yaml" if os.path.exists(xpu_op_db): with open(xpu_op_db) as stream: try: backend_op_dict = yaml.safe_load(stream) except yaml.YAMLError: print("Error in loading xpu_op_db.yaml.") - return backend_op_dict \ No newline at end of file + return backend_op_dict From 5a2382fc2249eeb1186f7f842a3ce34fe3fba9cf Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 4 Jun 2024 02:12:07 -0700 Subject: [PATCH 
16/37] skip mul and where --- test/test_ops.py | 36 ----- test/xpu/op_db.yaml | 74 +++++++++++ test/xpu/xpu_op_db.yaml | 123 ------------------ .../_internal/common_methods_invocations.py | 60 ++------- torch/testing/_internal/common_utils.py | 15 +-- 5 files changed, 93 insertions(+), 215 deletions(-) create mode 100644 test/xpu/op_db.yaml delete mode 100644 test/xpu/xpu_op_db.yaml diff --git a/test/test_ops.py b/test/test_ops.py index e96b3386ce4b1..01e683ab907ab 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -559,18 +559,6 @@ def test_python_ref_torch_fallback(self, device, dtype, op): ], ) @skipIfTorchInductor("Takes too long for inductor") - @skipOps( - "TestCommon", - "test_python_ref_executor", - (("_refs.mul", "", "xpu", (torch.complex32,), False),), - all_opinfos=python_ref_db, - ) - @skipOps( - "TestCommon", - "test_python_ref_executor", - (("_refs.pow", "", "xpu", (torch.complex32,), False),), - all_opinfos=python_ref_db, - ) def test_python_ref_executor(self, device, dtype, op, executor): if ( TEST_WITH_ROCM @@ -643,12 +631,6 @@ def test_errors_sparse(self, device, op, layout): dtypes=OpDTypes.none, ) @skipIfTorchInductor("Takes too long for inductor") - @skipOps( - "TestCommon", - "test_python_ref_errors", - (("_refs.where", "", "xpu", None, False),), - all_opinfos=python_ref_db, - ) def test_python_ref_errors(self, device, op): mode = FakeTensorMode() with mode: @@ -1425,24 +1407,6 @@ def convert_boolean_tensors(x): @skipMeta @onlyNativeDeviceTypes @ops(ops_and_refs, dtypes=OpDTypes.none) - @skipOps( - "TestCommon", - "test_dtypes", - (("div", "floor_rounding", "xpu", None, False),), - all_opinfos=ops_and_refs, - ) - @skipOps( - "TestCommon", - "test_dtypes", - (("div", "no_rounding_mode", "xpu", None, False),), - all_opinfos=ops_and_refs, - ) - @skipOps( - "TestCommon", - "test_dtypes", - (("div", "trunc_rounding", "xpu", None, False),), - all_opinfos=ops_and_refs, - ) def test_dtypes(self, device, op): # Check complex32 support only if the op claims. # TODO: Once the complex32 support is better, we should add check for complex32 unconditionally. diff --git a/test/xpu/op_db.yaml b/test/xpu/op_db.yaml new file mode 100644 index 0000000000000..f89fb392a3430 --- /dev/null +++ b/test/xpu/op_db.yaml @@ -0,0 +1,74 @@ +# Owner(s): ["module: intel"] +# Define the supported Aten ops in XPU backend, the dtypes are aligned with other GPUs. 
+supported_ops: + - fill + - zeros + - zeros_like + - clone + - view_as_real + - view_as_complex + - view + - resize_ + - resize_as_ + - add + - sub + - abs + - bernoulli + - bitwise_and + - bitwise_not + - bitwise_or + - bitwise_xor + - clamp + - clamp_max + - clamp_min + - clone + - copy + - cumsum + - empty + - eq + - fill + - fmod + - gcd + - ge + - gelu + - gt + - index_add + - index_put + - index_select + - isnan + - le + - lt + - masked_fill + - maximum + - minimum + - native_dropout_backward + - ne + - neg + - nn.functional.adaptive_avg_pool2d + - nn.functional.threshold + - nonzero + - normal + - reciprocal + - rsub + - relu + - remainder + - reshape + - unfold + - uniform + - view + - zero + - add + - any + - arange + - as_strided + - flip + - tril + - triu + - cat + - log_softmax + - softmax + - scatter + - gather + - max_pool2d_with_indices_backward + - nn.functional.embedding + - nn.functional.unfold \ No newline at end of file diff --git a/test/xpu/xpu_op_db.yaml b/test/xpu/xpu_op_db.yaml deleted file mode 100644 index 9639b0a36126b..0000000000000 --- a/test/xpu/xpu_op_db.yaml +++ /dev/null @@ -1,123 +0,0 @@ -backend: XPU -supported: - - fill - - zeros - - zeros_like - - clone - - view_as_real - - view_as_complex - - view - - resize_ - - resize_as_ - - add - - sub - - mul - - div: - unsupported: - - float16 - - bfloat16 - unsupported_backward: - - float16 - - bfloat16 - - abs - - bernoulli - - bitwise_and - - bitwise_not - - bitwise_or - - bitwise_xor - - clamp - - clamp_max - - clamp_min - - clone - - copy - - cos: - supported: - - complex32 - supported_backward: - - complex32 - - cumsum - - empty - - eq - - fill - - fmod - - gcd - - ge - - gelu - - gt - - index_add - - index_put - - index_select - - isnan - - le - - log: - supported: - - complex32 - supported_backward: - - complex32 - - lt - - masked_fill - - maximum - - minimum - - mul - - native_dropout_backward - - ne - - neg - - nn.functional.adaptive_avg_pool2d - - nn.functional.threshold - - nonzero - - normal - - pow: - supported: - - complex32 - supported_backward: - - complex32 - - reciprocal - - rsub - - relu - - remainder - - reshape - - rsqrt: - supported: - - complex32 - supported_backward: - - complex32 - - sin: - supported: - - complex32 - supported_backward: - - complex32 - - sqrt: - supported: - - complex32 - supported_backward: - - complex32 - - sum: - supported: - - complex32 - supported_backward: - - complex32 - - tanh: - supported: - - complex32 - supported_backward: - - complex32 - - unfold - - uniform - - view - - where - - zero - - add - - any - - arange - - as_strided - - flip - - tril - - triu - - cat - - log_softmax - - softmax - - scatter - - gather - - max_pool2d_with_indices_backward - - nn.functional.embedding - - nn.functional.unfold \ No newline at end of file diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 8510b238a418e..19785e68282c8 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -24114,52 +24114,18 @@ def wrapped(fn): return fn return wrapped +def apply_op_db_for_xpu(op_db_list: List[OpInfo]): + # Get the supported op from yaml file. + supported_op_list = get_backend_op_dict(device='xpu')['supported_ops'] + + for op in op_db_list: + # For refs ops get the name of the related torch_opinfo. 
+ torch_opinfo = op.torch_opinfo if hasattr(op, "torch_opinfo") else None + name = torch_opinfo.name if torch_opinfo is not None else op.name + if name not in supported_op_list: + # Update op_db, add unittest.skip decorators to skip the op for the backend. + op.decorators = (*op.decorators, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) + def apply_op_db_for(op_db_list: List[OpInfo], device='xpu'): if TEST_XPU and device == 'xpu': - # Get the supported op and dtypes from yaml file. - backend_op_dict = get_backend_op_dict() - supported_op_list = [next(iter(op_dict.keys())) if type(op_dict) is dict else op_dict - for op_dict in backend_op_dict['supported']] - - for op in op_db_list: - # For refs ops get the name of the related torch_opinfo. - torch_opinfo = op.torch_opinfo if hasattr(op, "torch_opinfo") else None - name = torch_opinfo.name if torch_opinfo is not None else op.name - if name not in supported_op_list: - # If the op is not supported add unittest.skip decorators. - if op.skips is not None: - op.skips = (*op.skips, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) - op.decorators = (*op.decorators, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) - else: - op.skips = (DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) - op.decorators = (DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) - else: - ind = supported_op_list.index(name) - - if type(backend_op_dict['supported'][ind]) is dict and backend_op_dict['supported'][ind][name] is not None: - # If the op is supported check whether the supported dtypes is different with cuda - for _key in backend_op_dict['supported'][ind][name]: - # Get the dtypes with difference - _dtypes = [getattr(torch, _dtype) if hasattr(torch, _dtype) else None - for _dtype in backend_op_dict['supported'][ind][name][_key]] - if _key == "unsupported": - if type(op.dtypesIfXPU) is set: - op.dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) - else: - _dispatch_dtypes(filter(lambda x: (x not in _dtypes), op.dtypesIfXPU)) - if _key == "unsupported_backward": - if type(op.backward_dtypesIfXPU) is set: - op.backward_dtypesIfXPU = set(filter(lambda x: (x not in _dtypes), op.backward_dtypesIfXPU)) - else: - op.backward_dtypesIfXPU = _dispatch_dtypes(filter(lambda x: (x not in _dtypes), - op.backward_dtypesIfXPU)) - if _key == "supported": - if type(op.dtypesIfXPU) is set: - op.dtypesIfXPU.update(_dtypes) - else: - op.dtypesIfXPU = _dispatch_dtypes((*op.dtypesIfXPU, *_dtypes)) - if _key == "supported_backward": - if type(op.backward_dtypesIfXPU) is set: - op.backward_dtypesIfXPU.update(_dtypes) - else: - op.backward_dtypesIfXPU = _dispatch_dtypes((*op.backward_dtypesIfXPU, *_dtypes)) + apply_op_db_for_xpu(op_db_list) \ No newline at end of file diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index abc4c3ff946f2..1af948371c846 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5001,13 +5001,10 @@ def repl_frame(m): def get_backend_op_dict(device='xpu'): backend_op_dict = {} if TEST_XPU and device == 'xpu': - device = 'xpu' - xpu_op_db = os.getcwd() + "/xpu/xpu_op_db.yaml" if os.path.exists(os.getcwd() + "/xpu/xpu_op_db.yaml") \ - else os.getcwd() + "../xpu/xpu_op_db.yaml" - if os.path.exists(xpu_op_db): + xpu_op_db = CI_TEST_PREFIX + "/" + device + "/op_db.yaml" + try: with open(xpu_op_db) as stream: - try: - backend_op_dict = yaml.safe_load(stream) - except yaml.YAMLError: - print("Error in 
loading xpu_op_db.yaml.") - return backend_op_dict + backend_op_dict = yaml.safe_load(stream) + except yaml.YAMLError or FileExistsError: + print("Error in loading op_db.yaml.") + return backend_op_dict \ No newline at end of file From b1c0fff3245367afa1d415d57f16ea3b0536bb1f Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Wed, 5 Jun 2024 19:21:44 -0700 Subject: [PATCH 17/37] rollback a change in ops() --- torch/testing/_internal/common_device_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index a18842c98a4eb..cd2f73c6bfacc 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -957,7 +957,7 @@ def _parametrize_test(self, test, generic_cls, device_cls): else: raise RuntimeError(f"Unknown OpDType: {self.opinfo_dtypes}") - if self.allowed_dtypes is not None and dtypes is not None: + if self.allowed_dtypes is not None: dtypes = dtypes.intersection(self.allowed_dtypes) # Construct the test name; device / dtype parts are handled outside. From 775db6ebf9a4ba48d3ce443b0fa5b8e8aea9c9a5 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Wed, 5 Jun 2024 19:57:30 -0700 Subject: [PATCH 18/37] remove unused comments --- torch/testing/_internal/opinfo/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index e745e1be1c67b..df97f6e8ebc28 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -967,7 +967,6 @@ def __post_init__(self): if self.backward_dtypes is not None else self.dtypesIfXPU if self.dtypesIfXPU is not None - #else self.dtypes else self.backward_dtypesIfCUDA ) ) From 10fd73155a6a2d898b32d357018693e4f164cb62 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Thu, 6 Jun 2024 23:17:04 -0700 Subject: [PATCH 19/37] fix lint issue --- torch/testing/_internal/opinfo/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index df97f6e8ebc28..a5cc8689a86e1 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -31,7 +31,6 @@ TEST_WITH_ROCM, torch_to_numpy_dtype_dict, TrackedInputIter, - TEST_XPU, ) from torch.testing._internal.opinfo import utils From 557aa73250ff02c67c215998137202a424379480 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Wed, 12 Jun 2024 22:20:14 -0700 Subject: [PATCH 20/37] disable bernoulli --- test/xpu/op_db.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/test/xpu/op_db.yaml b/test/xpu/op_db.yaml index f89fb392a3430..8eaa39bbb010d 100644 --- a/test/xpu/op_db.yaml +++ b/test/xpu/op_db.yaml @@ -13,7 +13,6 @@ supported_ops: - add - sub - abs - - bernoulli - bitwise_and - bitwise_not - bitwise_or From c6965a119e946d660b535a9171550c357d17d9cc Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Thu, 13 Jun 2024 23:55:04 -0700 Subject: [PATCH 21/37] disable nn.funcitonal.embedding --- test/xpu/op_db.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/xpu/op_db.yaml b/test/xpu/op_db.yaml index 8eaa39bbb010d..144513cdc7f5a 100644 --- a/test/xpu/op_db.yaml +++ b/test/xpu/op_db.yaml @@ -69,5 +69,4 @@ supported_ops: - scatter - gather - max_pool2d_with_indices_backward - - nn.functional.embedding - - nn.functional.unfold \ No newline at end of file + - nn.functional.unfold From 5607564d8fcbca3f084dc67fa720f0b0c62ee0b3 Mon Sep 
17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 23 Jun 2024 23:30:01 -0700 Subject: [PATCH 22/37] refine format --- torch/testing/_internal/common_device_type.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 32198e7cb6a1f..5a48385f0149f 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -958,8 +958,12 @@ class OpDTypes(Enum): unsupported_backward = 3 # Test only unsupported backward dtypes any_one = 4 # Test precisely one supported dtype none = 5 # Instantiate no dtype variants (no dtype kwarg needed) - any_common_cpu_cuda_one = 6 # Test precisely one supported dtype that is common to both cuda and cpu - any_common_cpu_xpu_one = 7 # Test precisely one supported dtype that is common to both xpu and cpu + any_common_cpu_cuda_one = ( + 6 # Test precisely one supported dtype that is common to both cuda and cpu + ) + any_common_cpu_xpu_one = ( + 7 # Test precisely one supported dtype that is common to both xpu and cpu + ) # Arbitrary order From f80a48635e341998d7b43e51f4f61098b9c210d1 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Sun, 23 Jun 2024 23:55:00 -0700 Subject: [PATCH 23/37] lint format --- torch/testing/_internal/common_device_type.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 5a48385f0149f..4bc3a4bdfd428 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -792,7 +792,7 @@ def get_desired_device_type_test_bases( test_bases = device_type_test_bases.copy() if allow_mps and TEST_MPS and MPSTestBase not in test_bases: test_bases.append(MPSTestBase) - if (allow_xpu or only_for == 'xpu') and TEST_XPU and XPUTestBase not in test_bases: + if (allow_xpu or only_for == "xpu") and TEST_XPU and XPUTestBase not in test_bases: test_bases.append(XPUTestBase) if TEST_HPU and HPUTestBase not in test_bases: test_bases.append(HPUTestBase) @@ -1098,7 +1098,9 @@ def _parametrize_test(self, test, generic_cls, device_cls): # Tries to pick a dtype that supports both CPU and CUDA supported = set(op.dtypes).intersection(op.dtypesIfXPU) if supported: - dtypes = {next(dtype for dtype in ANY_DTYPE_ORDER if dtype in supported)} + dtypes = { + next(dtype for dtype in ANY_DTYPE_ORDER if dtype in supported) + } else: dtypes = {} elif self.opinfo_dtypes == OpDTypes.none: @@ -1213,9 +1215,9 @@ def __init__(self, dep, reason): class skipXPUIf(skipIf): - def __init__(self, dep, reason): - super().__init__(dep, reason, device_type='xpu') + super().__init__(dep, reason, device_type="xpu") + # Skips a test on Lazy if the condition is true. class skipLazyIf(skipIf): @@ -1575,8 +1577,10 @@ def only_fn(self, *args, **kwargs): return only_fn + def onlyCUDAAndXPU(fn): - return onlyOn(['cuda', 'xpu'])(fn) + return onlyOn(["cuda", "xpu"])(fn) + def disablecuDNN(fn): @wraps(fn) @@ -1848,9 +1852,11 @@ def skipLazy(fn): def skipMeta(fn): return skipMetaIf(True, "test doesn't work with meta tensors")(fn) + def skipXPU(fn): return skipXPUIf(True, "test doesn't work with XPU tensors")(fn) + def skipXLA(fn): return skipXLAIf(True, "Marked as skipped for XLA")(fn) @@ -1870,10 +1876,16 @@ def skipPRIVATEUSE1(fn): # TODO: the "all" in the name isn't true anymore for quite some time as we have also have for example XLA and MPS now. 
# This should probably enumerate all available device type test base classes. def get_all_device_types() -> List[str]: - return ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + return ["cpu"] if not torch.cuda.is_available() else ["cpu", "cuda"] + def any_common_cpu_device_one(): - return OpDTypes.any_common_cpu_xpu_one if TEST_XPU else OpDTypes.any_common_cpu_cuda_one + return ( + OpDTypes.any_common_cpu_xpu_one + if TEST_XPU + else OpDTypes.any_common_cpu_cuda_one + ) + def has_gpu_device(devices: List[str]): return "cuda" in devices or "xpu" in devices From 4c7ac905560fe7cc9dded063571f7b1a23e7e1fd Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 25 Jun 2024 20:06:12 -0700 Subject: [PATCH 24/37] fix an mkldnn blas error message --- aten/src/ATen/native/mkldnn/xpu/Blas.cpp | 4 ++++ test/xpu/test_conv.py | 2 +- test/xpu/test_gemm.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/mkldnn/xpu/Blas.cpp b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp index 6cba3f4c9fa18..9dcd6f6bc10c9 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Blas.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp @@ -131,6 +131,10 @@ Tensor& mm_out(const Tensor& self, const Tensor& mat2, Tensor& result) { "x", mat2.sizes()[1], ")"); + TORCH_CHECK( + self.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", self.dtype(), " != ", mat2.dtype() + ) result.resize_({self.size(0), mat2.size(1)}); if (self.numel() == 0 || mat2.numel() == 0) { diff --git a/test/xpu/test_conv.py b/test/xpu/test_conv.py index f3d4375213f02..632d4a356d286 100644 --- a/test/xpu/test_conv.py +++ b/test/xpu/test_conv.py @@ -1264,7 +1264,7 @@ def test_channels_last_ouput_stride(self, device, dtype): assert_size_stride(out, (2, 512, 7, 7), (25088, 1, 3584, 512)) -instantiate_device_type_tests(TestConvolutionNNDeviceType, globals(), only_for="xpu") +instantiate_device_type_tests(TestConvolutionNNDeviceType, globals(), only_for="xpu", allow_xpu=True) if __name__ == "__main__": run_tests() diff --git a/test/xpu/test_gemm.py b/test/xpu/test_gemm.py index 0157677a582f2..2bc6d09eeea73 100644 --- a/test/xpu/test_gemm.py +++ b/test/xpu/test_gemm.py @@ -1142,7 +1142,7 @@ def test_matmul_out_kernel_errors_with_autograd(self, device, dtype): torch.matmul(a, b, out=c) -instantiate_device_type_tests(TestBasicGEMM, globals(), only_for="xpu") +instantiate_device_type_tests(TestBasicGEMM, globals(), only_for="xpu", allow_xpu=True) if __name__ == "__main__": run_tests() From a3c3b038ba3ec98cbc7958032b0f5d9a0bf94d2c Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 23 Jul 2024 14:30:36 +0000 Subject: [PATCH 25/37] reverted skipOps interface and renamed get_backend_op_dict --- torch/testing/_internal/common_methods_invocations.py | 7 ++++--- torch/testing/_internal/common_utils.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index b50aea6067d79..c09ec878ddee9 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -37,7 +37,7 @@ TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR, TEST_XPU, get_backend_op_dict, + TEST_WITH_TORCHINDUCTOR, TEST_XPU, get_backend_ops, ) from torch.testing._utils import wrapper_set_seed @@ -24492,7 
+24492,8 @@ def skip(op_name, variant_name='', *, device_type=None, dtypes=None): return (op_name, variant_name, device_type, dtypes, False) -def skipOps(test_case_name, base_test_name, to_skip, all_opinfos=op_db): +def skipOps(test_case_name, base_test_name, to_skip): + all_opinfos = op_db for xfail in to_skip: op_name, variant_name, device_type, dtypes, expected_failure = xfail matching_opinfos = [o for o in all_opinfos @@ -24519,7 +24520,7 @@ def wrapped(fn): def apply_op_db_for_xpu(op_db_list: List[OpInfo]): # Get the supported op from yaml file. - supported_op_list = get_backend_op_dict(device='xpu')['supported_ops'] + supported_op_list = get_backend_ops(device='xpu')['supported_ops'] for op in op_db_list: # For refs ops get the name of the related torch_opinfo. diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index bff3501cb43f3..d7ee7a6677e9d 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5120,13 +5120,13 @@ def repl_frame(m): s = re.sub(r" +$", "", s, flags=re.MULTILINE) return s -def get_backend_op_dict(device='xpu'): - backend_op_dict = {} +def get_backend_ops(device='xpu'): + backend_ops = {} if TEST_XPU and device == 'xpu': xpu_op_db = CI_TEST_PREFIX + "/" + device + "/op_db.yaml" try: with open(xpu_op_db) as stream: - backend_op_dict = yaml.safe_load(stream) + backend_ops = yaml.safe_load(stream) except yaml.YAMLError or FileExistsError: print("Error in loading op_db.yaml.") - return backend_op_dict \ No newline at end of file + return backend_ops \ No newline at end of file From 0b3d57bb0ef423f41341c145b5a98ddc13b57289 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Tue, 8 Oct 2024 01:54:49 +0000 Subject: [PATCH 26/37] retrigger checks From af96524d469d458dec8424da5cae12f9abc0f630 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 29 Oct 2024 06:26:33 +0000 Subject: [PATCH 27/37] unified the UT infrastructure api to support diffrent platform, passed TestCommon --- test/test_ops.py | 62 +++++++++---------- torch/testing/_internal/common_device_type.py | 12 ++-- .../_internal/common_methods_invocations.py | 27 ++++++-- torch/testing/_internal/common_utils.py | 27 ++++++++ torch/testing/_internal/opinfo/core.py | 3 +- .../_internal/opinfo/definitions/linalg.py | 55 ++++++++++++++-- .../_internal/opinfo/definitions/special.py | 4 ++ 7 files changed, 144 insertions(+), 46 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 658d19fe77ead..e121f39034c8c 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -27,16 +27,15 @@ from torch.testing._internal.common_device_type import ( any_common_cpu_device_one, deviceCountAtLeast, - has_gpu_device, instantiate_device_type_tests, onlyCPU, - onlyCUDA, - onlyCUDAAndXPU, + onlyGPU, onlyNativeDeviceTypesAnd, OpDTypes, ops, skipMeta, skipXPU, + is_gpu_device, ) from torch.testing._internal.common_dtype import ( all_types_and_complex_and, @@ -87,9 +86,9 @@ assert torch.get_default_dtype() == torch.float32 -if TEST_XPU: - apply_op_db_for(op_db, device="xpu") - apply_op_db_for(python_ref_db, device="xpu") +# if TEST_XPU: +# apply_op_db_for(op_db, device="xpu") +# apply_op_db_for(python_ref_db, device="xpu") # variant testing is only done with torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio @@ -124,7 +123,7 @@ def reduction_dtype_filter(op): # Create a list of operators that are a subset of _ref_test_ops but don't have a -# numpy ref to compare them too, If both CPU and CUDA are 
compared to numpy +# numpy ref to compare them too, If both CPU and GPU are compared to numpy # then they do not need to be compared to each other _ops_and_refs_with_no_numpy_ref = [op for op in ops_and_refs if op.ref is None] @@ -155,22 +154,22 @@ def tearDownClass(cls): assert len(filtered_ops) == 0, err_msg - # Validates that each OpInfo works correctly on different CUDA devices - @onlyCUDAAndXPU + # Validates that each OpInfo works correctly on different GPU devices + @onlyGPU @deviceCountAtLeast(2) @ops(op_db, allowed_dtypes=(torch.float32, torch.long)) def test_multiple_devices(self, devices, dtype, op): - for cuda_device_str in devices: - cuda_device = torch.device(cuda_device_str) + for gpu_device_str in devices: + gpu_device = torch.device(gpu_device_str) # NOTE: only tests on first sample - samples = op.sample_inputs(cuda_device, dtype) + samples = op.sample_inputs(gpu_device, dtype) sample = first_sample(self, samples) result = op(sample.input, *sample.args, **sample.kwargs) if isinstance(result, torch.Tensor): - self.assertTrue(result.device == cuda_device) + self.assertTrue(result.device == gpu_device) elif is_iterable_of_tensors(result): - self.assertTrue(all(t.device == cuda_device for t in result)) + self.assertTrue(all(t.device == gpu_device for t in result)) else: self.skipTest( "Skipped! Only supports single tensor or iterable of tensor outputs." @@ -275,7 +274,7 @@ def test_numpy_ref(self, device, dtype, op): and op.formatted_name in ("signal_windows_exponential", "signal_windows_bartlett") and dtype == torch.float64 - and has_gpu_device(device) + and is_gpu_device(device) or "cpu" in device ): # noqa: E121 raise unittest.SkipTest("XXX: raises tensor-likes are not close.") @@ -288,7 +287,7 @@ def test_numpy_ref(self, device, dtype, op): ) # Tests that the cpu and gpu results are consistent - @onlyCUDAAndXPU + @onlyGPU @suppress_warnings @slowTest @ops(_ops_and_refs_with_no_numpy_ref, dtypes=any_common_cpu_device_one()) @@ -302,20 +301,20 @@ def to_cpu(arg): for sample in samples: cpu_sample = sample.transform(to_cpu) - cuda_results = op(sample.input, *sample.args, **sample.kwargs) + gpu_results = op(sample.input, *sample.args, **sample.kwargs) cpu_results = op(cpu_sample.input, *cpu_sample.args, **cpu_sample.kwargs) # output_process_fn_grad has a very unfortunate name # We use this function in linalg extensively to postprocess the inputs of functions # that are not completely well-defined. Think svd and muliplying the singular vectors by -1. - # CPU and CUDA implementations of the SVD can return valid SVDs that are different. + # CPU and GPU implementations of the SVD can return valid SVDs that are different. # We use this function to compare them. - cuda_results = sample.output_process_fn_grad(cuda_results) + gpu_results = sample.output_process_fn_grad(gpu_results) cpu_results = cpu_sample.output_process_fn_grad(cpu_results) # Lower tolerance because we are running this as a `@slowTest` # Don't want the periodic tests to fail frequently - self.assertEqual(cuda_results, cpu_results, atol=1e-3, rtol=1e-3) + self.assertEqual(gpu_results, cpu_results, atol=1e-3, rtol=1e-3) # Tests that experimental Python References can propagate shape, dtype, # and device metadata properly. 
@@ -545,7 +544,7 @@ def test_python_ref_torch_fallback(self, device, dtype, op): self._ref_test_helper(contextlib.nullcontext, device, dtype, op) @unittest.skipIf(TEST_WITH_ASAN, "Skipped under ASAN") - @onlyCUDAAndXPU + @onlyGPU @ops(python_ref_db) @parametrize("executor", ["aten"]) @skipIfTorchInductor("Takes too long for inductor") @@ -791,13 +790,12 @@ def _extract_strides(out): return tuple(t.stride() for t in out) # Extracts data pointers from a tensor or iterable of tensors into a tuple - # NOTE: only extracts on the CPU and CUDA device types since some + # NOTE: only extracts on the CPU and GPU device types since some # device types don't have storage def _extract_data_ptrs(out): if ( self.device_type != "cpu" - and self.device_type != "cuda" - and self.device_type != "xpu" + and self.device_type in GPU_TYPES ): return () @@ -923,13 +921,12 @@ def _extract_strides(out): return tuple(t.stride() for t in out) # Extracts data pointers from a tensor or iterable of tensors into a tuple - # NOTE: only extracts on the CPU and CUDA device types since some + # NOTE: only extracts on the CPU and GPU device types since some # device types don't have storage def _extract_data_ptrs(out): if ( self.device_type != "cpu" - and self.device_type != "cuda" - and self.device_type != "xpu" + and self.device_type not in GPU_TYPES ): return () @@ -1005,8 +1002,8 @@ def _case_two_transform(t): wrong_device = None if torch.device(device).type != "cpu": wrong_device = "cpu" - elif torch.cuda.is_available(): - wrong_device = "cuda" + elif HAS_GPU: + wrong_device = GPU_TYPE factory_fn_msg = ( "\n\nNOTE: If your op is a factory function (i.e., it accepts TensorOptions) you should mark its " @@ -1389,7 +1386,7 @@ def convert_boolean_tensors(x): self.assertEqual(expect, actual) # Validates that each OpInfo specifies its forward and backward dtypes - # correctly for CPU and CUDA devices + # correctly for CPU and GPU devices @skipMeta @onlyNativeDeviceTypesAnd(["hpu"]) @ops(ops_and_refs, dtypes=OpDTypes.none) @@ -2361,6 +2358,7 @@ def test_refs_are_in_decomp_table(self, op): # TODO: investigate/fix fake_autocast_device_skips["cpu"] = {"linalg.pinv"} fake_autocast_device_skips["cuda"] = {"linalg.pinv", "pinverse"} +fake_autocast_device_skips["xpu"] = {"linalg.pinv", "pinverse"} dynamic_output_op_tests = ( @@ -2647,7 +2645,7 @@ def _test_fake_crossref_helper(self, device, dtype, op, context): except torch._subclasses.fake_tensor.UnsupportedOperatorException: pass - @onlyCUDA + @onlyGPU @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) @skipOps( "TestFakeTensor", "test_fake_crossref_backward_no_amp", fake_backward_xfails @@ -2655,7 +2653,7 @@ def _test_fake_crossref_helper(self, device, dtype, op, context): def test_fake_crossref_backward_no_amp(self, device, dtype, op): self._test_fake_crossref_helper(device, dtype, op, contextlib.nullcontext) - @onlyCUDA + @onlyGPU @ops([op for op in op_db if op.supports_autograd], allowed_dtypes=(torch.float,)) @skipOps( "TestFakeTensor", diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index e4608c24f224c..8b44a404615dc 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -59,6 +59,8 @@ TEST_WITH_UBSAN, TEST_XPU, TestCase, + GPU_TYPE, + GPU_TYPES, ) @@ -1628,6 +1630,10 @@ def onlyHPU(fn): return onlyOn("hpu")(fn) +def onlyGPU(fn): + return onlyOn(GPU_TYPES)(fn) + + def onlyPRIVATEUSE1(fn): device_type = 
torch._C._get_privateuse1_backend_name() device_mod = getattr(torch, device_type, None) @@ -1649,8 +1655,7 @@ def only_fn(self, *args, **kwargs): return only_fn -def onlyCUDAAndXPU(fn): - return onlyOn(["cuda", "xpu"])(fn) + def disablecuDNN(fn): @@ -1983,6 +1988,5 @@ def any_common_cpu_device_one(): else OpDTypes.any_common_cpu_cuda_one ) - -def has_gpu_device(devices: List[str]): +def is_gpu_device(devices: List[str]): return "cuda" in devices or "xpu" in devices diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 7fb25ff9ed705..98255b0fb0c70 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -27,7 +27,7 @@ (onlyCPU, onlyCUDA, onlyNativeDeviceTypes, disablecuDNN, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIf, precisionOverride, skipCPUIfNoMklSparse, - toleranceOverride, tol) + toleranceOverride, tol, skipXPU) from torch.testing._internal.common_cuda import ( PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, SM53OrLater, SM80OrLater, SM90OrLater, with_tf32_off, TEST_CUDNN, _get_torch_cuda_version, @@ -12112,6 +12112,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_fwgrad_bwgrad=True, gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=sample_inputs_addmm, + decorators=[skipXPU,], skips=( # Issue with conj and torch dispatch, see https://github.com/pytorch/pytorch/issues/82479 DecorateInfo( @@ -12153,6 +12154,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo( toleranceOverride({torch.half: tol(atol=1e-5, rtol=3e-3)}), 'TestInductorOpInfo', 'test_comprehensive', device_type='cpu'), + skipXPU, ], sample_inputs_func=sample_inputs_addmv), OpInfo('addbmm', @@ -12197,6 +12199,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), # https://github.com/pytorch/pytorch/issues/55907 DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_variant_consistency_eager'), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_numpy_ref', device_type='xpu', dtypes=[torch.float64, torch.complex128,]), ), sample_inputs_func=sample_inputs_addbmm), OpInfo('baddbmm', @@ -13458,7 +13461,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skipCUDAIf(not ((_get_torch_cuda_version() >= (11, 3)) or (_get_torch_rocm_version() >= (5, 2))), "cusparseSDDMM was added in 11.2.1"), - skipCPUIfNoMklSparse, ], + skipCPUIfNoMklSparse, skipXPU, ], skips=( # NotImplementedError: Tensors of type SparseCsrTensorImpl do not have is_contiguous DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples'), @@ -19984,6 +19987,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_sparse_csc=True, check_batched_grad=False, check_batched_gradgrad=False, + decorators=[skipXPU,], skips=( # NotImplementedError: Could not run 'aten::normal_' with arguments from the 'SparseCPU' backend DecorateInfo(unittest.skip(""), 'TestCommon', 'test_noncontiguous_samples'), @@ -20635,7 +20639,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # AssertionError: Tensor-likes are not close! 
# Fails in cuda11.7 # Error Log: https://github.com/pytorch/pytorch/actions/runs/3440108478/jobs/5738475757 - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type=['cuda', 'xpu']), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),),), # In training mode, feature_alpha_dropout currently doesn't support inputs of complex dtype # unlike when `train=False`, it supports complex inputs, hence 2 OpInfos to cover all cases @@ -21330,6 +21334,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_gradgrad=False, skips=( DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), + DecorateInfo(unittest.skip("Skipped!"), None, None, device_type='xpu', dtypes=[torch.float64,]), ) ), OpInfo( @@ -22868,6 +22873,20 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): dtypes=(torch.float32,), device_type='cpu', ), + DecorateInfo( + unittest.skip("Skipped!"), + None, + None, + device_type='xpu', + dtypes=[torch.float64,], + ), + DecorateInfo( + unittest.skip("Skipped!"), + 'TestCommon', + 'test_dtypes', + device_type='xpu', + dtypes=None, + ), )), PythonRefInfo( "_refs.nn.functional.leaky_relu", @@ -24737,4 +24756,4 @@ def apply_op_db_for_xpu(op_db_list: List[OpInfo]): def apply_op_db_for(op_db_list: List[OpInfo], device='xpu'): if TEST_XPU and device == 'xpu': - apply_op_db_for_xpu(op_db_list) \ No newline at end of file + apply_op_db_for_xpu(op_db_list) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 54ea45cbfa74b..672a215878690 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5397,3 +5397,30 @@ def get_backend_ops(device='xpu'): except yaml.YAMLError or FileExistsError: print("Error in loading op_db.yaml.") return backend_ops + +GPU_TYPES = ["cuda", "xpu"] + +# defines here before import torch._dynamo is for avoiding circular import +# when get_gpu_type is imported from dynamo +@functools.lru_cache(None) +def get_gpu_type(): + avail_gpus = [x for x in GPU_TYPES if getattr(torch, x).is_available()] + assert len(avail_gpus) <= 1 + gpu_type = "cuda" if len(avail_gpus) == 0 else avail_gpus.pop() + return gpu_type + +HAS_CUDA = torch.cuda.is_available() + +HAS_XPU = torch.xpu.is_available() + +HAS_GPU = HAS_CUDA or HAS_XPU + +GPU_TYPE = get_gpu_type() + +HAS_MULTIGPU = any( + getattr(torch, gpu).is_available() and getattr(torch, gpu).device_count() >= 2 + for gpu in GPU_TYPES +) + +def get_gpu_autocast() + return torch.cuda.amp.autocast if HAS_CUDA else torch.xpu.amp.autocast diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py index f1f590527aad6..6c91bb96646d0 100644 --- a/torch/testing/_internal/opinfo/core.py +++ b/torch/testing/_internal/opinfo/core.py @@ -107,7 +107,8 @@ def is_active(self, cls_name, test_name, device_type, dtype, param_kwargs): self.active_if and (self.cls_name is None or self.cls_name == cls_name) and (self.test_name is None or self.test_name == test_name) - and (self.device_type is None or self.device_type == device_type) + and (self.device_type is None or (self.device_type == device_type + if isinstance(self.device_type, str) else device_type in self.device_type)) and (self.dtypes is None or dtype in self.dtypes) # Support callables over kwargs to determine if the decorator is 
active. and ( diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py index e94c6a6711443..51cac4000c786 100644 --- a/torch/testing/_internal/opinfo/definitions/linalg.py +++ b/torch/testing/_internal/opinfo/definitions/linalg.py @@ -28,6 +28,7 @@ skipCUDAIfRocm, tol, toleranceOverride, + skipXPU, ) from torch.testing._internal.common_dtype import ( all_types_and_complex, @@ -1430,7 +1431,7 @@ def make_input(): check_batched_gradgrad=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack], + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack, ], skips=( DecorateInfo( unittest.skip("Skipped!"), @@ -1453,6 +1454,20 @@ def make_input(): device_type="mps", dtypes=[torch.float32], ), + DecorateInfo( + unittest.expectedFailure, + "TestCommon", + "test_types", + device_type='xpu', + dtypes=None, + ), + DecorateInfo( + unittest.expectedFailure, + None, + None, + device_type='xpu', + dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ], + ), ), ), OpInfo( @@ -1874,10 +1889,13 @@ def make_input(): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_linalg_lu, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, ], + skips=( # linalg.lu_factor: LU without pivoting is not implemented on the CPU DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), + DecorateInfo(unittest.expectedFailure, "TestCommon", "test_types", device_type='xpu', dtypes=None), + DecorateInfo(unittest.expectedFailure, None, None, device_type='xpu', dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ]), ), ), OpInfo( @@ -1890,10 +1908,12 @@ def make_input(): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_linalg_lu, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, ], skips=( # linalg.lu_factor: LU without pivoting is not implemented on the CPU DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), + DecorateInfo(unittest.expectedFailure, "TestCommon", "test_types", device_type='xpu', dtypes=None), + DecorateInfo(unittest.expectedFailure, None, None, device_type='xpu', dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ]), ), ), OpInfo( @@ -1907,10 +1927,12 @@ def make_input(): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_linalg_lu, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, ], skips=( # linalg.lu_factor: LU without pivoting is not implemented on the CPU DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), + DecorateInfo(unittest.expectedFailure, "TestCommon", "test_types", device_type='xpu', dtypes=None), + DecorateInfo(unittest.expectedFailure, None, None, device_type='xpu', dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ]), ), ), OpInfo( @@ -2284,7 +2306,7 @@ def make_input(): check_batched_grad=False, check_batched_gradgrad=False, sample_inputs_func=sample_inputs_svd, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, with_tf32_off], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, with_tf32_off,], skips=( DecorateInfo( unittest.skip("Skipped!"), @@ -2323,6 +2345,13 @@ def 
make_input(): dtypes=[torch.float32], active_if=TEST_WITH_ROCM, ), + DecorateInfo( + unittest.skip("Skipped!"), + None, + None, + device_type="xpu", + dtypes=[torch.float64, torch.complex64, torch.complex128, ], + ), ), ), OpInfo( @@ -2373,6 +2402,13 @@ def make_input(): "TestCommon", "test_numpy_ref_mps", ), + DecorateInfo( + unittest.skip("Unsupported on MPS for now"), + "TestCommon", + "test_numpy_ref", + device_type="xpu", + dtypes=[torch.float64, torch.complex128,], + ), ), ), OpInfo( @@ -2471,6 +2507,15 @@ def make_input(): torch_opinfo_name="linalg.svd", supports_out=True, op_db=op_db, + skips=( + DecorateInfo( + unittest.skip("Skipped!"), + None, + None, + device_type="xpu", + dtypes=[torch.float64, torch.complex64, torch.complex128, ], + ), + ), ), PythonRefInfo( "_refs.linalg.svdvals", diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py index f153deacaa99e..de49b845a2640 100644 --- a/torch/testing/_internal/opinfo/definitions/special.py +++ b/torch/testing/_internal/opinfo/definitions/special.py @@ -13,6 +13,7 @@ precisionOverride, tol, toleranceOverride, + skipXPU, ) from torch.testing._internal.common_dtype import all_types_and, floating_types from torch.testing._internal.common_utils import ( @@ -239,6 +240,7 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_float=True, supports_autograd=False, supports_one_python_scalar=True, + decorators=[skipXPU,], skips=( # Reference reference_inputs nans and infs on cuda and nan, inf, 0., -inf for cpu DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), @@ -461,6 +463,7 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs): "special.hermite_polynomial_h", dtypes=all_types_and(torch.bool), promotes_int_to_float=True, + decorators=[skipXPU,], skips=( DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"), DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"), @@ -834,6 +837,7 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs): torch_opinfo_name="special.zeta", supports_one_python_scalar=True, op_db=op_db, + decorators = [skipXPU,], skips=( # Reference reference_inputs nans and infs on cuda and nan, inf, 0., -inf for cpu DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), From 4e2d28c971feba8d85936e317207ecc8f6443571 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Fri, 15 Nov 2024 14:18:45 +0000 Subject: [PATCH 28/37] fix GPU_TYPES import --- test/test_ops.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index e121f39034c8c..ca9a12b8d19a9 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -36,6 +36,7 @@ skipMeta, skipXPU, is_gpu_device, + GPU_TYPES, ) from torch.testing._internal.common_dtype import ( all_types_and_complex_and, @@ -1388,6 +1389,7 @@ def convert_boolean_tensors(x): # Validates that each OpInfo specifies its forward and backward dtypes # correctly for CPU and GPU devices @skipMeta + @skipXPU @onlyNativeDeviceTypesAnd(["hpu"]) @ops(ops_and_refs, dtypes=OpDTypes.none) def test_dtypes(self, device, op): @@ -1978,6 +1980,7 @@ def clone_and_perform_view(input, **kwargs): self.assertEqual(tensor.grad, cloned1_tensor.grad) @ops(ops_and_refs, allowed_dtypes=(torch.cfloat,)) + @skipXPU def test_conj_view(self, device, dtype, op): if not op.test_conjugated_samples: self.skipTest("Operation doesn't support conjugated inputs.") @@ -2019,6 +2022,7 @@ 
def test_neg_view(self, device, dtype, op): ) @ops(ops_and_refs, allowed_dtypes=(torch.cdouble,)) + @skipXPU def test_neg_conj_view(self, device, dtype, op): if not op.test_neg_view: self.skipTest("Operation not tested with tensors with negative bit.") @@ -2673,12 +2677,12 @@ def test_strided_layout(self, device, dtype, op): self.assertEqual(strided_result.layout, torch.strided) -instantiate_device_type_tests(TestCommon, globals(), allow_xpu=True) -instantiate_device_type_tests(TestCompositeCompliance, globals(), allow_xpu=True) -instantiate_device_type_tests(TestMathBits, globals(), allow_xpu=True) +instantiate_device_type_tests(TestCommon, globals()) +instantiate_device_type_tests(TestCompositeCompliance, globals()) +instantiate_device_type_tests(TestMathBits, globals()) instantiate_device_type_tests(TestRefsOpsInfo, globals(), only_for="cpu") -instantiate_device_type_tests(TestFakeTensor, globals(), allow_xpu=True) -instantiate_device_type_tests(TestTags, globals(), allow_xpu=True) +instantiate_device_type_tests(TestFakeTensor, globals()) +instantiate_device_type_tests(TestTags, globals()) if __name__ == "__main__": TestCase._default_dtype_check_enabled = True From af203f2ee53a8a62ab1dc4922e89eea3327eccb9 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Fri, 15 Nov 2024 14:24:09 +0000 Subject: [PATCH 29/37] fix GPU_TYPES import --- test/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index ca9a12b8d19a9..33c87ac24d155 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -36,7 +36,6 @@ skipMeta, skipXPU, is_gpu_device, - GPU_TYPES, ) from torch.testing._internal.common_dtype import ( all_types_and_complex_and, @@ -80,6 +79,7 @@ TEST_XPU, TestCase, unMarkDynamoStrictTest, + GPU_TYPES, ) from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_map From a7759087ffec563907eac2be9d6718083ad19994 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 01:55:53 +0000 Subject: [PATCH 30/37] fix typo --- torch/testing/_internal/common_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index d81845e280511..b285c3a8e44e4 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5523,6 +5523,6 @@ def get_gpu_type(): for gpu in GPU_TYPES ) -def get_gpu_autocast() +def get_gpu_autocast(): return torch.cuda.amp.autocast if HAS_CUDA else torch.xpu.amp.autocast From 9c591c660311cd6f3f8f9dba06730de7227befce Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 05:50:48 +0000 Subject: [PATCH 31/37] remove xpu backend specific code in --- test/test_ops.py | 5 -- test/xpu/op_db.yaml | 72 ------------------- .../_internal/common_methods_invocations.py | 40 +---------- .../_internal/opinfo/definitions/linalg.py | 55 ++------------ 4 files changed, 8 insertions(+), 164 deletions(-) delete mode 100644 test/xpu/op_db.yaml diff --git a/test/test_ops.py b/test/test_ops.py index 8e9de589e1417..ee99834f61184 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -43,7 +43,6 @@ integral_types_and, ) from torch.testing._internal.common_methods_invocations import ( - apply_op_db_for, BinaryUfuncInfo, op_db, ops_and_refs, @@ -87,10 +86,6 @@ assert torch.get_default_dtype() == torch.float32 -# if TEST_XPU: -# apply_op_db_for(op_db, device="xpu") -# apply_op_db_for(python_ref_db, device="xpu") - # variant testing is only done with 
torch.float and torch.cfloat to avoid # excessive test times and maximize signal to noise ratio _variant_ops = partial( diff --git a/test/xpu/op_db.yaml b/test/xpu/op_db.yaml deleted file mode 100644 index 144513cdc7f5a..0000000000000 --- a/test/xpu/op_db.yaml +++ /dev/null @@ -1,72 +0,0 @@ -# Owner(s): ["module: intel"] -# Define the supported Aten ops in XPU backend, the dtypes are aligned with other GPUs. -supported_ops: - - fill - - zeros - - zeros_like - - clone - - view_as_real - - view_as_complex - - view - - resize_ - - resize_as_ - - add - - sub - - abs - - bitwise_and - - bitwise_not - - bitwise_or - - bitwise_xor - - clamp - - clamp_max - - clamp_min - - clone - - copy - - cumsum - - empty - - eq - - fill - - fmod - - gcd - - ge - - gelu - - gt - - index_add - - index_put - - index_select - - isnan - - le - - lt - - masked_fill - - maximum - - minimum - - native_dropout_backward - - ne - - neg - - nn.functional.adaptive_avg_pool2d - - nn.functional.threshold - - nonzero - - normal - - reciprocal - - rsub - - relu - - remainder - - reshape - - unfold - - uniform - - view - - zero - - add - - any - - arange - - as_strided - - flip - - tril - - triu - - cat - - log_softmax - - softmax - - scatter - - gather - - max_pool2d_with_indices_backward - - nn.functional.unfold diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index a683f3283410b..87aa9a4f949f2 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -27,7 +27,7 @@ (onlyCPU, onlyCUDA, onlyNativeDeviceTypes, disablecuDNN, skipCUDAIfNoMagma, skipCUDAIfNoMagmaAndNoCusolver, skipCUDAIfNoCusolver, skipCPUIfNoLapack, skipCPUIfNoFFT, skipCUDAIf, precisionOverride, skipCPUIfNoMklSparse, - toleranceOverride, tol, skipXPU) + toleranceOverride, tol) from torch.testing._internal.common_cuda import ( PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, SM53OrLater, SM80OrLater, SM90OrLater, with_tf32_off, TEST_CUDNN, _get_torch_cuda_version, @@ -12173,7 +12173,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_fwgrad_bwgrad=True, gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=sample_inputs_addmm, - decorators=[skipXPU,], skips=( # Issue with conj and torch dispatch, see https://github.com/pytorch/pytorch/issues/82479 DecorateInfo( @@ -12215,7 +12214,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo( toleranceOverride({torch.half: tol(atol=1e-5, rtol=3e-3)}), 'TestInductorOpInfo', 'test_comprehensive', device_type='cpu'), - skipXPU, ], sample_inputs_func=sample_inputs_addmv), OpInfo('addbmm', @@ -13522,7 +13520,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skipCUDAIf(not ((_get_torch_cuda_version() >= (11, 3)) or (_get_torch_rocm_version() >= (5, 2))), "cusparseSDDMM was added in 11.2.1"), - skipCPUIfNoMklSparse, skipXPU, ], + skipCPUIfNoMklSparse, ], skips=( # NotImplementedError: Tensors of type SparseCsrTensorImpl do not have is_contiguous DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples'), @@ -20034,7 +20032,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_sparse_csc=True, check_batched_grad=False, check_batched_gradgrad=False, - decorators=[skipXPU,], skips=( # NotImplementedError: Could not run 'aten::normal_' with arguments from the 'SparseCPU' backend 
DecorateInfo(unittest.skip(""), 'TestCommon', 'test_noncontiguous_samples'), @@ -21378,7 +21375,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_gradgrad=False, skips=( DecorateInfo(unittest.skip("Unsupported on MPS for now"), 'TestCommon', 'test_numpy_ref_mps'), - DecorateInfo(unittest.skip("Skipped!"), None, None, device_type='xpu', dtypes=[torch.float64,]), ) ), OpInfo( @@ -22924,20 +22920,6 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): dtypes=(torch.float32,), device_type='cpu', ), - DecorateInfo( - unittest.skip("Skipped!"), - None, - None, - device_type='xpu', - dtypes=[torch.float64,], - ), - DecorateInfo( - unittest.skip("Skipped!"), - 'TestCommon', - 'test_dtypes', - device_type='xpu', - dtypes=None, - ), )), PythonRefInfo( "_refs.nn.functional.leaky_relu", @@ -24791,20 +24773,4 @@ def skipOps(test_case_name, base_test_name, to_skip): # This decorator doesn't modify fn in any way def wrapped(fn): return fn - return wrapped - -def apply_op_db_for_xpu(op_db_list: List[OpInfo]): - # Get the supported op from yaml file. - supported_op_list = get_backend_ops(device='xpu')['supported_ops'] - - for op in op_db_list: - # For refs ops get the name of the related torch_opinfo. - torch_opinfo = op.torch_opinfo if hasattr(op, "torch_opinfo") else None - name = torch_opinfo.name if torch_opinfo is not None else op.name - if name not in supported_op_list: - # Update op_db, add unittest.skip decorators to skip the op for the backend. - op.decorators = (*op.decorators, DecorateInfo(unittest.skip, device_type='xpu', dtypes=None)) - -def apply_op_db_for(op_db_list: List[OpInfo], device='xpu'): - if TEST_XPU and device == 'xpu': - apply_op_db_for_xpu(op_db_list) + return wrapped \ No newline at end of file diff --git a/torch/testing/_internal/opinfo/definitions/linalg.py b/torch/testing/_internal/opinfo/definitions/linalg.py index 51cac4000c786..e94c6a6711443 100644 --- a/torch/testing/_internal/opinfo/definitions/linalg.py +++ b/torch/testing/_internal/opinfo/definitions/linalg.py @@ -28,7 +28,6 @@ skipCUDAIfRocm, tol, toleranceOverride, - skipXPU, ) from torch.testing._internal.common_dtype import ( all_types_and_complex, @@ -1431,7 +1430,7 @@ def make_input(): check_batched_gradgrad=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack, ], + decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack], skips=( DecorateInfo( unittest.skip("Skipped!"), @@ -1454,20 +1453,6 @@ def make_input(): device_type="mps", dtypes=[torch.float32], ), - DecorateInfo( - unittest.expectedFailure, - "TestCommon", - "test_types", - device_type='xpu', - dtypes=None, - ), - DecorateInfo( - unittest.expectedFailure, - None, - None, - device_type='xpu', - dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ], - ), ), ), OpInfo( @@ -1889,13 +1874,10 @@ def make_input(): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_linalg_lu, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, ], - + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack], skips=( # linalg.lu_factor: LU without pivoting is not implemented on the CPU DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), - DecorateInfo(unittest.expectedFailure, "TestCommon", "test_types", device_type='xpu', dtypes=None), - DecorateInfo(unittest.expectedFailure, None, None, device_type='xpu', dtypes=[torch.complex, torch.float64, torch.complex64, 
torch.complex128, ]), ), ), OpInfo( @@ -1908,12 +1890,10 @@ def make_input(): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_linalg_lu, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, ], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack], skips=( # linalg.lu_factor: LU without pivoting is not implemented on the CPU DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), - DecorateInfo(unittest.expectedFailure, "TestCommon", "test_types", device_type='xpu', dtypes=None), - DecorateInfo(unittest.expectedFailure, None, None, device_type='xpu', dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ]), ), ), OpInfo( @@ -1927,12 +1907,10 @@ def make_input(): supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_linalg_lu, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, ], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack], skips=( # linalg.lu_factor: LU without pivoting is not implemented on the CPU DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), - DecorateInfo(unittest.expectedFailure, "TestCommon", "test_types", device_type='xpu', dtypes=None), - DecorateInfo(unittest.expectedFailure, None, None, device_type='xpu', dtypes=[torch.complex, torch.float64, torch.complex64, torch.complex128, ]), ), ), OpInfo( @@ -2306,7 +2284,7 @@ def make_input(): check_batched_grad=False, check_batched_gradgrad=False, sample_inputs_func=sample_inputs_svd, - decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, with_tf32_off,], + decorators=[skipCUDAIfNoMagmaAndNoCusolver, skipCPUIfNoLapack, with_tf32_off], skips=( DecorateInfo( unittest.skip("Skipped!"), @@ -2345,13 +2323,6 @@ def make_input(): dtypes=[torch.float32], active_if=TEST_WITH_ROCM, ), - DecorateInfo( - unittest.skip("Skipped!"), - None, - None, - device_type="xpu", - dtypes=[torch.float64, torch.complex64, torch.complex128, ], - ), ), ), OpInfo( @@ -2402,13 +2373,6 @@ def make_input(): "TestCommon", "test_numpy_ref_mps", ), - DecorateInfo( - unittest.skip("Unsupported on MPS for now"), - "TestCommon", - "test_numpy_ref", - device_type="xpu", - dtypes=[torch.float64, torch.complex128,], - ), ), ), OpInfo( @@ -2507,15 +2471,6 @@ def make_input(): torch_opinfo_name="linalg.svd", supports_out=True, op_db=op_db, - skips=( - DecorateInfo( - unittest.skip("Skipped!"), - None, - None, - device_type="xpu", - dtypes=[torch.float64, torch.complex64, torch.complex128, ], - ), - ), ), PythonRefInfo( "_refs.linalg.svdvals", From af351f031c78097e63e995b8c2113225cb3cd528 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 05:57:18 +0000 Subject: [PATCH 32/37] further remove xpu backend specific skips --- torch/testing/_internal/common_methods_invocations.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 87aa9a4f949f2..d6a83a259bdee 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -38,7 +38,7 @@ TEST_WITH_ROCM, IS_FBCODE, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR, TEST_XPU, get_backend_ops, + TEST_WITH_TORCHINDUCTOR, ) from torch.testing._utils import wrapper_set_seed @@ -12258,7 +12258,6 @@ def 
sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), # https://github.com/pytorch/pytorch/issues/55907 DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_variant_consistency_eager'), - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_numpy_ref', device_type='xpu', dtypes=[torch.float64, torch.complex128,]), ), sample_inputs_func=sample_inputs_addbmm), OpInfo('baddbmm', @@ -20685,7 +20684,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # AssertionError: Tensor-likes are not close! # Fails in cuda11.7 # Error Log: https://github.com/pytorch/pytorch/actions/runs/3440108478/jobs/5738475757 - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type=['cuda', 'xpu']), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type=['cuda']), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),),), # In training mode, feature_alpha_dropout currently doesn't support inputs of complex dtype # unlike when `train=False`, it supports complex inputs, hence 2 OpInfos to cover all cases From 8381e1845da4cd2d12dfe810f2967c7f3e5e4e13 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 05:59:56 +0000 Subject: [PATCH 33/37] further remove xpu backend specific skips --- torch/testing/_internal/common_methods_invocations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index d6a83a259bdee..d751091f421dc 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -20684,7 +20684,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # AssertionError: Tensor-likes are not close! 
# Fails in cuda11.7 # Error Log: https://github.com/pytorch/pytorch/actions/runs/3440108478/jobs/5738475757 - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type=['cuda']), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type='cuda'), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),),), # In training mode, feature_alpha_dropout currently doesn't support inputs of complex dtype # unlike when `train=False`, it supports complex inputs, hence 2 OpInfos to cover all cases From 5f5d50fd8be399f7f17a42119aca1417deb66d8c Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 06:02:25 +0000 Subject: [PATCH 34/37] further remove xpu backend specific skips --- torch/testing/_internal/common_methods_invocations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index d751091f421dc..846c539305aed 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -38,7 +38,7 @@ TEST_WITH_ROCM, IS_FBCODE, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR, + TEST_WITH_TORCHINDUCTOR ) from torch.testing._utils import wrapper_set_seed From bd2f0b8863fd2ce3b7a62afa7a5d6c0af4ec7fa8 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 06:24:59 +0000 Subject: [PATCH 35/37] remove yaml dependency --- torch/testing/_internal/common_methods_invocations.py | 2 +- torch/testing/_internal/common_utils.py | 1 - torch/testing/_internal/opinfo/definitions/special.py | 4 ---- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 846c539305aed..1b88cb99fc1fb 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -38,7 +38,7 @@ TEST_WITH_ROCM, IS_FBCODE, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR + TEST_WITH_TORCHINDUCTOR ) from torch.testing._utils import wrapper_set_seed diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index b285c3a8e44e4..c5165a05d1793 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -36,7 +36,6 @@ import types import unittest import warnings -import yaml from collections.abc import Mapping, Sequence from contextlib import closing, contextmanager from copy import deepcopy diff --git a/torch/testing/_internal/opinfo/definitions/special.py b/torch/testing/_internal/opinfo/definitions/special.py index de49b845a2640..f153deacaa99e 100644 --- a/torch/testing/_internal/opinfo/definitions/special.py +++ b/torch/testing/_internal/opinfo/definitions/special.py @@ -13,7 +13,6 @@ precisionOverride, tol, toleranceOverride, - skipXPU, ) from torch.testing._internal.common_dtype import all_types_and, floating_types from torch.testing._internal.common_utils import ( @@ -240,7 +239,6 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_float=True, supports_autograd=False, supports_one_python_scalar=True, - 
decorators=[skipXPU,], skips=( # Reference reference_inputs nans and infs on cuda and nan, inf, 0., -inf for cpu DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), @@ -463,7 +461,6 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs): "special.hermite_polynomial_h", dtypes=all_types_and(torch.bool), promotes_int_to_float=True, - decorators=[skipXPU,], skips=( DecorateInfo(unittest.skip("Skipped!"), "TestCudaFuserOpInfo"), DecorateInfo(unittest.skip("Skipped!"), "TestNNCOpInfo"), @@ -837,7 +834,6 @@ def sample_inputs_erfcx(op_info, device, dtype, requires_grad, **kwargs): torch_opinfo_name="special.zeta", supports_one_python_scalar=True, op_db=op_db, - decorators = [skipXPU,], skips=( # Reference reference_inputs nans and infs on cuda and nan, inf, 0., -inf for cpu DecorateInfo(unittest.expectedFailure, "TestCommon", "test_compare_cpu"), From 6f513cbee126eb18422ed7e3f137c16d5dd39be2 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Mon, 18 Nov 2024 13:04:00 +0000 Subject: [PATCH 36/37] fix HAS_GPU import issue --- test/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index ee99834f61184..14ade931a6833 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -75,10 +75,10 @@ TEST_WITH_TORCHDYNAMO, TEST_WITH_TORCHINDUCTOR, TEST_WITH_UBSAN, - TEST_XPU, TestCase, unMarkDynamoStrictTest, GPU_TYPES, + HAS_GPU, ) from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_map From b29b5934d3fd9477f7f1cc2203fda8c39782a8b6 Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Thu, 21 Nov 2024 12:33:33 +0000 Subject: [PATCH 37/37] remove unused function get_backend_ops as design changes, return torch.autocast in get_gpu_autocast --- test/test_ops.py | 3 ++- torch/testing/_internal/common_utils.py | 15 +-------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index a28db437ba72f..104adc2641ab5 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -79,6 +79,7 @@ unMarkDynamoStrictTest, GPU_TYPES, HAS_GPU, + get_gpu_autocast, ) from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_map @@ -2840,7 +2841,7 @@ def test_fake_crossref_backward_no_amp(self, device, dtype, op): fake_backward_xfails | fake_autocast_backward_xfails, ) def test_fake_crossref_backward_amp(self, device, dtype, op): - self._test_fake_crossref_helper(device, dtype, op, torch.cuda.amp.autocast) + self._test_fake_crossref_helper(device, dtype, op, get_gpu_autocast()) @ops([op for op in ops_and_refs if op.is_factory_function]) def test_strided_layout(self, device, dtype, op): diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 3e2dc60e1efa7..e592ffcf86720 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -5532,21 +5532,8 @@ def load_inline(*args, **kwargs): return wrapper -def get_backend_ops(device='xpu'): - backend_ops = {} - if TEST_XPU and device == 'xpu': - xpu_op_db = CI_TEST_PREFIX + "/" + device + "/op_db.yaml" - try: - with open(xpu_op_db) as stream: - backend_ops = yaml.safe_load(stream) - except yaml.YAMLError or FileExistsError: - print("Error in loading op_db.yaml.") - return backend_ops - GPU_TYPES = ["cuda", "xpu"] -# defines here before import torch._dynamo is for avoiding circular import -# when get_gpu_type is imported from dynamo @functools.lru_cache(None) def get_gpu_type(): 
avail_gpus = [x for x in GPU_TYPES if getattr(torch, x).is_available()] @@ -5568,5 +5555,5 @@ def get_gpu_type(): ) def get_gpu_autocast(): - return torch.cuda.amp.autocast if HAS_CUDA else torch.xpu.amp.autocast + return torch.cuda.amp.autocast if HAS_CUDA else torch.autocast
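
Usage sketch (illustrative only; not part of the patches above): a minimal device-generic test module written against the helpers this series adds or extends in common_device_type.py, namely onlyGPU, skipXPU, any_common_cpu_device_one and the allow_xpu flag of instantiate_device_type_tests. The class name TestMyOps, the test names and the tolerance values are invented for the example; the decorators, imports and comparison pattern follow the ones used in the diffs.

import torch
from torch.testing._internal.common_device_type import (
    any_common_cpu_device_one,
    instantiate_device_type_tests,
    onlyGPU,
    ops,
    skipXPU,
)
from torch.testing._internal.common_methods_invocations import op_db
from torch.testing._internal.common_utils import run_tests, TestCase


class TestMyOps(TestCase):
    # Runs on any GPU backend (cuda or xpu) and picks a single dtype that is
    # supported on both the CPU and the active GPU backend.
    @onlyGPU
    @ops(op_db, dtypes=any_common_cpu_device_one())
    def test_gpu_matches_cpu(self, device, dtype, op):
        sample = next(iter(op.sample_inputs(device, dtype)))
        gpu_result = op(sample.input, *sample.args, **sample.kwargs)
        cpu_sample = sample.transform(
            lambda t: t.cpu() if isinstance(t, torch.Tensor) else t
        )
        cpu_result = op(cpu_sample.input, *cpu_sample.args, **cpu_sample.kwargs)
        self.assertEqual(gpu_result, cpu_result, atol=1e-3, rtol=1e-3)

    # Still runs on cpu/cuda but is skipped on xpu until op coverage lands there.
    @skipXPU
    @ops(op_db, allowed_dtypes=(torch.float32,))
    def test_not_ready_on_xpu(self, device, dtype, op):
        sample = next(iter(op.sample_inputs(device, dtype)))
        op(sample.input, *sample.args, **sample.kwargs)


# allow_xpu=True lets the XPU test base be instantiated alongside cpu/cuda.
instantiate_device_type_tests(TestMyOps, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()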
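
A second sketch, also illustrative, of the GPU-dispatch helpers the later patches add to common_utils.py (GPU_TYPE, HAS_GPU, get_gpu_autocast). The function name run_autocast_matmul and the matmul workload are invented for the example; the device_type handling reflects that, per the final patch, get_gpu_autocast() returns torch.cuda.amp.autocast on CUDA builds and the generic torch.autocast otherwise, which needs an explicit device_type when entered.

import torch
from torch.testing._internal.common_utils import (
    GPU_TYPE,        # "cuda" or "xpu" depending on the available backend (defaults to "cuda")
    HAS_GPU,
    get_gpu_autocast,
)


def run_autocast_matmul():
    # Minimal device-agnostic workload: no cuda/xpu branching in the body.
    if not HAS_GPU:
        return None
    device = torch.device(GPU_TYPE)
    a = torch.randn(8, 8, device=device)
    b = torch.randn(8, 8, device=device)
    autocast = get_gpu_autocast()
    # torch.cuda.amp.autocast takes no device argument; the generic
    # torch.autocast returned for other backends requires device_type.
    ctx = autocast() if GPU_TYPE == "cuda" else autocast(device_type=GPU_TYPE)
    with ctx:
        return a @ b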