From e78c92235b86748854eb10c984d0bc3b686837fb Mon Sep 17 00:00:00 2001 From: "Deng, Daisy" Date: Tue, 10 Dec 2024 08:36:47 +0000 Subject: [PATCH] make op_db general for GPU, sample input generalization is TBD --- .../_internal/common_methods_invocations.py | 574 +++++++++--------- torch/testing/_internal/opinfo/core.py | 84 ++- 2 files changed, 349 insertions(+), 309 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 54a7c1da8a892..2155d557a81ff 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -38,7 +38,7 @@ TEST_WITH_ROCM, IS_FBCODE, IS_WINDOWS, IS_MACOS, TEST_SCIPY, torch_to_numpy_dtype_dict, numpy_to_torch_dtype, TEST_WITH_ASAN, GRADCHECK_NONDET_TOL, slowTest, TEST_WITH_SLOW, - TEST_WITH_TORCHINDUCTOR + TEST_WITH_TORCHINDUCTOR, GPU_TYPE, ) from torch.testing._utils import wrapper_set_seed @@ -6922,7 +6922,7 @@ def make_mvlgamma_opinfo(variant_test_name, domain, skips, sample_kwargs): domain=domain, decorators=(precisionOverride({torch.float16: 5e-2}),), dtypes=all_types_and(torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_mvlgamma, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -9954,7 +9954,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): ), 'TestForeach', 'test_parity', - device_type='cuda' + device_type=GPU_TYPE ), ), ), @@ -9991,7 +9991,7 @@ def __call__(self, opinfo, device, dtype, requires_grad, **kwargs): ), 'TestForeach', 'test_parity', - device_type='cuda' + device_type=GPU_TYPE ), ), ), @@ -11670,7 +11670,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('absolute', ), ref=np.abs, dtypes=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), skips=( DecorateInfo(unittest.skip("In-place abs not supported for complex tensors"), 'TestBwdGradients', @@ -11712,7 +11712,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=np.arccos, domain=(-1, 1), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -11723,16 +11723,16 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.complex64: 1e-2}),), skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', - device_type='cuda', dtypes=[torch.cdouble], active_if=IS_WINDOWS), + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cdouble], active_if=IS_WINDOWS), + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), # Failing with wrong imaginary sign on at least some Windows jobs DecorateInfo(unittest.skip("Skipped!"), 
'TestUnaryUfuncs', 'test_reference_numerics_small', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), # Failing with wrong imaginary sign on at least some Windows jobs DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), @@ -11754,7 +11754,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=np.arccosh, domain=(1, None), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), decorators=(precisionOverride({torch.bfloat16: 5e-2}),), supports_inplace_autograd=False, @@ -11763,22 +11763,22 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_float=True, skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', - device_type='cuda', dtypes=[torch.cdouble], active_if=IS_WINDOWS), + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cdouble], active_if=IS_WINDOWS), + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), # Failing with wrong imaginary sign on at least some Windows jobs DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), ), # acosh is not defined at x < 1 (real) @@ -12067,7 +12067,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', - device_type='cuda'), + device_type=GPU_TYPE), # dispatch to lazy test failed DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 'test_dispatched_to_lazy'), # test error disabled since rhs non-tensor python scalar is supported @@ -12086,7 +12086,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', - device_type='cuda'), + device_type=GPU_TYPE), # dispatch to lazy test failed DecorateInfo(unittest.expectedFailure, 'TestLazyOpInfo', 
'test_dispatched_to_lazy'), # test error disabled since rhs non-tensor python scalar is supported @@ -12149,7 +12149,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # trigger addmm being decomposed by a jit pass. dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -12168,7 +12168,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # When alpha=beta=1 as compile-time constants, JIT will decompose addmm into mm and add. variant_test_name='decomposed', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -12189,7 +12189,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('addmv', dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.complex64, torch.complex128, + dtypesIfGPU=floating_types_and(torch.float16, torch.complex64, torch.complex128, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -12204,7 +12204,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): np.multiply(np.asarray(alpha, dtype=batch1.dtype), np.sum(np.matmul(batch1, batch2), axis=0))), dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else []), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), @@ -12236,7 +12236,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ], skips=( # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3 - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda', active_if=not SM53OrLater), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type=GPU_TYPE, active_if=not SM53OrLater), # addbmm does not correctly warn when resizing out= inputs DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), # https://github.com/pytorch/pytorch/issues/55907 @@ -12245,9 +12245,9 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_addbmm), OpInfo('baddbmm', dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.complex64, torch.complex128, + dtypesIfGPU=floating_types_and(torch.float16, torch.complex64, torch.complex128, torch.bfloat16), - backward_dtypesIfCUDA=floating_types_and(torch.float16, + backward_dtypesIfGPU=floating_types_and(torch.float16, *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else [], torch.complex64, torch.complex128), # Runs very slowly on slow gradcheck - alternatively reduce input sizes @@ -12258,10 +12258,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=[ 
DecorateInfo( toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), - 'TestCommon', 'test_variant_consistency_eager', device_type='cuda'), + 'TestCommon', 'test_variant_consistency_eager', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), - 'TestMathBits', 'test_conj_view', device_type='cuda'), + 'TestMathBits', 'test_conj_view', device_type=GPU_TYPE), ], sample_inputs_func=sample_inputs_baddbmm, skips=( @@ -12274,7 +12274,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('dot', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, sample_inputs_func=sample_inputs_dot_vdot, @@ -12291,7 +12291,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('vdot', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), sample_inputs_func=sample_inputs_dot_vdot, error_inputs_func=error_inputs_dot_vdot, @@ -12307,7 +12307,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('bmm', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else []), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), @@ -12317,14 +12317,14 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_fwgrad_bwgrad=True, skips=( # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3 - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda', active_if=not SM53OrLater), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type=GPU_TYPE, active_if=not SM53OrLater), DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-5, rtol=1e-5)}), "TestCommon", "test_out") ), sample_inputs_func=sample_inputs_bmm), OpInfo('mv', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -12382,17 +12382,17 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_fwgrad_bwgrad=True, promotes_int_to_float=True, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, decorators=[ DecorateInfo( toleranceOverride({torch.float16: tol(atol=1e-05, rtol=1e-03)}), - 'TestUnaryUfuncs', device_type='cuda' + 'TestUnaryUfuncs', device_type=GPU_TYPE ), DecorateInfo( toleranceOverride({torch.float32: tol(atol=8e-5, rtol=4e-5)}), - 
'TestInductorOpInfo', 'test_comprehensive', device_type='cuda' + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE ), precisionOverride({torch.bfloat16: 1e-2}), ], @@ -12402,10 +12402,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped! sparse backward not supported"), 'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'), @@ -12415,7 +12415,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('arcsinh', ), ref=np.arcsinh, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), decorators=(precisionOverride({torch.bfloat16: 5e-2}),), supports_inplace_autograd=False, @@ -12437,10 +12437,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped! 
sparse backward not supported"), 'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'), @@ -12449,7 +12449,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('arctan', ), ref=np.arctan, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -12469,10 +12469,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cfloat, torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.cfloat, torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped! sparse backward not supported"), 'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'), @@ -12494,7 +12494,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=np.arctanh, domain=(-1, 1), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), decorators=[ precisionOverride({torch.bfloat16: 1e-2}), @@ -12522,10 +12522,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cfloat, torch.cdouble], + device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.cfloat], + device_type=GPU_TYPE, dtypes=[torch.cfloat], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped! 
sparse backward not supported"), 'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'), @@ -12618,7 +12618,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('bitwise_left_shift', op=torch.bitwise_left_shift, dtypes=integral_types(), - dtypesIfCUDA=integral_types(), + dtypesIfGPU=integral_types(), dtypesIfHpu=custom_types(torch.int32, torch.int8, torch.bool), operator_variant=operator.lshift, inplace_operator_variant=operator.ilshift, @@ -12633,7 +12633,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('bitwise_right_shift', op=torch.bitwise_right_shift, dtypes=integral_types(), - dtypesIfCUDA=integral_types(), + dtypesIfGPU=integral_types(), dtypesIfHpu=custom_types(torch.int32, torch.int8, torch.bool), operator_variant=operator.rshift, inplace_operator_variant=operator.irshift, @@ -12923,7 +12923,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('cos', ref=np.cos, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, handles_large_floats=False, @@ -12936,7 +12936,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): dtypes=(torch.cfloat, torch.cdouble,), device_type='cpu', active_if=IS_WINDOWS), # This fails on CUDA but passes on ROCm DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - dtypes=(torch.cdouble,), device_type='cuda'), + dtypes=(torch.cdouble,), device_type=GPU_TYPE), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', @@ -12946,13 +12946,13 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # Greatest absolute difference: nan at index (700,) (up to 1e-05 allowed) # Greatest relative difference: nan at index (700,) (up to 0.001 allowed) DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', + device_type=GPU_TYPE, dtypes=(torch.chalf,), active_if=IS_WINDOWS), )), UnaryUfuncInfo('cosh', ref=np_unary_ufunc_integer_promotion_wrapper(np.cosh), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -12978,7 +12978,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # Greatest absolute difference: nan at index (6000,) (up to 1e-05 allowed) # Greatest relative difference: nan at index (6000,) (up to 0.001 allowed) DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', + device_type=GPU_TYPE, dtypes=(torch.chalf,), active_if=IS_WINDOWS), )), OpInfo('cov', @@ -13092,7 +13092,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('divide',), 
variant_test_name='no_rounding_mode', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16, torch.int32, torch.int8), # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, @@ -13165,7 +13165,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), BinaryUfuncInfo('true_divide', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), supports_forward_ad=True, promotes_int_to_float=True, supports_fwgrad_bwgrad=True, @@ -13183,7 +13183,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('exp', ref=np_unary_ufunc_integer_promotion_wrapper(np.exp), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), skips=( # Reference: https://github.com/pytorch/pytorch/issues/48010 @@ -13232,7 +13232,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo('diag', ref=np.diag, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -13311,7 +13311,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('fmod', ref=np.fmod, dtypes=all_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16, torch.int32), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, @@ -13345,7 +13345,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('remainder', ref=np.remainder, dtypes=all_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16, torch.int32, torch.int8, torch.bool), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, @@ -13396,7 +13396,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('frac', ref=lambda x: np.modf(x)[0], dtypes=floating_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -13887,8 +13887,8 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): 
ref=np.log, domain=(0, None), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16, torch.chalf), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16, torch.chalf), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, @@ -13964,14 +13964,14 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ], ), BinaryUfuncInfo('logaddexp', dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16), + dtypesIfGPU=floating_types_and(torch.bfloat16, torch.float16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_rhs_python_scalar=False, skips=( # TODO: FIXME: RuntimeError: not implemented for 'ComplexFloat' - DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type=GPU_TYPE), )), OpInfo('logaddexp2', dtypes=floating_types_and(torch.bfloat16, torch.half), @@ -14116,7 +14116,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo('matmul', aliases=('linalg.matmul',), dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else []), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), @@ -14130,13 +14130,13 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=partial(sample_inputs_matmul, is_rmatmul=False), decorators=[ # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3 - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda', active_if=not SM53OrLater), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type=GPU_TYPE, active_if=not SM53OrLater), # ROCm intermittently fails the test with standard atol/rtol DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-4, rtol=0)}), - 'TestCommon', 'test_noncontiguous_samples', device_type='cuda', + 'TestCommon', 'test_noncontiguous_samples', device_type=GPU_TYPE, active_if=TEST_WITH_ROCM), DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-4, rtol=0)}), - 'TestCommon', 'test_out', device_type='cuda', + 'TestCommon', 'test_out', device_type=GPU_TYPE, active_if=TEST_WITH_ROCM), # mv for the sample with shapes (S, S, M, M), (M,) has some variance in the # backward on CPU @@ -14370,7 +14370,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # Incorrectly attempts to use a scalar for the second argument DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_jit_alias_remapping'), # TODO: FIXME: RuntimeError: "max_elementwise_cuda" not implemented for 'ComplexFloat' - DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type=GPU_TYPE), )), BinaryUfuncInfo( 'maximum', 
@@ -14382,7 +14382,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_rhs_python_scalar=False, skips=( # TODO: FIXME: RuntimeError: "max_elementwise_cuda" not implemented for 'ComplexFloat' - DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type=GPU_TYPE), )), BinaryUfuncInfo( 'min', @@ -14402,7 +14402,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', - device_type='cuda'), + device_type=GPU_TYPE), )), BinaryUfuncInfo( 'minimum', @@ -14417,7 +14417,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', - device_type='cuda'), + device_type=GPU_TYPE), ), ), BinaryUfuncInfo('logical_and', @@ -14454,7 +14454,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( # RuntimeError: "bitwise_and_cuda" not implemented for 'Half' DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', - 'test_type_promotion', device_type='cuda'), + 'test_type_promotion', device_type=GPU_TYPE), )), BinaryUfuncInfo('bitwise_or', ref=np.bitwise_or, @@ -14469,7 +14469,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', - device_type='cuda'), + device_type=GPU_TYPE), )), BinaryUfuncInfo('bitwise_xor', ref=np.bitwise_xor, @@ -14484,7 +14484,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', - device_type='cuda'), + device_type=GPU_TYPE), )), BinaryUfuncInfo('heaviside', ref=lambda a, b: ( @@ -14909,7 +14909,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_adaptive_avg_pool2d), OpInfo('nn.functional.adaptive_avg_pool3d', dtypes=floating_types_and(torch.half, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16, torch.float16), decorators=( # RuntimeError: @@ -14993,7 +14993,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16, torch.float16), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, error_inputs_func=error_inputs_avg_pool1d, @@ -15004,7 +15004,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.int64), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16, torch.float16), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, error_inputs_func=error_inputs_avg_pool3d, @@ -15056,7 +15056,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): 
aten_name='conv_transpose1d', aliases=('conv_transpose1d',), dtypes=floating_and_complex_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.chalf, torch.bfloat16), sample_inputs_func=sample_inputs_conv_transpose1d, supports_forward_ad=True, @@ -15066,7 +15066,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=( DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06), }), - 'TestCommon', 'test_variant_consistency_eager', device_type='cuda'), + 'TestCommon', 'test_variant_consistency_eager', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.chalf: tol(atol=5e-2, rtol=5e-2), }), 'TestCommon', 'test_complex_half_reference_testing'), @@ -15101,7 +15101,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose2d), dtypes=floating_and_complex_types_and(torch.int64, torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.chalf, torch.bfloat16), sample_inputs_func=sample_inputs_conv_transpose2d, # Runs very slowly on slow-gradcheck for complex. @@ -15113,10 +15113,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=[ DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06), }), - 'TestCommon', 'test_variant_consistency_eager', device_type='cuda'), + 'TestCommon', 'test_variant_consistency_eager', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.float32: tol(atol=2e-05, rtol=5e-05), }), - 'TestCommon', 'test_noncontiguous_samples', device_type='cuda'), + 'TestCommon', 'test_noncontiguous_samples', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.chalf: tol(atol=8e-2, rtol=8e-2), }), 'TestCommon', 'test_complex_half_reference_testing'), @@ -15149,7 +15149,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # corresponding `conv*d` ref=partial(conv_transpose_ref, fn=torch.nn.functional.conv_transpose3d), dtypes=floating_and_complex_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and( + dtypesIfGPU=floating_and_complex_types_and( torch.float16, torch.chalf, torch.bfloat16), sample_inputs_func=sample_inputs_conv_transpose3d, supports_forward_ad=True, @@ -15161,25 +15161,25 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=[ DecorateInfo( toleranceOverride({torch.float16: tol(atol=5e-2, rtol=5e-2), }), - 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda'), + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1.3e-06), torch.complex64: tol(atol=1.3e-04, rtol=1.3e-05)}), - 'TestCommon', 'test_variant_consistency_eager', device_type='cuda'), + 'TestCommon', 'test_variant_consistency_eager', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.float32: tol(atol=2e-04, rtol=2e-04), }), - 'TestCompositeCompliance', 'test_operator', device_type='cuda'), + 'TestCompositeCompliance', 'test_operator', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.float32: tol(atol=1.3e-04, rtol=1.3e-06), 
torch.complex64: tol(atol=1.3e-04, rtol=1.3e-05)}), - 'TestCommon', 'test_noncontiguous_samples', device_type='cuda'), + 'TestCommon', 'test_noncontiguous_samples', device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-04, rtol=2e-05), }), - 'TestCompositeCompliance', 'test_forward_ad', device_type='cuda', + 'TestCompositeCompliance', 'test_forward_ad', device_type=GPU_TYPE, active_if=TEST_CUDNN), DecorateInfo( toleranceOverride({torch.complex64: tol(atol=1e-4, rtol=1e-4)}), - "TestMathBits", "test_conj_view", device_type='cuda'), + "TestMathBits", "test_conj_view", device_type=GPU_TYPE), DecorateInfo( toleranceOverride({torch.chalf: tol(atol=9e-2, rtol=9e-2), }), 'TestCommon', 'test_complex_half_reference_testing'), @@ -15208,7 +15208,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('conv1d',), aten_name='conv1d', dtypes=floating_and_complex_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.chalf, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), sample_inputs_func=sample_inputs_conv1d, @@ -15224,7 +15224,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ), DecorateInfo( toleranceOverride({torch.float16: tol(atol=2e-3, rtol=1e-3)}), - 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda', + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE, ), ), skips=( @@ -15246,7 +15246,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('conv2d',), aten_name='conv2d', dtypes=floating_and_complex_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.chalf, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), sample_inputs_func=partial(sample_inputs_conv2d), @@ -15281,7 +15281,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('conv3d',), aten_name='conv3d', dtypes=floating_and_complex_types_and(torch.int64, torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.chalf, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.chalf, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), sample_inputs_func=sample_inputs_conv3d, error_inputs_func=error_inputs_conv3d, @@ -15391,7 +15391,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): error_inputs_func=error_inputs_rms_norm,), OpInfo('nn.functional.local_response_norm', dtypes=floating_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -15501,7 +15501,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo('nn.functional.unfold', aten_name='im2col', dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16, torch.bool), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16, torch.bool), + dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16, torch.bool), sample_inputs_func=sample_inputs_nn_unfold, # Runs very slowly 
on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, @@ -15572,7 +15572,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_autograd=True, supports_forward_ad=True, dtypes=floating_types_and(torch.uint8, torch.half, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=partial(sample_inputs_interpolate, 'bilinear'), reference_inputs_func=partial(reference_inputs_interpolate, 'bilinear'), @@ -15590,7 +15590,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.uint8, torch.half, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), sample_inputs_func=partial(sample_inputs_interpolate, 'bicubic'), reference_inputs_func=partial(reference_inputs_interpolate, 'bicubic'), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, @@ -15624,7 +15624,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.half, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), sample_inputs_func=partial(sample_inputs_interpolate, 'area'), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, skips=( @@ -15639,7 +15639,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.uint8, torch.half, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=partial(sample_inputs_upsample, 'bilinear'), reference_inputs_func=partial(reference_inputs_upsample, 'bilinear'), @@ -15657,7 +15657,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.uint8), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, sample_inputs_func=partial(sample_inputs_upsample_aa, 'bilinear'), supports_out=False, @@ -15703,7 +15703,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo( "nn.functional.multi_margin_loss", dtypes=floating_types(), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16), + dtypesIfGPU=floating_types_and(torch.bfloat16, torch.float16), supports_out=False, supports_gradgrad=False, sample_inputs_func=sample_inputs_multi_margin_loss, @@ -15720,7 +15720,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo( "nn.functional.multilabel_margin_loss", dtypes=floating_types(), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16), + dtypesIfGPU=floating_types_and(torch.bfloat16, torch.float16), supports_out=False, supports_gradgrad=False, sample_inputs_func=sample_inputs_multilabel_margin_loss, @@ -15782,11 +15782,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, 
dtypes=floating_types_and(torch.int64, torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), error_inputs_func=error_inputs_avg_pool2d, sample_inputs_func=sample_inputs_avgpool2d, skips=( - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type=GPU_TYPE), )), OpInfo('nn.functional.fractional_max_pool2d', supports_autograd=True, @@ -15842,7 +15842,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # TODO: add shape checks assert_jit_shape_analysis=False, dtypes=floating_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), skips=( # Pre-existing condition; Needs to be fixed DecorateInfo(unittest.skip("Works on some configs"), 'TestNNCOpInfo', @@ -15867,7 +15867,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): check_batched_forward_grad=False, assert_jit_shape_analysis=True, dtypes=all_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), error_inputs_func=error_inputs_max_pool2d, sample_inputs_func=sample_inputs_max_pool), OpInfo('max_pool2d_with_indices_backward', @@ -15907,7 +15907,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # TODO: add shape checks assert_jit_shape_analysis=False, dtypes=all_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), # TODO: investigate nondeterminism gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, error_inputs_func=error_inputs_max_pool3d, @@ -16023,8 +16023,8 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_linear, dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), dtypesIfROCM=floating_and_complex_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), # linear calls mm under the hood which is nondeterministic on CUDA # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html#torch.use_deterministic_algorithms gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, @@ -16042,7 +16042,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_autograd=True, sample_inputs_func=sample_inputs_bilinear, dtypes=all_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, + dtypesIfGPU=floating_types_and(torch.float16, *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else []), decorators=( DecorateInfo(toleranceOverride({torch.float16: tol(atol=2e-03, rtol=1.3e-03)}), @@ -16050,7 +16050,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ), skips=( # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3 - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', 
device_type='cuda', active_if=not SM53OrLater), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type=GPU_TYPE, active_if=not SM53OrLater), DecorateInfo(unittest.skip("Skipped!"), 'TestNNCOpInfo', 'test_nnc_correctness', dtypes=(torch.bfloat16,)), ), # Runs very slowly on slow gradcheck - alternatively reduce input sizes @@ -16089,7 +16089,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.float16: tol(atol=1e-03, rtol=1.2e-03), torch.bfloat16: tol(atol=1e-03, rtol=1.2e-03) }), - 'TestUnaryUfuncs', device_type='cuda', + 'TestUnaryUfuncs', device_type=GPU_TYPE, ), ], ), # Marked as a Unary function because it has some rather odd broadcasting semantics in its @@ -16139,7 +16139,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.float16: tol(atol=1e-03, rtol=1.2e-03), torch.bfloat16: tol(atol=1e-03, rtol=1.2e-03) }), - 'TestUnaryUfuncs', device_type='cuda', + 'TestUnaryUfuncs', device_type=GPU_TYPE, ), ], ), UnaryUfuncInfo( @@ -16150,7 +16150,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): inplace_variant=lambda input, *args, **kwargs: wrapper_set_seed(torch.nn.functional.rrelu, input, *args, inplace=True, **kwargs), dtypes=floating_types_and(torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), gradcheck_wrapper=wrapper_set_seed, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -16165,7 +16165,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.float16: tol(atol=1e-03, rtol=1.2e-03), torch.bfloat16: tol(atol=1e-03, rtol=1.2e-03) }), - 'TestUnaryUfuncs', device_type='cuda', + 'TestUnaryUfuncs', device_type=GPU_TYPE, ),), skips=( # lambda impl @@ -16204,14 +16204,14 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.float16: tol(atol=1e-2, rtol=1.8e-2), torch.bfloat16: tol(atol=1e-2, rtol=1.8e-2) }), - 'TestUnaryUfuncs', device_type='cuda', + 'TestUnaryUfuncs', device_type=GPU_TYPE, ), ], ), OpInfo( 'torch._scaled_mm', sample_inputs_func=sample_inputs_scaled_mm, dtypes=empty_types(), - dtypesIfCUDA=empty_types() + (torch.float8_e4m3fn,), + dtypesIfGPU=empty_types() + (torch.float8_e4m3fn,), supports_out=True, supports_forward_ad=False, supports_autograd=False, @@ -16219,7 +16219,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( # Sample inputs isn't really parametrized on dtype DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', - device_type='cuda'), + device_type=GPU_TYPE), # "mul_cuda" not implemented for float8_e4m3fn # https://github.com/pytorch/pytorch/issues/107256 DecorateInfo(unittest.skip("Skipped!"), 'TestSchemaCheckModeOpInfo', 'test_schema_correctness', @@ -16255,10 +16255,10 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): {torch.float32: tol(atol=5e-05, rtol=5e-6)}), 'TestCommon',), ], skips=( # When attn mask is a composite tensor this fails backward by returning a none - DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_backward', device_type='cuda'), + DecorateInfo(unittest.skip("Skipped!"), 'TestCompositeCompliance', 'test_backward', device_type=GPU_TYPE), # This is only failing on Linux Bionic 3.10 Cuda 11.6 DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', - device_type='cuda', active_if=_get_torch_cuda_version() >= (11, 6)), 
+ device_type=GPU_TYPE, active_if=_get_torch_cuda_version() >= (11, 6)), DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples', dtypes=(torch.float32,)), # AssertionError: JIT Test does not execute any logic @@ -16288,19 +16288,19 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip('This is '), 'TestInductorOpInfo', 'test_comprehensive'), # skip for sm < 80 DecorateInfo(unittest.skip("Skipped!"), 'TestSchemaCheckModeOpInfo', 'test_schema_correctness', - device_type='cuda', dtypes=(torch.bfloat16,), active_if=not SM80OrLater), + device_type=GPU_TYPE, dtypes=(torch.bfloat16,), active_if=not SM80OrLater), # FIXME DecorateInfo(unittest.skip('test_cow_input does not work with efficient attention on ROCM'), 'TestCompositeCompliance', 'test_cow_input', - device_type='cuda', dtypes=(torch.bfloat16, torch.float16, torch.float32), + device_type=GPU_TYPE, dtypes=(torch.bfloat16, torch.float16, torch.float32), active_if=TEST_WITH_ROCM and PLATFORM_SUPPORTS_MEM_EFF_ATTENTION), DecorateInfo(unittest.skip('test_fake_crossref_backward_amp does not work with efficient attention on ROCM'), 'TestFakeTensor', 'test_fake_crossref_backward_amp', - device_type='cuda', dtypes=(torch.bfloat16, torch.float16, torch.float32), + device_type=GPU_TYPE, dtypes=(torch.bfloat16, torch.float16, torch.float32), active_if=TEST_WITH_ROCM and PLATFORM_SUPPORTS_MEM_EFF_ATTENTION), DecorateInfo(unittest.skip('test_fake_crossref_backward_no_amp does not work with efficient attention on ROCM'), 'TestFakeTensor', 'test_fake_crossref_backward_no_amp', - device_type='cuda', dtypes=(torch.bfloat16, torch.float16, torch.float32), + device_type=GPU_TYPE, dtypes=(torch.bfloat16, torch.float16, torch.float32), active_if=TEST_WITH_ROCM and PLATFORM_SUPPORTS_MEM_EFF_ATTENTION), # for element 1, was torch.Size([4, 4, 0]) but real shape was torch.Size([16, 3, 0]) DecorateInfo(unittest.expectedFailure, "TestMeta", "test_dispatch_meta_outplace", device_type="cuda", @@ -16318,7 +16318,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): 'torch.ops.aten._flash_attention_forward', sample_inputs_func=sample_inputs_flash_attention_forward, dtypes=empty_types(), - dtypesIfCUDA=custom_types(torch.float16) + dtypesIfGPU=custom_types(torch.float16) if not SM80OrLater else custom_types(torch.float16, torch.bfloat16), supports_out=False, @@ -16329,18 +16329,18 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=[skipCUDAIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "This platform doesn't support Flash Attention")], skips=( # Checking the scalar value of the philox seed and offset - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator', device_type='cuda'), - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type='cuda'), - DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator', device_type=GPU_TYPE), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type=GPU_TYPE), + DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', device_type=GPU_TYPE), # None Mismatch Tensor - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_backward', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 
'test_backward', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance',
'test_backward', device_type=GPU_TYPE), ) ), OpInfo( 'torch.ops.aten._efficient_attention_forward', sample_inputs_func=sample_inputs_efficient_attention_forward, dtypes=empty_types(), - dtypesIfCUDA=custom_types(torch.float16, torch.float32) + dtypesIfGPU=custom_types(torch.float16, torch.float32) if not SM80OrLater else custom_types(torch.float16, torch.float32, torch.bfloat16), supports_out=False, @@ -16356,11 +16356,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skipCUDAIf(TEST_WITH_ROCM, "Efficient attention on ROCM doesn't support custom_mask_type==2")], skips=( # Checking the scaler value of the philox seed and offset - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator', device_type='cuda'), - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type='cuda'), - DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_operator', device_type=GPU_TYPE), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_noncontiguous_samples', device_type=GPU_TYPE), + DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', device_type=GPU_TYPE), # None Mismatch Tensor - DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_backward', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCompositeCompliance', 'test_backward', device_type=GPU_TYPE), ) ), UnaryUfuncInfo( @@ -16380,7 +16380,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.float16: tol(atol=1e-3, rtol=1e-3), torch.bfloat16: tol(atol=1e-4, rtol=1e-4) }), - 'TestUnaryUfuncs', device_type='cuda', + 'TestUnaryUfuncs', device_type=GPU_TYPE, ), ], skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', @@ -16403,7 +16403,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=lambda x, inplace=False: x / (1 + np.exp(-x)), dtypes=complex_types(), - dtypesIfCUDA=complex_types(), + dtypesIfGPU=complex_types(), supports_forward_ad=False, supports_autograd=False, assert_autodiffed=False, @@ -16415,7 +16415,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.float16: tol(atol=1e-3, rtol=1e-3), torch.bfloat16: tol(atol=1e-4, rtol=1e-4) }), - 'TestUnaryUfuncs', device_type='cuda', + 'TestUnaryUfuncs', device_type=GPU_TYPE, ), ], skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', @@ -16445,14 +16445,14 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): inplace_variant=partial(torch.nn.functional.hardsigmoid, inplace=True), decorators=[ DecorateInfo( - toleranceOverride({torch.float16: tol(atol=1e-04, rtol=0.001)}), 'TestUnaryUfuncs', device_type='cuda',), ], + toleranceOverride({torch.float16: tol(atol=1e-04, rtol=0.001)}), 'TestUnaryUfuncs', device_type=GPU_TYPE,), ], skips=[ # still want to test that first derivative works though second derivative isn't supported DecorateInfo(unittest.expectedFailure, 'TestBwdGradients', "test_inplace_gradgrad"), # produces 0 instead of nan on ROCM DecorateInfo(unittest.expectedFailure, 'TestUnaryUfuncs', "test_reference_numerics_extremal", - device_type='cuda', + device_type=GPU_TYPE, active_if=(TEST_WITH_ROCM)), ] ), UnaryUfuncInfo( @@ -16503,7 +16503,7 @@ def sample_inputs_alias_copy(op_info, 
device, dtype, requires_grad, **kwargs): 'nn.functional.softsign', ref=lambda x: x / (np.abs(x) + 1), dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool), + dtypesIfGPU=all_types_and_complex_and(torch.float16, torch.bfloat16, torch.bool), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_autograd=True, @@ -16534,7 +16534,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): toleranceOverride({torch.bfloat16: tol(atol=1e-02, rtol=1.6e-02)}), 'TestUnaryUfuncs',), DecorateInfo(toleranceOverride({torch.complex64: tol(atol=6e-04, rtol=1e-05), torch.bfloat16: tol(atol=1e-02, rtol=1.6e-02)}), - 'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type='cuda'), + 'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type=GPU_TYPE), ], skips=( # in each case, pytorch will produce a nan while numpy will not @@ -16673,7 +16673,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name='without_cudnn', aten_name='batch_norm', dtypes=empty_types(), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -16690,7 +16690,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aten_backward_name='binary_cross_entropy_backward', sample_inputs_func=sample_inputs_binary_cross_entropy, dtypes=floating_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, gradcheck_fast_mode=False, supports_autograd=True, @@ -16713,7 +16713,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): unittest.skip("Skipped!"), "TestCompositeCompliance", "test_cow_input", - device_type='cuda', + device_type=GPU_TYPE, ), DecorateInfo( toleranceOverride({torch.float32: tol(atol=1e-3, rtol=1e-3)}), @@ -16739,7 +16739,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('igamma', dtypes=floating_types_and(torch.bfloat16, torch.float16), aliases=('torch.special.gammainc',), - dtypesIfCUDA=floating_types(), + dtypesIfGPU=floating_types(), # TODO: FIXME supports_rhs_python_scalar=False, supports_autograd=False, @@ -16761,8 +16761,8 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # rhs_make_tensor_kwargs=dict(requires_grad=False), # dtypes=floating_types_and(torch.bfloat16, torch.float16), # backward_dtypesIfCPU=floating_types_and(torch.bfloat16), - # dtypesIfCUDA=floating_types(), - # backward_dtypesIfCUDA=floating_types(), + # dtypesIfGPU=floating_types(), + # backward_dtypesIfGPU=floating_types(), # supports_inplace_autograd=False, # skips=( # # Derivative wrt first tensor not implemented @@ -16780,7 +16780,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('igammac', dtypes=floating_types_and(torch.bfloat16, torch.float16), aliases=('torch.special.gammaincc',), - dtypesIfCUDA=floating_types(), + dtypesIfGPU=floating_types(), supports_autograd=False, supports_rhs_python_scalar=False, skips=( @@ -16801,8 +16801,8 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # rhs_make_tensor_kwargs=dict(requires_grad=False), # 
dtypes=floating_types_and(torch.bfloat16, torch.float16), # backward_dtypesIfCPU=floating_types_and(torch.bfloat16), - # dtypesIfCUDA=floating_types(), - # backward_dtypesIfCUDA=floating_types(), + # dtypesIfGPU=floating_types(), + # backward_dtypesIfGPU=floating_types(), # supports_inplace_autograd=False, # decorators=[ # # Derivative wrt first tensor not implemented @@ -16842,7 +16842,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aten_backward_name='hardtanh_backward', dtypes=floating_types_and(torch.int8, torch.int16, torch.int32, torch.int64, torch.half, torch.bfloat16), backward_dtypes=all_types_and(torch.half, torch.bfloat16), - backward_dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + backward_dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), assert_autodiffed=True, sample_inputs_func=sample_inputs_hardtanh, error_inputs_func=error_inputs_hardtanh, @@ -16880,7 +16880,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): autodiff_nonfusible_nodes=["aten::relu6"]), OpInfo('mm', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -16966,11 +16966,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning'), # Could not run 'aten::narrow_copy.out' with arguments from the 'CUDA' backend DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_meta_outplace', - device_type='cuda'), + device_type=GPU_TYPE), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_outplace', - device_type='cuda'), + device_type=GPU_TYPE), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_outplace', - device_type='cuda'), + device_type=GPU_TYPE), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_symbolic_meta_outplace_all_strides'), )), OpInfo('view_copy', @@ -17065,13 +17065,13 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), BinaryUfuncInfo('pow', dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.half, torch.bfloat16, torch.chalf), ref=np.power, # Due to AVX2 currently not being fully supported for Float16, log_vml_cpu can't be enabled # for Float16, causing this test to fail. pow's autograd for Float16 is thus currently # unsupported on CPU. 
backward_dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_inplace_autograd=False, @@ -17256,7 +17256,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name='decimals_3', aliases=('special.round',), dtypes=floating_types_and(torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), sample_kwargs=lambda device, dtype, input: ({'decimals': 3}, {'decimals': 3}), sample_inputs_func=partial(sample_inputs_elementwise_unary, op_kwargs={'decimals': 3}), skips=( @@ -17282,7 +17282,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name='decimals_neg_3', aliases=('special.round',), dtypes=floating_types_and(torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), sample_kwargs=lambda device, dtype, input: ({'decimals': -3}, {'decimals': -3}), sample_inputs_func=partial(sample_inputs_elementwise_unary, op_kwargs={'decimals': -3}), skips=( @@ -17300,7 +17300,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('sin', ref=np.sin, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), assert_autodiffed=True, handles_large_floats=False, supports_sparse=True, @@ -17314,7 +17314,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( # Fails on CUDA but passes on ROCm DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - dtypes=(torch.cdouble,), device_type='cuda'), + dtypes=(torch.cdouble,), device_type=GPU_TYPE), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', dtypes=(torch.cfloat, torch.cdouble,), device_type='cpu', active_if=IS_WINDOWS), DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', @@ -17334,7 +17334,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('sinh', ref=np_unary_ufunc_integer_promotion_wrapper(np.sinh), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -17363,7 +17363,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('sign', ref=reference_sign, dtypes=all_types_and(torch.bool, torch.bfloat16, torch.half), - dtypesIfCUDA=all_types_and(torch.bool, torch.bfloat16, torch.half), + dtypesIfGPU=all_types_and(torch.bool, torch.bfloat16, torch.half), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_sparse=True, @@ -17380,7 +17380,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=reference_sgn, 
dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.half, torch.chalf), backward_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.bfloat16, torch.half, torch.chalf), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_sparse=True, @@ -17517,7 +17517,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo('__rmatmul__', op=torch.Tensor.__rmatmul__, dtypes=all_types_and_complex_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, + dtypesIfGPU=floating_and_complex_types_and(torch.float16, *[torch.bfloat16] if SM53OrLater or TEST_WITH_ROCM else []), assert_autodiffed=True, @@ -17530,7 +17530,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): check_batched_forward_grad=False, decorators=( # NVIDIA only assures that bfloat16 is supported by bmm if SM >= 5.3 - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type='cuda', active_if=not SM53OrLater), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes', device_type=GPU_TYPE, active_if=not SM53OrLater), DecorateInfo(toleranceOverride({torch.complex64: tol(atol=1e-05, rtol=1.2e-03)}), 'TestMathBits', 'test_conj_view'), DecorateInfo(toleranceOverride({torch.float32: tol(atol=1e-05, rtol=1.2e-03)}), @@ -17556,7 +17556,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): BinaryUfuncInfo('__rmod__', op=torch.Tensor.__rmod__, dtypes=floating_types_and(torch.bfloat16, torch.half,), - dtypesIfCUDA=all_types_and(torch.bfloat16, torch.half), + dtypesIfGPU=all_types_and(torch.bfloat16, torch.half), # https://github.com/pytorch/pytorch/issues/80411 gradcheck_fast_mode=True, supports_out=False, @@ -17657,11 +17657,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('tan', ref=np.tan, dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), decorators=(DecorateInfo( toleranceOverride({torch.complex64: tol(atol=1e-04, rtol=1e-05)}), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda'),), + device_type=GPU_TYPE),), assert_autodiffed=True, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -17703,9 +17703,9 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo( toleranceOverride({torch.complex64: tol(atol=1e-04, rtol=2e-05)}), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda'),), + device_type=GPU_TYPE),), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), assert_autodiffed=True, assert_jit_shape_analysis=True, supports_forward_ad=True, @@ -17734,7 +17734,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo('tensor_split', ref=np.array_split, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.bfloat16, 
torch.float16), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -17850,14 +17850,14 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): assert_autodiffed=True, skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', - device_type='cuda', dtypes=[torch.complex128]), + device_type=GPU_TYPE, dtypes=[torch.complex128]), DecorateInfo(unittest.skip("Skipped! sparse backward not supported"), 'TestSparseUnaryUfuncs', 'test_sparse_fn_grad'), )), UnaryUfuncInfo('nan_to_num', ref=np.nan_to_num, dtypes=all_types_and(torch.half, torch.bool, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.half, torch.bool, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.half, torch.bool, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_sparse=True, @@ -17888,7 +17888,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=lambda x: np.reciprocal(np.sqrt(x)), domain=(0, None), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), decorators=(precisionOverride({torch.half: 5e-2}),), assert_autodiffed=True, supports_forward_ad=True, @@ -17908,7 +17908,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_sparse=True, domain=(0, None), dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), assert_autodiffed=True, supports_forward_ad=True, supports_sparse_csr=True, @@ -17949,7 +17949,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # >>> t.cuda().square() # tensor(inf+nanj, device='cuda:0') DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal', - device_type='cuda', dtypes=[torch.cfloat, torch.cdouble]), + device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble]), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_meta_inplace', dtypes=[torch.bool]), DecorateInfo(unittest.expectedFailure, 'TestMeta', 'test_dispatch_meta_inplace', @@ -17959,7 +17959,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ),), OpInfo('lerp', dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half), - dtypesIfCUDA=floating_and_complex_types_and(torch.chalf, torch.half, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.chalf, torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_lerp, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -17967,11 +17967,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): UnaryUfuncInfo('angle', ref=np.angle, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool), decorators=(precisionOverride({torch.float16: 1e-2, torch.bfloat16: 1e-2}),), backward_dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16), - 
backward_dtypesIfCUDA=floating_and_complex_types_and(torch.chalf), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.chalf), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_sparse_csr=True, @@ -18037,8 +18037,8 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # TODO(@heitorschueroff) update SampleInput to handle such cases op=lambda tensors, equation: torch.einsum(equation, tensors), dtypes=all_types_and_complex_and(torch.half, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -18147,7 +18147,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): dtypes=[torch.complex128]), DecorateInfo( toleranceOverride({torch.float32: tol(atol=3e-5, rtol=1e-3)}), - 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda'), + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE), ], skips=( # test does not work with passing lambda for op @@ -18184,7 +18184,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name='polygamma_n_0', ref=reference_polygamma if TEST_SCIPY else None, dtypes=all_types_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, promotes_int_to_float=True, @@ -18201,7 +18201,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name=f'polygamma_n_{n_}', ref=reference_polygamma if TEST_SCIPY else None, dtypes=all_types_and(torch.bool, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, promotes_int_to_float=True, @@ -18391,7 +18391,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('gather', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_gather, gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, supports_forward_ad=True, @@ -18425,7 +18425,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): gradcheck_nondet_tol=GRADCHECK_NONDET_TOL), OpInfo('index_select', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_index, reference_inputs_func=partial(sample_inputs_index, reference=True), error_inputs_func=error_inputs_index_select, @@ -18505,7 +18505,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( DecorateInfo(unittest.expectedFailure, "TestNormalizeOperators", "test_normalize_operator_exhaustive"), # 
AssertionError: False is not true : Scalars failed to compare as equal! 0 != 104448 - DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', device_type='cuda'),), + DecorateInfo(unittest.skip("Skipped!"), 'TestJit', 'test_variant_consistency_jit', device_type=GPU_TYPE),), sample_inputs_func=sample_inputs_getitem), OpInfo('index_put', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), @@ -18519,17 +18519,17 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_index_put, skips=( DecorateInfo(unittest.skip("Skipped"), 'TestBwdGradients', 'test_fn_grad', dtypes=[torch.float64], - device_type='cuda', active_if=(TEST_WITH_ROCM and TEST_WITH_TORCHINDUCTOR)), + device_type=GPU_TYPE, active_if=(TEST_WITH_ROCM and TEST_WITH_TORCHINDUCTOR)), )), OpInfo('sort', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_sort, supports_forward_ad=True, supports_fwgrad_bwgrad=True, skips=( DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_non_standard_bool_values', - dtypes=[torch.bool], device_type='cuda'), + dtypes=[torch.bool], device_type=GPU_TYPE), )), OpInfo('unique', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16, torch.uint16, torch.uint32, torch.uint64), @@ -19272,7 +19272,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # The inplace variant (Tensor.normal_) is different from torch.normal inplace_variant=None, dtypes=floating_types_and(torch.bfloat16, torch.half), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.half), + dtypesIfGPU=floating_types_and(torch.bfloat16, torch.half), supports_out=True, sample_inputs_func=sample_inputs_normal_tensor_first, skips=( @@ -19301,7 +19301,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # The inplace variant (Tensor.normal_) is different from torch.normal inplace_variant=None, dtypes=floating_types_and(torch.bfloat16, torch.half), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.half), + dtypesIfGPU=floating_types_and(torch.bfloat16, torch.half), supports_out=True, sample_inputs_func=sample_inputs_normal_tensor_second, skips=( @@ -19324,7 +19324,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # AssertionError DecorateInfo(unittest.skip("Skipped!"), 'TestDecomp', 'test_quick'), # AssertionError in CUDA variant - DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', device_type='cuda'), + DecorateInfo(unittest.skip("Skipped!"), 'TestFakeTensor', device_type=GPU_TYPE), DecorateInfo(unittest.skip("Skipped!"), 'TestDeviceUtils', 'test_device_mode_ops'))), OpInfo('bernoulli', op=lambda inp, *args, **kwargs: @@ -19386,13 +19386,13 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ), BinaryUfuncInfo('hypot', dtypes=floating_types_and(torch.bfloat16, torch.half), - dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, supports_rhs_python_scalar=False), OpInfo('histogram', dtypes=floating_types(), - dtypesIfCUDA=_dispatch_dtypes(), # histogram is only implemented on CPU + dtypesIfGPU=_dispatch_dtypes(), # histogram is only implemented on CPU 
sample_inputs_func=sample_inputs_histogram, supports_autograd=False, skips=( @@ -19410,20 +19410,20 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('histogramdd', dtypes=floating_types(), - dtypesIfCUDA=_dispatch_dtypes(), # histogramdd is only implemented on CPU + dtypesIfGPU=_dispatch_dtypes(), # histogramdd is only implemented on CPU sample_inputs_func=sample_inputs_histogramdd, error_inputs_func=error_inputs_histogramdd, supports_autograd=False, skips=( # Not implemented on CUDA - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_errors', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_errors', device_type=GPU_TYPE), # JIT tests don't work with Tensor keyword arguments # https://github.com/pytorch/pytorch/issues/58507 DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'), )), OpInfo('histc', dtypes=floating_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64), + dtypesIfGPU=floating_types_and(torch.int8, torch.uint8, torch.int16, torch.int32, torch.int64), sample_inputs_func=sample_inputs_histc, supports_out=True, supports_autograd=False, @@ -19431,7 +19431,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # CUDA histc returns a float tensor but does not correctly warn when passed an integral out tensor # "AssertionError: RuntimeError not raised : Expected RuntimeError when doing an unsafe cast # from a result of dtype torch.float32 into an out= with dtype torch.long" - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out', device_type=GPU_TYPE), )), OpInfo('bincount', dtypes=integral_types_and(), @@ -19445,7 +19445,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('bucketize', dtypes=all_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16), + dtypesIfGPU=all_types_and(torch.bfloat16, torch.float16), sample_inputs_func=sample_inputs_bucketize, reference_inputs_func=reference_inputs_bucketize, error_inputs_func=error_inputs_bucketize, @@ -19456,7 +19456,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('searchsorted', dtypes=all_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=all_types_and(torch.bfloat16, torch.float16), + dtypesIfGPU=all_types_and(torch.bfloat16, torch.float16), sample_inputs_func=sample_inputs_searchsorted, supports_autograd=False, ref=reference_searchsorted, @@ -19554,7 +19554,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_unfold), OpInfo('msort', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.float16, torch.bfloat16), check_batched_gradgrad=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -19562,7 +19562,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( # https://github.com/pytorch/pytorch/issues/139972 DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_non_standard_bool_values', - dtypes=[torch.bool], device_type='cuda', active_if=TEST_WITH_ROCM), + dtypes=[torch.bool], device_type=GPU_TYPE, active_if=TEST_WITH_ROCM), )), 
OpInfo('movedim', aliases=('moveaxis',), @@ -19703,7 +19703,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_resize_ops), OpInfo('take_along_dim', dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_inplace_autograd=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -19734,7 +19734,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=[ DecorateInfo( toleranceOverride({torch.half: tol(atol=9e-4, rtol=4.3e-3)}), - 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda' + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE ), ], sample_inputs_func=sample_trapezoid), @@ -19748,7 +19748,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): decorators=[ DecorateInfo( toleranceOverride({torch.half: tol(atol=9e-4, rtol=4.3e-3)}), - 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda' + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE ), ], sample_inputs_func=sample_trapezoid), @@ -19838,7 +19838,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): reference_inputs_func=reference_inputs_logsumexp), OpInfo('trace', dtypes=all_types_and_complex(), - dtypesIfCUDA=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.chalf, torch.bool, torch.half, torch.bfloat16), error_inputs_func=error_inputs_trace, supports_inplace_autograd=False, supports_out=False, @@ -19968,7 +19968,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('kron', dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, supports_inplace_autograd=False, @@ -19981,7 +19981,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): )), OpInfo('inner', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfROCM=floating_and_complex_types_and(torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -19991,7 +19991,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ), OpInfo('tensordot', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), dtypesIfROCM=floating_and_complex_types_and(torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -20009,7 +20009,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_to_sparse, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), backward_dtypes=floating_types(), - backward_dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + 
backward_dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_sparse_csr=True, supports_sparse_csc=True, @@ -20037,12 +20037,12 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo('logcumsumexp', dtypes=floating_and_complex_types_and(torch.bfloat16, torch.half), backward_dtypes=floating_and_complex_types_and(torch.bfloat16), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.bfloat16), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, skips=( # AssertionError: UserWarning not triggered : Resized a non-empty tensor but did not warn about it. - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_out_warning', device_type=GPU_TYPE), # RuntimeError: "max_values_cpu" not implemented for 'ComplexDouble' # Falling back to non-numerically stablized exp, causing nan in the results. DecorateInfo(unittest.expectedFailure, 'TestFwdGradients', 'test_forward_mode_AD', dtypes=[torch.complex128]), @@ -20072,7 +20072,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large', dtypes=[torch.chalf, torch.complex64, torch.cdouble])), dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.complex32, torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.complex32, torch.bool, torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, promotes_int_to_float=True, @@ -20087,7 +20087,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('special.psi', 'special.digamma',), decorators=(precisionOverride({torch.float16: 5e-1}),), dtypes=all_types_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, promotes_int_to_float=True), @@ -20129,7 +20129,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch.bfloat16: 1e-2, torch.float32: 1e-4}),), dtypes=all_types_and(torch.bool, torch.half, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.half, torch.bfloat16), supports_sparse_csr=True, supports_sparse_csc=True, supports_sparse_bsr=True, @@ -20152,8 +20152,8 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): sample_inputs_func=sample_inputs_smooth_l1_loss, dtypes=floating_types_and(torch.float16, torch.bfloat16), backward_dtypes=floating_types_and(torch.bfloat16), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), - backward_dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), + backward_dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -20186,7 +20186,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): aliases=('special.gammaln', ), decorators=(precisionOverride({torch.float16: 7e-1}),), dtypes=all_types_and(torch.bool, torch.half, torch.bfloat16), 
- dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.bool, torch.half, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, promotes_int_to_float=True, @@ -20460,7 +20460,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): "norm", sample_inputs_func=sample_inputs_norm, dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), # TODO Benchmark again with the new implementation # Runs very slowly on slow gradcheck - alternatively reduce input sizes gradcheck_fast_mode=True, @@ -20484,7 +20484,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_forward_ad=True, supports_fwgrad_bwgrad=True, dtypes=floating_and_complex_types(), - dtypesIfCUDA=floating_and_complex_types(), + dtypesIfGPU=floating_and_complex_types(), skips=( # Dispatches in Python to matrix_norm. Not sure how to make this test happy DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', @@ -20494,7 +20494,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name='fro', sample_inputs_func=sample_inputs_norm_fro, dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), supports_forward_ad=True, # torch.autograd.gradcheck.GradcheckError: While computing batched gradients # got: Could not allocate memory to change Tensor SizesAndStrides! @@ -20523,7 +20523,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): variant_test_name="inf", sample_inputs_func=sample_inputs_norm_inf, dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), supports_forward_ad=True, check_batched_forward_grad=False, supports_fwgrad_bwgrad=True, @@ -20532,7 +20532,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( DecorateInfo( toleranceOverride({torch.float16: tol(atol=2e-3, rtol=1e-3)}), - 'TestInductorOpInfo', 'test_comprehensive', device_type='cuda', + 'TestInductorOpInfo', 'test_comprehensive', device_type=GPU_TYPE, ), # Dispatches in Python to vector_norm. Not sure how to make this test happy # Happens to pass on complex64. 
Also a mystery @@ -20581,7 +20581,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # inplace variant dispatches to dropout kernel, while on CUDA # the op dispatches to _fused_dropout (with a few more conditions) # hence, different values and this skip here - DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'), + DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type=GPU_TYPE), DecorateInfo(unittest.skip('output is non-deterministic'), 'TestCommon', 'test_compare_cpu')), supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -20596,7 +20596,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): op=torch.ops.aten.native_dropout_backward.default, aten_name="native_dropout_backward", dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - dtypesIfCUDA=floating_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=floating_types_and(torch.float16, torch.bfloat16), supports_out=False, sample_inputs_func=sample_inputs_dropout_backward, skips=( @@ -20668,7 +20668,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # AssertionError: Tensor-likes are not close! # Fails in cuda11.7 # Error Log: https://github.com/pytorch/pytorch/actions/runs/3440108478/jobs/5738475757 - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type='cuda'), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type=GPU_TYPE), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),),), # In training mode, feature_alpha_dropout currently doesn't support inputs of complex dtype # unlike when `train=False`, it supports complex inputs, hence 2 OpInfos to cover all cases @@ -20741,7 +20741,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # Fails on CI https://github.com/pytorch/pytorch/issues/85377 DecorateInfo(unittest.skip('Skipped!'), 'TestCommon', 'test_compare_cpu'), # Reference: https://github.com/pytorch/pytorch/issues/67084 - DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'), + DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type=GPU_TYPE), # Not a problem: embedding does weird stuff to its input (it renormalizes) DecorateInfo(unittest.skip('Allowed exemption'), 'TestCompositeCompliance', 'test_operator'), # Fails due to non-determinism (see issue #74679) @@ -20760,9 +20760,9 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # is tested in gradient tests. 
op=lambda weight, idx, **kwargs: torch.nn.functional.embedding_bag(idx, weight, **kwargs), dtypes=floating_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16), + dtypesIfGPU=floating_types_and(torch.bfloat16, torch.float16), # backward is not supported for mode `max` and dtype `bfloat16` - backward_dtypesIfCUDA=floating_types_and(torch.float16), + backward_dtypesIfGPU=floating_types_and(torch.float16), sample_inputs_func=sample_inputs_embedding_bag, skips=( # lambda impl @@ -20996,7 +20996,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( # AssertionError: RuntimeError not raised : Expected RuntimeError when doing an unsafe cast from a result # of dtype torch.float32 into an out= with dtype torch.long - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out', device_type='cuda', dtypes=[torch.float32]), + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_out', device_type=GPU_TYPE, dtypes=[torch.float32]), # FIXME: mean does not support passing keepdim without passing dim DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_dim_default_keepdim'), # FIXME: mean reduces all dimensions when dim=[] @@ -21006,7 +21006,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_small_input', dtypes=[torch.float16]), DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_extremal_values', - device_type='cuda', dtypes=[torch.complex64]), + device_type=GPU_TYPE, dtypes=[torch.complex64]), ), ), ReductionOpInfo( @@ -21018,7 +21018,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): check_batched_forward_grad=False, supports_fwgrad_bwgrad=True, dtypes=floating_types_and(torch.float16, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), + dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_nan_reduction(supports_multiple_dims=True), ref=reference_reduction_numpy(np.nanmean), skips=( @@ -21032,9 +21032,9 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_small_input', dtypes=[torch.float16]), DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_duplicate_values', - device_type='cuda', dtypes=[torch.float16]), + device_type=GPU_TYPE, dtypes=[torch.float16]), DecorateInfo(unittest.skip("Skipped!"), 'TestReductions', 'test_ref_extremal_values', - device_type='cuda', dtypes=[torch.complex64]), + device_type=GPU_TYPE, dtypes=[torch.complex64]), ), ), ReductionOpInfo( @@ -21048,7 +21048,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_float=True, check_batched_forward_grad=False, dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_std_var, ref=reference_std_var(np.std), generate_args_kwargs=generate_std_var_kwargs, @@ -21077,7 +21077,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_float=True, check_batched_forward_grad=False, dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), - 
dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_std_var_unbiased, skips=( # FIXME: dim=[] reduces all dimensions @@ -21096,7 +21096,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_fwgrad_bwgrad=True, check_batched_forward_grad=False, dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_std_var, ref=reference_std_var(np.var), generate_args_kwargs=generate_std_var_kwargs, @@ -21125,7 +21125,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_float=True, check_batched_forward_grad=False, dtypes=floating_and_complex_types_and(torch.half, torch.bfloat16), - dtypesIfCUDA=floating_and_complex_types_and(torch.half, torch.bfloat16), + dtypesIfGPU=floating_and_complex_types_and(torch.half, torch.bfloat16), sample_inputs_func=sample_inputs_std_var_unbiased, skips=( # FIXME: dim=[] reduces all dimensions @@ -21146,7 +21146,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): promotes_int_to_int64=True, gradcheck_nondet_tol=GRADCHECK_NONDET_TOL, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_prod, ref=prod_numpy, skips=( @@ -21176,7 +21176,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): supports_fwgrad_bwgrad=True, promotes_int_to_int64=True, dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), ref=reference_reduction_numpy(np.sum), error_inputs_sparse_func=error_inputs_sparse_reduction_sum, sample_inputs_sparse_coo_func=partial(sample_inputs_sparse_reduction_sum, layout=torch.sparse_coo), @@ -21209,7 +21209,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): check_batched_forward_grad=False, supports_fwgrad_bwgrad=True, dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_nan_reduction(supports_multiple_dims=True), ref=reference_reduction_numpy(np.nansum), skips=( @@ -21293,7 +21293,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): unittest.skip("Skipped!"), "TestCompositeCompliance", "test_cow_input", - device_type='cuda', + device_type=GPU_TYPE, ), DecorateInfo(unittest.skip("FP16 nll_loss cases have not been enabled on MPS yet"), dtypes=(torch.half,), device_type="mps"), @@ -21372,7 +21372,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): OpInfo( "argsort", dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.bool, torch.float16, torch.bfloat16), + 
dtypesIfGPU=all_types_and(torch.bool, torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_sort, supports_out=False, supports_autograd=False, @@ -21388,14 +21388,14 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): "TestCommon", "test_non_standard_bool_values", dtypes=[torch.bool], - device_type='cuda', + device_type=GPU_TYPE, ), ), ), OpInfo( "repeat_interleave", dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16, torch.chalf), - backward_dtypesIfCUDA=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), + backward_dtypesIfGPU=floating_and_complex_types_and(torch.float16, torch.bfloat16, torch.chalf), sample_inputs_func=sample_inputs_repeat_interleave, supports_out=False, supports_forward_ad=True, @@ -21492,7 +21492,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ref=lambda input, offset=0: np.diagflat(input, k=offset), sample_inputs_func=sample_inputs_diagflat, dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16, torch.float16), - dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -21515,7 +21515,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # complex not added to dtypes as complex gradients are not properly handled # and scatter_reduce hasn't been added to the whitelist in gen_variable_type yet dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), sample_inputs_func=sample_inputs_scatter_reduce, skips=( @@ -21532,7 +21532,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): # and scatter_reduce hasn't been added to the whitelist in gen_variable_type yet dtypes=all_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, sample_inputs_func=sample_inputs_scatter_reduce, @@ -21541,7 +21541,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): 'scatter_reduce', variant_test_name='amin', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), supports_forward_ad=True, check_batched_forward_grad=False, @@ -21552,7 +21552,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): 'scatter_reduce', variant_test_name='amax', dtypes=all_types_and(torch.float16, torch.bfloat16, torch.bool), - dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), + dtypesIfGPU=all_types_and(torch.float16, torch.bfloat16), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), supports_forward_ad=True, check_batched_forward_grad=False, @@ -21624,21 +21624,21 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): skips=( DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_normal', - device_type='cuda', dtypes=[torch.cdouble], 
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble], active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             # Failing with wrong imaginary sign on at least some Windows jobs
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_small',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             # Failing with wrong imaginary sign on at least some Windows jobs
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_large',
@@ -21654,11 +21654,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_normal',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
@@ -21668,16 +21668,16 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             # Failing with wrong imaginary sign on at least some Windows jobs
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_small',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
         ),
     ),
@@ -21687,7 +21687,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
         decorators=[
             DecorateInfo(
                 toleranceOverride({torch.float16: tol(atol=1e-05, rtol=1e-03)}),
-                'TestUnaryUfuncs', device_type='cuda'),
+                'TestUnaryUfuncs', device_type=GPU_TYPE),
             precisionOverride({torch.bfloat16: 1e-2}),
         ],
         skips=(
@@ -21699,11 +21699,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
         ),
     ),
@@ -21726,11 +21726,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         device_type='cuda', dtypes=[torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cdouble],
                          active_if=IS_WINDOWS),
         ),
     ),
@@ -21829,7 +21829,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          'test_python_ref'),
             DecorateInfo(unittest.skip("Expected: geometric is not comparable"),
                          'TestCommon',
-                         'test_python_ref_executor', device_type='cuda'),
+                         'test_python_ref_executor', device_type=GPU_TYPE),
             # AssertionError: Tensor-likes are not close!
             DecorateInfo(unittest.skip("Expected: geometric is not comparable"),
@@ -21856,7 +21856,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          'test_python_ref'),
             DecorateInfo(unittest.skip("Expected: log_normal is not comparable"),
                          'TestCommon',
-                         'test_python_ref_executor', device_type='cuda'),
+                         'test_python_ref_executor', device_type=GPU_TYPE),
             # AssertionError: Tensor-likes are not close!
             DecorateInfo(unittest.skip("Expected: log_normal is not comparable"),
@@ -22163,11 +22163,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cfloat, torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         device_type='cuda', dtypes=[torch.cfloat, torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble],
                          active_if=IS_WINDOWS),
         ),
     ),
@@ -22187,11 +22187,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cfloat, torch.cdouble],
+                         device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble],
                          active_if=IS_WINDOWS),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         device_type='cuda', dtypes=[torch.cfloat],
+                         device_type=GPU_TYPE, dtypes=[torch.cfloat],
                          active_if=IS_WINDOWS),
         ),
     ),
@@ -22230,7 +22230,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
             # This fails on CUDA but passes on ROCm
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         dtypes=(torch.cdouble,), device_type='cuda'),
+                         dtypes=(torch.cdouble,), device_type=GPU_TYPE),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                          dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS),
@@ -22243,7 +22243,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
             # Greatest relative difference: nan at index (700,) (up to 0.001 allowed)
             DecorateInfo(unittest.expectedFailure,
                          'TestUnaryUfuncs', 'test_reference_numerics_large',
-                         device_type='cuda',
+                         device_type=GPU_TYPE,
                          dtypes=(torch.chalf,), active_if=IS_WINDOWS),
         ),
     ),
@@ -22277,7 +22277,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
             # Greatest relative difference: nan at index (6000,) (up to 0.001 allowed)
             DecorateInfo(unittest.expectedFailure,
                          'TestUnaryUfuncs', 'test_reference_numerics_large',
-                         device_type='cuda',
+                         device_type=GPU_TYPE,
                          dtypes=(torch.chalf,), active_if=IS_WINDOWS),
         ),
     ),
@@ -22623,7 +22623,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
             # Fails on CUDA but passes on ROCm
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_large',
-                         dtypes=(torch.cdouble,), device_type='cuda'),
+                         dtypes=(torch.cdouble,), device_type=GPU_TYPE),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs', 'test_reference_numerics_extremal',
                          dtypes=(torch.cfloat, torch.cdouble,), device_type='cpu',
@@ -22707,7 +22707,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          dtypes=[torch.cfloat, torch.cdouble]),
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
                          'test_reference_numerics_extremal',
-                         device_type='cuda', dtypes=[torch.cfloat, torch.cdouble]),
+                         device_type=GPU_TYPE, dtypes=[torch.cfloat, torch.cdouble]),
         ),
     ),
     ElementwiseUnaryPythonRefInfo(
@@ -22716,7 +22716,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
         decorators=[
             DecorateInfo(
                 toleranceOverride({torch.complex64: tol(atol=1e-04, rtol=1e-05)}),
-                'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type='cuda'),
+                'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type=GPU_TYPE),
         ],
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
@@ -22735,7 +22735,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
         decorators=[
             DecorateInfo(
                 toleranceOverride({torch.complex64: tol(atol=1e-04, rtol=2e-05)}),
-                'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type='cuda'),
+                'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type=GPU_TYPE),
         ],
         skips=(
             DecorateInfo(unittest.skip("Skipped!"), 'TestUnaryUfuncs',
@@ -22789,7 +22789,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                          'test_python_ref_torch_fallback'),
             DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
                          'TestCommon',
-                         'test_python_ref_executor', device_type='cuda'),
+                         'test_python_ref_executor', device_type=GPU_TYPE),
             # AssertionError: Tensor-likes are not close!
             DecorateInfo(unittest.skip("Expected: dropout is not comparable"),
                          'TestMathBits',
@@ -22855,7 +22855,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                     torch.float16: tol(atol=1e-03, rtol=1.2e-03),
                     torch.bfloat16: tol(atol=1e-03, rtol=1.2e-03)
                 }),
-                'TestUnaryUfuncs', device_type='cuda',
+                'TestUnaryUfuncs', device_type=GPU_TYPE,
             ),
         ],
     ),
     ElementwiseUnaryPythonRefInfo(
@@ -22960,7 +22960,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
                     torch.float16: tol(atol=1e-2, rtol=1.8e-2),
                     torch.bfloat16: tol(atol=1e-2, rtol=1.8e-2)
                 }),
-                'TestUnaryUfuncs', device_type='cuda',
+                'TestUnaryUfuncs', device_type=GPU_TYPE,
            ),
        ],
    ),
    PythonRefInfo(
@@ -23040,7 +23040,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
            DecorateInfo(
                toleranceOverride({torch.bfloat16: tol(atol=1e-02, rtol=1.6e-02),
                                   torch.complex64: tol(atol=6e-04, rtol=1e-05)}),
-                'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type='cuda'),
+                'TestUnaryUfuncs', 'test_reference_numerics_extremal', device_type=GPU_TYPE),
        ],
        skips=(
            # in each case, pytorch will produce a nan while numpy will not
@@ -23400,13 +23400,13 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
            # than the torch result was (nan)!
            DecorateInfo(
                unittest.expectedFailure, 'TestCommon', 'test_python_ref',
-                dtypes=(torch.complex32,), device_type='cuda'
+                dtypes=(torch.complex32,), device_type=GPU_TYPE
            ),
            # Reference result was farther (0.0) from the precise computation
            # than the torch result was (nan)!
            DecorateInfo(
                unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback',
-                dtypes=(torch.complex32,), device_type='cuda'
+                dtypes=(torch.complex32,), device_type=GPU_TYPE
            ),
        )
    ),
@@ -24591,7 +24591,7 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs):
        op=lambda self, condition, other: refs.where(condition, self, other),
        supports_out=False,
        skips=(
-            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors', device_type='cuda'),
+            DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_errors', device_type=GPU_TYPE),
        ),
    ),
    PythonRefInfo(
diff --git a/torch/testing/_internal/opinfo/core.py b/torch/testing/_internal/opinfo/core.py
index b861af5b9c50d..cc8fd6b71a4e0 100644
--- a/torch/testing/_internal/opinfo/core.py
+++ b/torch/testing/_internal/opinfo/core.py
@@ -731,6 +731,9 @@ class OpInfo:
     # the following dtypesIf... options override the dtypes value on their respective device types
+    # dtypes this function is expected to work with on GPUs defined in GPU_TYPES
+    dtypesIfGPU: _dispatch_dtypes = None
+
     # dtypes this function is expected to work with on CUDA
     dtypesIfCUDA: _dispatch_dtypes = None
@@ -745,6 +748,9 @@ class OpInfo:
     # backward dtypes this function is expected to work with
     backward_dtypes: _dispatch_dtypes = None
 
+    # backward dtypes this function is expected to work with on GPUs defined in GPU_TYPES
+    backward_dtypesIfGPU: _dispatch_dtypes = None
+
     # backward dtypes this function is expected to work with on CUDA
     backward_dtypesIfCUDA: _dispatch_dtypes = None
@@ -914,6 +920,7 @@ def __post_init__(self):
         dtypes_args = (
             self.dtypes,
+            self.dtypesIfGPU,
             self.dtypesIfCUDA,
             self.dtypesIfROCM,
             self.dtypesIfXPU,
@@ -943,6 +950,25 @@ def __post_init__(self):
 
         self.dtypes = set(self.dtypes)
 
+        self.dtypesIfGPU = (
+            set(self.dtypesIfGPU) if self.dtypesIfGPU is not None else self.dtypes
+        )
+        self.dtypesIfCUDA = (
+            set(self.dtypesIfCUDA) if self.dtypesIfCUDA is not None else self.dtypesIfGPU
+        )
+        self.dtypesIfROCM = (
+            set(self.dtypesIfROCM)
+            if self.dtypesIfROCM is not None
+            else self.dtypesIfCUDA
+        )
+        self.dtypesIfXPU = (
+            set(self.dtypesIfXPU) if self.dtypesIfXPU is not None else self.dtypesIfGPU
+        )
+
+        self.dtypesIfHpu = (
+            set(self.dtypesIfHpu) if self.dtypesIfHpu is not None else self.dtypes
+        )
+
         # NOTE: backward dtypes must be acquired before forward dtypes
         # since they fallback to explicit (not implicit!) specifications of
         # forward dtypes
@@ -950,14 +976,25 @@ def __post_init__(self):
             set(self.backward_dtypesIfROCM)
             if self.backward_dtypesIfROCM is not None
             else (
-                self.backward_dtypesIfCUDA
-                if self.backward_dtypesIfCUDA is not None
+                self.backward_dtypesIfGPU
+                if self.backward_dtypesIfGPU is not None
                 else self.backward_dtypes
                 if self.backward_dtypes is not None
                 else self.dtypesIfROCM
                 if self.dtypesIfROCM is not None
-                else self.dtypesIfCUDA
-                if self.dtypesIfCUDA is not None
+                else self.dtypesIfGPU
+                if self.dtypesIfGPU is not None
                 else self.dtypes
             )
         )
+        self.backward_dtypesIfGPU = (
+            set(self.backward_dtypesIfGPU)
+            if self.backward_dtypesIfGPU is not None
+            else (
+                self.backward_dtypes
+                if self.backward_dtypes is not None
+                else self.dtypesIfGPU
+                if self.dtypesIfGPU is not None
+                else self.dtypes
+            )
+        )
@@ -965,7 +1002,9 @@ def __post_init__(self):
             set(self.backward_dtypesIfCUDA)
             if self.backward_dtypesIfCUDA is not None
             else (
-                self.backward_dtypes
+                self.backward_dtypesIfGPU
+                if self.backward_dtypesIfGPU is not None
+                else self.backward_dtypes
                 if self.backward_dtypes is not None
                 else self.dtypesIfCUDA
                 if self.dtypesIfCUDA is not None
@@ -981,28 +1020,25 @@ def __post_init__(self):
                 else self.dtypes
             )
         )
+        self.backward_dtypesIfXPU = (
+            set(self.backward_dtypesIfXPU)
+            if self.backward_dtypesIfXPU is not None
+            else (
+                self.backward_dtypesIfGPU
+                if self.backward_dtypesIfGPU is not None
+                else self.backward_dtypes
+                if self.backward_dtypes is not None
+                else self.dtypesIfXPU
+                if self.dtypesIfXPU is not None
+                else self.dtypes
+            )
+        )
         self.backward_dtypes = (
             set(self.backward_dtypes)
             if self.backward_dtypes is not None
             else self.dtypes
-        )
-
-        self.dtypesIfCUDA = (
-            set(self.dtypesIfCUDA) if self.dtypesIfCUDA is not None else self.dtypes
-        )
-        self.dtypesIfROCM = (
-            set(self.dtypesIfROCM)
-            if self.dtypesIfROCM is not None
-            else self.dtypesIfCUDA
-        )
-        self.dtypesIfXPU = (
-            set(self.dtypesIfXPU) if self.dtypesIfXPU is not None else self.dtypesIfCUDA
-        )
-
-        self.dtypesIfHpu = (
-            set(self.dtypesIfHpu) if self.dtypesIfHpu is not None else self.dtypes
-        )
+        )
 
         # NOTE: if the op is unspecified it is assumed to be under the torch namespace
         if not self.op:
@@ -1546,6 +1582,8 @@ def supported_backward_dtypes(self, device_type):
             )
         elif device_type == "hpu":
             backward_dtypes = self.backward_dtypesIfHpu
+        elif device_type == "xpu":
+            backward_dtypes = self.backward_dtypesIfXPU
         else:
             backward_dtypes = self.backward_dtypes
@@ -3008,6 +3046,7 @@ def __init__(
         dtypesIfCUDA=None,
         dtypesIfROCM=None,
         dtypesIfXPU=None,
+        dtypesIfGPU=None,
         sample_inputs_func=None,
         **kwargs,
     ):
@@ -3017,6 +3056,7 @@ def __init__(
             dtypesIfCUDA=dtypesIfCUDA,
             dtypesIfROCM=dtypesIfROCM,
             dtypesIfXPU=dtypesIfXPU,
+            dtypesIfGPU=dtypesIfGPU,
            sample_inputs_func=sample_inputs_func,
            **kwargs,
        )
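
Note for reviewers: the sketch below is a minimal, standalone illustration of the dtype fallback order that OpInfo.__post_init__ applies after this patch. The helper name resolve_dtypes and the sample dtype sets are hypothetical and exist only for illustration; only the fallback chain (dtypesIfGPU defaults to dtypes; dtypesIfCUDA and dtypesIfXPU default to dtypesIfGPU; dtypesIfROCM defaults to dtypesIfCUDA) mirrors the change. GPU_TYPE, imported at the top of the patch, is expected to resolve to whichever GPU backend the test run targets (e.g. "cuda" or "xpu"), so a single dtypesIfGPU entry and a single device_type=GPU_TYPE decorator can cover both backends while explicit per-backend overrides still win.

    # Illustrative only -- not part of the patch.
    import torch

    def resolve_dtypes(dtypes, dtypesIfGPU=None, dtypesIfCUDA=None,
                       dtypesIfROCM=None, dtypesIfXPU=None):
        # Mirrors the fallback order added to OpInfo.__post_init__:
        # GPU <- dtypes, CUDA <- GPU, ROCM <- CUDA, XPU <- GPU.
        gpu = set(dtypesIfGPU) if dtypesIfGPU is not None else set(dtypes)
        cuda = set(dtypesIfCUDA) if dtypesIfCUDA is not None else gpu
        rocm = set(dtypesIfROCM) if dtypesIfROCM is not None else cuda
        xpu = set(dtypesIfXPU) if dtypesIfXPU is not None else gpu
        return {"gpu": gpu, "cuda": cuda, "rocm": rocm, "xpu": xpu}

    # Example: an entry that, like 'abs' above, adds torch.bool on GPU only.
    resolved = resolve_dtypes(
        dtypes={torch.float32, torch.float16},
        dtypesIfGPU={torch.float32, torch.float16, torch.bool},
    )
    # CUDA and XPU both inherit the GPU set unless they override it themselves.
    assert resolved["cuda"] == resolved["xpu"] == resolved["gpu"]

The backward_dtypesIfGPU field follows the same pattern on the backward side (falling back to backward_dtypes, then dtypesIfGPU, then dtypes), which is why supported_backward_dtypes gains the "xpu" branch above.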