From 8dff8965a00044485e7cdbdceebc227228836cf5 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 14 Oct 2024 18:56:47 +0100 Subject: [PATCH] [GlobalISel] Support vector G_UNMERGE_VALUES in computeKnownBits. This adds computeKnownBits support for vector->vector G_UNMERGE_VALUES, grabbing the known bits with an adjusted DemandedElts mask. --- .../lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 24 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 252 +++++---- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 500 +++++++++--------- .../GlobalISel/KnownBitsVectorTest.cpp | 45 ++ 4 files changed, 435 insertions(+), 386 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index 52f5d408c8edd..2c98b129a1a89 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -514,15 +514,12 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, break; } case TargetOpcode::G_UNMERGE_VALUES: { - if (DstTy.isVector()) - break; unsigned NumOps = MI.getNumOperands(); Register SrcReg = MI.getOperand(NumOps - 1).getReg(); - if (MRI.getType(SrcReg).isVector()) - return; // TODO: Handle vectors. + LLT SrcTy = MRI.getType(SrcReg); - KnownBits SrcOpKnown; - computeKnownBitsImpl(SrcReg, SrcOpKnown, DemandedElts, Depth + 1); + if (SrcTy.isVector() && SrcTy.getScalarType() != DstTy.getScalarType()) + return; // TODO: Handle vector->subelement unmerges // Figure out the result operand index unsigned DstIdx = 0; @@ -530,7 +527,20 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known, ++DstIdx) ; - Known = SrcOpKnown.extractBits(BitWidth, BitWidth * DstIdx); + APInt SubDemandedElts = DemandedElts; + if (SrcTy.isVector()) { + unsigned DstLanes = DstTy.isVector() ? DstTy.getNumElements() : 1; + SubDemandedElts = + DemandedElts.zext(SrcTy.getNumElements()).shl(DstIdx * DstLanes); + } + + KnownBits SrcOpKnown; + computeKnownBitsImpl(SrcReg, SrcOpKnown, SubDemandedElts, Depth + 1); + + if (SrcTy.isVector()) + Known = SrcOpKnown; + else + Known = SrcOpKnown.extractBits(BitWidth, BitWidth * DstIdx); break; } case TargetOpcode::G_BSWAP: { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 81abe91b283f9..0b5706aa45b69 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1184,73 +1184,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_trunc_f32_e32 v6, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6 +; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1265,40 +1266,39 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8 ; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v12, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9] -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 ; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] +; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9] ; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 @@ -1307,7 +1307,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 @@ -1318,95 +1318,93 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v14, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v6 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v10, v12, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index cfac0c2fa56aa..3ed864d463ee9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1112,73 +1112,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_trunc_f32_e32 v6, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6 +; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1191,40 +1192,39 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v8, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9] -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v13, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v4 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 ; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v1, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] +; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v4 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9] ; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 @@ -1233,7 +1233,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 @@ -1244,93 +1244,91 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v6 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 @@ -1707,73 +1705,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 +; GISEL-NEXT: v_trunc_f32_e32 v6, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v9 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v7, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v5, v[9:10] ; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v5, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v12, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v5, v4 ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v7, v6, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v12, v[4:5] +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v0, v6 +; GISEL-NEXT: v_mul_lo_u32 v0, v12, v8 +; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_xor_b32_e32 v13, v1, v6 ; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v1, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v9 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 @@ -1786,40 +1785,39 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_mul_hi_u32 v11, v13, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v8, v[1:2] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v8, v[1:2] ; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9] -; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb -; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v13, v8, vcc ; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v5 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v10, v4 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb ; GISEL-NEXT: v_cndmask_b32_e64 v12, -1, v1, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v5 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] +; GISEL-NEXT: s_subb_u32 s7, 0, 0 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v4 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v5, v[8:9] ; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0 +; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8 +; GISEL-NEXT: v_mul_hi_u32 v15, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc ; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 @@ -1828,7 +1826,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v13, v5, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 @@ -1839,93 +1837,91 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v5, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, v7, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s6, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], s7, v5, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v9 ; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v11, v5, v7 ; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v11, v5, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v0 ; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GISEL-NEXT: v_mul_lo_u32 v5, v12, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v2 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v6 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v2 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v5 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v12, v5 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5] ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp index dd6edd35a8468..2f3336e9085b6 100644 --- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp @@ -1548,3 +1548,48 @@ TEST_F(AArch64GISelMITest, TestNumSignBitsUAddoOverflow) { // Assert sign-extension from vector boolean EXPECT_EQ(32u, Info.computeNumSignBits(CopyOverflow)); } + +TEST_F(AArch64GISelMITest, TestKnwonBitsUnmergeVectorScalar) { + StringRef MIRString = R"( + %copy_x0:_(<2 x s16>) = COPY $w0 + %maskf:_(s16) = G_CONSTANT i16 15 + %x0_x1:_(<2 x s16>) = G_BUILD_VECTOR %maskf, %maskf + %and:_(<2 x s16>) = G_AND %copy_x0, %x0_x1 + %x0_0:_(s16), %x0_1:_(s16) = G_UNMERGE_VALUES %and + %result:_(s16) = COPY %x0_0 +)"; + + setUp(MIRString); + if (!TM) + GTEST_SKIP(); + + Register CopyOverflow = Copies[Copies.size() - 1]; + + GISelKnownBits Info(*MF); + + EXPECT_EQ(0xFFF0u, Info.getKnownBits(CopyOverflow).Zero.getZExtValue()); +} + +TEST_F(AArch64GISelMITest, TestKnwonBitsUnmergeVectorVector) { + StringRef MIRString = R"( + %copy_x0:_(<4 x s8>) = COPY $w0 + %maskff:_(s8) = G_CONSTANT i8 255 + %maskf:_(s8) = G_CONSTANT i8 15 + %x0_x1:_(<4 x s8>) = G_BUILD_VECTOR %maskf, %maskf, %maskff, %maskff + %and:_(<4 x s8>) = G_AND %copy_x0, %x0_x1 + %x0_0:_(<2 x s8>), %x0_1:_(<2 x s8>) = G_UNMERGE_VALUES %and + %result1:_(<2 x s8>) = COPY %x0_0 + %result2:_(<2 x s8>) = COPY %x0_1 +)"; + + setUp(MIRString); + if (!TM) + GTEST_SKIP(); + + GISelKnownBits Info(*MF); + + Register CopyOverflow1 = Copies[Copies.size() - 2]; + EXPECT_EQ(0xF0u, Info.getKnownBits(CopyOverflow1).Zero.getZExtValue()); + Register CopyOverflow2 = Copies[Copies.size() - 1]; + EXPECT_EQ(0x00u, Info.getKnownBits(CopyOverflow2).Zero.getZExtValue()); +}