diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index e591879060f30..9587fad1ecd63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -134,6 +134,22 @@ def combine_fmul_with_select_to_fldexp : GICombineRule<
    [{ return Helper.matchCombineFmulWithSelectToFldexp(*${root}, *${sel}, ${matchinfo}); }]),
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
 
+// (shift x, (zext amt)) -> (shift x, (and (anyext amt), mask))
+//
+// The pattern is longer, but is better for matching during ISel.
+class canonicalize_zext_shift_amt<Instruction opc> : GICombineRule<
+  (defs root:$dst),
+  (match (G_ZEXT $amt, $amtsrc):$zext,
+         (opc $dst, $src, $amt):$shift),
+  (apply [{ applyCanonicalizeZextShiftAmt(*${shift}, *${zext}); }])>;
+
+def canonicalize_zext_lshr : canonicalize_zext_shift_amt<G_LSHR>;
+def canonicalize_zext_ashr : canonicalize_zext_shift_amt<G_ASHR>;
+def canonicalize_zext_shl : canonicalize_zext_shift_amt<G_SHL>;
+
+def zext_of_shift_amount_combines : GICombineGroup<[
+  canonicalize_zext_lshr, canonicalize_zext_ashr, canonicalize_zext_shl
+]>;
 
 let Predicates = [Has16BitInsts, NotHasMed3_16] in {
 // For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
@@ -182,5 +198,5 @@ def AMDGPURegBankCombiner : GICombiner<
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
    fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
    identity_combines, redundant_and, constant_fold_cast_op,
-   cast_of_cast_combines, sext_trunc]> {
+   cast_of_cast_combines, sext_trunc, zext_of_shift_amount_combines]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 8f9ad38d101a1..d41ea17cd6f05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -87,6 +87,8 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
   void applyClamp(MachineInstr &MI, Register &Reg) const;
 
+  void applyCanonicalizeZextShiftAmt(MachineInstr &MI, MachineInstr &Ext) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -362,6 +364,34 @@ void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
   MI.eraseFromParent();
 }
 
+void AMDGPURegBankCombinerImpl::applyCanonicalizeZextShiftAmt(
+    MachineInstr &MI, MachineInstr &Ext) const {
+  unsigned ShOpc = MI.getOpcode();
+  assert(ShOpc == AMDGPU::G_SHL || ShOpc == AMDGPU::G_LSHR ||
+         ShOpc == AMDGPU::G_ASHR);
+  assert(Ext.getOpcode() == AMDGPU::G_ZEXT);
+
+  Register AmtReg = Ext.getOperand(1).getReg();
+  Register ShDst = MI.getOperand(0).getReg();
+  Register ShSrc = MI.getOperand(1).getReg();
+
+  LLT ExtAmtTy = MRI.getType(Ext.getOperand(0).getReg());
+  LLT AmtTy = MRI.getType(AmtReg);
+
+  auto &RB = *MRI.getRegBank(AmtReg);
+
+  auto NewExt = B.buildAnyExt(ExtAmtTy, AmtReg);
+  auto Mask = B.buildConstant(
+      ExtAmtTy, maskTrailingOnes<uint32_t>(AmtTy.getScalarSizeInBits()));
+  auto And = B.buildAnd(ExtAmtTy, NewExt, Mask);
+  B.buildInstr(ShOpc, {ShDst}, {ShSrc, And});
+
+  MRI.setRegBank(NewExt.getReg(0), RB);
+  MRI.setRegBank(Mask.getReg(0), RB);
+  MRI.setRegBank(And.getReg(0), RB);
+  MI.eraseFromParent();
+}
+
 SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
new file mode 100644
index 0000000000000..77d30f6fa5223
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
+
+---
+name: lshr_zext_i16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: lshr_zext_i16
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+    ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32)
+    ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+    %src:sgpr(s32) = COPY $sgpr0
+    %regamt:sgpr(s32) = COPY $sgpr1
+    %amt:sgpr(s16) = G_TRUNC %regamt
+    %zextamt:sgpr(s32) = G_ZEXT %amt
+    %res:sgpr(s32) = G_LSHR %src, %zextamt
+    $sgpr0 = COPY %res
+...
+
+---
+name: ashr_zext_i16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: ashr_zext_i16
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+    ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32)
+    ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+    %src:sgpr(s32) = COPY $sgpr0
+    %regamt:sgpr(s32) = COPY $sgpr1
+    %amt:sgpr(s16) = G_TRUNC %regamt
+    %zextamt:sgpr(s32) = G_ZEXT %amt
+    %res:sgpr(s32) = G_ASHR %src, %zextamt
+    $sgpr0 = COPY %res
+...
+
+---
+name: shl_zext_i16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: shl_zext_i16
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+    ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32)
+    ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+    %src:sgpr(s32) = COPY $sgpr0
+    %regamt:sgpr(s32) = COPY $sgpr1
+    %amt:sgpr(s16) = G_TRUNC %regamt
+    %zextamt:sgpr(s32) = G_ZEXT %amt
+    %res:sgpr(s32) = G_SHL %src, %zextamt
+    $sgpr0 = COPY %res
+...
+
+---
+name: lshr_zext_i8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: lshr_zext_i8
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+    ; CHECK-NEXT: %res:sgpr(s32) = G_LSHR %src, [[AND]](s32)
+    ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+    %src:sgpr(s32) = COPY $sgpr0
+    %regamt:sgpr(s32) = COPY $sgpr1
+    %amt:sgpr(s8) = G_TRUNC %regamt
+    %zextamt:sgpr(s32) = G_ZEXT %amt
+    %res:sgpr(s32) = G_LSHR %src, %zextamt
+    $sgpr0 = COPY %res
+...
+
+---
+name: ashr_zext_i8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: ashr_zext_i8
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+    ; CHECK-NEXT: %res:sgpr(s32) = G_ASHR %src, [[AND]](s32)
+    ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+    %src:sgpr(s32) = COPY $sgpr0
+    %regamt:sgpr(s32) = COPY $sgpr1
+    %amt:sgpr(s8) = G_TRUNC %regamt
+    %zextamt:sgpr(s32) = G_ZEXT %amt
+    %res:sgpr(s32) = G_ASHR %src, %zextamt
+    $sgpr0 = COPY %res
+...
+
+---
+name: shl_zext_i8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+
+    ; CHECK-LABEL: name: shl_zext_i8
+    ; CHECK: liveins: $sgpr0, $sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %src:sgpr(s32) = COPY $sgpr0
+    ; CHECK-NEXT: %regamt:sgpr(s32) = COPY $sgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND %regamt, [[C]]
+    ; CHECK-NEXT: %res:sgpr(s32) = G_SHL %src, [[AND]](s32)
+    ; CHECK-NEXT: $sgpr0 = COPY %res(s32)
+    %src:sgpr(s32) = COPY $sgpr0
+    %regamt:sgpr(s32) = COPY $sgpr1
+    %amt:sgpr(s8) = G_TRUNC %regamt
+    %zextamt:sgpr(s32) = G_ZEXT %amt
+    %res:sgpr(s32) = G_SHL %src, %zextamt
+    $sgpr0 = COPY %res
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..18019fa31410e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3329,9 +3329,7 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v2
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3486,10 +3484,8 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    s_bfe_u32 s0, s1, 0xf0001
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    ; return to shader part epilog
@@ -3793,20 +3789,16 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3942,18 +3934,14 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
 ; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -4450,28 +4438,22 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v9, 15, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX6-NEXT:    v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_bfe_u32 v3, v4, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v8
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_bfe_u32 v3, v5, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -4790,37 +4772,29 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v12, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v12, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v9
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v9
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v10
 ; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_bfe_u32 v4, v6, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v11
 ; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v11
 ; GFX6-NEXT:    v_and_b32_e32 v5, 15, v5
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_bfe_u32 v4, v7, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 2e8c918e4c67e..d441a9a62fc1c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3077,11 +3077,9 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
 ; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3235,9 +3233,7 @@ define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3570,26 +3566,22 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 14, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v6, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 15, v5
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_bfe_u32 v2, v3, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v4
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3735,32 +3727,28 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX6-NEXT:    s_bfe_u32 s4, s2, 0xf0001
-; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 14
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    s_or_b32 s0, s0, s4
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 1
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    s_bfe_u32 s0, s2, 0xf0001
-; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_bfe_u32 s4, s3, 0xf0001
-; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
-; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
+; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 14
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 1
-; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT:    v_and_b32_e32 v2, 15, v1
+; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
 ; GFX6-NEXT:    s_or_b32 s1, s1, s4
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX6-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX6-NEXT:    s_bfe_u32 s0, s3, 0xf0001
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s1, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -4358,26 +4346,22 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 14, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GFX6-NEXT:    v_and_b32_e32 v9, 15, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX6-NEXT:    v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v9, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; GFX6-NEXT:    v_bfe_u32 v3, v4, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX6-NEXT:    v_bfe_u32 v3, v5, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
@@ -4388,9 +4372,7 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v5, 15, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX6-NEXT:    v_bfe_u32 v3, v3, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -4782,26 +4764,22 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 14, v10
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
 ; GFX6-NEXT:    v_and_b32_e32 v11, 15, v8
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v11, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v10
 ; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v10
-; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 1, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v8
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_bfe_u32 v4, v6, 1, 15
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v2
@@ -4818,20 +4796,16 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX6-NEXT:    v_and_b32_e32 v8, 15, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v6
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX6-NEXT:    v_bfe_u32 v4, v4, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v8, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 15, v7
 ; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v7
 ; GFX6-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_bfe_u32 v4, v5, 1, 15
-; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v5, v4
+; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;