AMDGPU: select v2f16 fminimum/fmaximum to the packed 3-operand instructions.

When the subtarget reports hasMinimum3Maximum3PKF16(), ISD::FMINIMUM and
ISD::FMAXIMUM on v2f16 are marked Legal. The two-operand nodes are then
matched to the three-operand packed instruction by passing src1 (together
with its source modifiers) as both the second and the third operand.

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a24b6430378cc..a212a9218ca0d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -859,6 +859,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
     if (Subtarget->hasMinimum3Maximum3F32())
       setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
+
+    if (Subtarget->hasMinimum3Maximum3PKF16())
+      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
   }
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ae5a6581a3b20..7d202de6643bc 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -255,6 +255,17 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
   >;
 }
 
+class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
+                                        Instruction inst> : GCNPat<
+  (v2f16 (node (VOP3PMods v2f16:$src0, i32:$src0_mods), (VOP3PMods v2f16:$src1, i32:$src1_mods))),
+  (inst $src0_mods, $src0, $src1_mods, $src1, $src1_mods, $src1)
+>;
+
+let SubtargetPredicate = HasMinimum3Maximum3PKF16 in {
+def : MinimumMaximumByMinimum3Maximum3VOP3P<fminimum, V_PK_MINIMUM3_F16>;
+def : MinimumMaximumByMinimum3Maximum3VOP3P<fmaximum, V_PK_MAXIMUM3_F16>;
+}
+
 let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
 
 // These are VOP3a-like opcodes which accept no omod.
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index e771e5801f2ed..f0fa621e3b4bc 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1772,30 +1772,38 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX940-NEXT: v_pk_max_f16 v1, 
v2, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0) ret <2 x half> %max1 @@ -1814,30 +1822,38 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v2f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 
s[30:31] +; GFX940-LABEL: v_fmaximum3_v2f16_commute: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v2f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) ret <2 x half> %max1 @@ -1859,32 +1875,43 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX9-NEXT: v_pk_max_f16 v3, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 
16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v2f16__fabs_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 +; GFX940-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 +; GFX940-NEXT: v_pk_max_f16 v3, v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_fmaximum3_v2f16__fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c) @@ -1906,30 +1933,38 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v2f16__fneg_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 
neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v2f16__fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b %c.fneg = fneg <2 x half> %c @@ -1951,30 +1986,38 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: 
v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v2f16__inlineimm1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> ) %max1 = call <2 x 
half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) ret <2 x half> %max1 @@ -1993,30 +2036,38 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { ; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v2f16__inlineimm2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v4, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: 
v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> ) ret <2 x half> %max1 @@ -2037,42 +2088,51 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v5, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v4, v2 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v3f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v5, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_fmaximum3_v3f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0) ret <3 x half> %max1 @@ -2093,42 +2153,51 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v3f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 
src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v3f16_commute: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v3f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 
v5, v5 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) ret <3 x half> %max1 @@ -2156,46 +2225,61 @@ define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v3f16__fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 -; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 -; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 -; GFX9-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v6, v6, v8 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v2, v8, v0, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v3f16__fabs_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 +; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 +; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 +; GFX940-NEXT: v_pk_max_f16 v7, v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX940-NEXT: v_perm_b32 v2, v8, v0, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v11 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_perm_b32 v6, v9, v1, s0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX940-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc +; 
GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v3f16__fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c) @@ -2219,42 +2303,51 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v3f16__fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v3f16__fneg_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, 
vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v3f16__fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b %c.fneg = fneg <3 x half> %c @@ -2278,39 +2371,48 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 -; GFX9-NEXT: s_mov_b32 s1, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX9-NEXT: s_movk_i32 s0, 0x7e00 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v4, v5, v0, s1 -; GFX9-NEXT: v_pk_max_f16 v4, v4, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD 
src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_pack_b32_f16 v7, v1, s0 -; GFX9-NEXT: v_pk_max_f16 v7, v7, v3 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v3f16__inlineimm1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v7, v1, 2.0 +; GFX940-NEXT: s_mov_b32 s1, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX940-NEXT: s_movk_i32 s0, 0x7e00 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX940-NEXT: v_perm_b32 v4, v5, v0, s1 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX940-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX940-NEXT: v_pk_max_f16 v7, v7, v3 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1: +; GFX950: ; %bb.0: 
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, 2.0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> ) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) ret <3 x half> %max1 @@ -2331,42 +2433,51 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s0 -; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 
-; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v3f16__inlineimm2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v4, s0 +; GFX940-NEXT: v_pk_max_f16 v1, v1, 4.0 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 4.0, 4.0 +; GFX950-NEXT: 
v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> ) ret <3 x half> %max1 @@ -2387,48 +2498,57 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v5, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v4, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 
src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v4f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v5, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v4, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: v_fmaximum3_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v5, v1, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0) ret <4 x half> %max1 @@ -2449,48 +2569,57 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v4f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: 
v_pk_max_f16 v2, v2, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v4f16_commute: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v5 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX940-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v4f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) ret <4 x half> %max1 @@ -2518,52 +2647,67 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v4f16__fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 -; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 -; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 -; GFX9-NEXT: v_pk_max_f16 v7, v7, v9 -; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v6, v6, v8 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc -; GFX9-NEXT: 
v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v2, v8, v1, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v11 -; GFX9-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v6, v6, v10 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v7, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v4f16__fabs_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 +; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 +; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 +; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; GFX940-NEXT: v_pk_max_f16 v7, v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v6, v6, v8 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX940-NEXT: s_nop 1 +; 
GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX940-NEXT: v_perm_b32 v2, v8, v1, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v11 +; GFX940-NEXT: v_perm_b32 v6, v9, v0, s0 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v6, v6, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX940-NEXT: v_perm_b32 v0, v7, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v4f16__fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c) @@ -2587,48 +2731,57 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x 
half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v4f16__fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX940-LABEL: v_fmaximum3_v4f16__fneg_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v4f16__fneg_all: +; GFX950: ; 
%bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b %c.fneg = fneg <4 x half> %c @@ -2652,46 +2805,55 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: v_perm_b32 v4, v8, v1, s0 -; GFX9-NEXT: v_pk_max_f16 v4, v4, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_pk_max_f16 v8, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD 
src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_perm_b32 v1, v7, v1, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v4f16__inlineimm1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX940-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX940-NEXT: v_pk_max_f16 v4, v4, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_perm_b32 v8, v5, v0, s0 +; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX940-NEXT: v_pk_max_f16 v8, v8, v2 +; GFX940-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; 
GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> ) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) ret <4 x half> %max1 @@ -2712,48 +2874,57 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v4, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: 
v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fmaximum3_v4f16__inlineimm2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_max_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v4, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: 
v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX940-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> ) ret <4 x half> %max1 @@ -3523,30 +3694,38 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX12-NEXT: v_pk_maximum_f16 v1, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_no_fmaximum3_v2f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v5, s0 -; GFX9-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 
v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_max_f16 v3, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s0 +; GFX940-NEXT: v_pk_max_f16 v3, v0, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_no_fmaximum3_v2f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.maximum.f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.maximum.f16(<2 x half> %max0, <2 x half> %c) %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 6a697aadbf3ba..7a8a224c76a83 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1772,30 +1772,38 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c ; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v2f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v2, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v2, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; 
GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v2f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v2, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0) ret <2 x half> %max1 @@ -1814,30 +1822,38 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v2f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v2f16_commute: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 +; 
GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v2f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) ret <2 x half> %max1 @@ -1859,32 +1875,43 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v2f16__fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 -; GFX9-NEXT: v_pk_min_f16 v3, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 -; 
GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v4, v0, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v2f16__fabs_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 +; GFX940-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1 +; GFX940-NEXT: v_pk_min_f16 v3, v3, v4 +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v4, v0, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v2f16__fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 
0x7fff7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c) @@ -1906,30 +1933,38 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v2f16__fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v5, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v2f16__fneg_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; 
GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v5, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v2f16__fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x half> %a %b.fneg = fneg <2 x half> %b %c.fneg = fneg <2 x half> %c @@ -1951,30 +1986,38 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v2f16__inlineimm1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v3, v0, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v1 -; 
GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v2f16__inlineimm1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v3, v0, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v1 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> ) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) ret <2 x half> %max1 @@ -1993,30 +2036,38 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) 
{ ; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v2f16__inlineimm2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v0, v4, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v2f16__inlineimm2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v0, v4, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, 
v1, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v2f16__inlineimm2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> ) ret <2 x half> %max1 @@ -2037,42 +2088,51 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c ; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v3f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v5, v1 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 
16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v3f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v5, v1 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v3f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; 
GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0) ret <3 x half> %max1 @@ -2093,42 +2153,51 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v3f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: 
v_fminimum3_v3f16_commute: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v3f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> 
@llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) ret <3 x half> %max1 @@ -2156,46 +2225,61 @@ define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v3f16__fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 -; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 -; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 -; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 -; GFX9-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v6, v6, v8 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v2, v8, v0, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v11 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_perm_b32 v6, v9, v1, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s0 -; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v3f16__fabs_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1 +; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3 +; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0 +; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2 +; GFX940-NEXT: v_pk_min_f16 v7, v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4 +; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX940-NEXT: v_perm_b32 v2, v8, v0, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v11 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_perm_b32 v6, v9, v1, s0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX940-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v3f16__fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c) @@ -2219,42 +2303,51 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v3f16__fneg_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: 
v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v3f16__fneg_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, 
v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v3f16__fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x half> %a %b.fneg = fneg <3 x half> %b %c.fneg = fneg <3 x half> %c @@ -2278,39 +2371,48 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v3f16__inlineimm1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 -; GFX9-NEXT: s_mov_b32 s1, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX9-NEXT: s_movk_i32 s0, 0x7e00 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v4, v5, v0, s1 -; GFX9-NEXT: v_pk_min_f16 v4, v4, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX9-NEXT: v_pack_b32_f16 v7, v1, s0 -; GFX9-NEXT: v_pk_min_f16 v7, v7, v3 
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v3f16__inlineimm1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v7, v1, 2.0 +; GFX940-NEXT: s_mov_b32 s1, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX940-NEXT: s_movk_i32 s0, 0x7e00 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX940-NEXT: v_perm_b32 v4, v5, v0, s1 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX940-NEXT: v_pack_b32_f16 v7, v1, s0 +; GFX940-NEXT: v_pk_min_f16 v7, v7, v3 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v3f16__inlineimm1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, 2.0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: 
v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> ) %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) ret <3 x half> %max1 @@ -2331,42 +2433,51 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v3f16__inlineimm2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s0 -; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v3f16__inlineimm2: 
+; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v4, s0 +; GFX940-NEXT: v_pk_min_f16 v1, v1, 4.0 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v3f16__inlineimm2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 4.0, 4.0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) %max1 = call <3 x half> 
@llvm.minimum.v3f16(<3 x half> %max0, <3 x half> ) ret <3 x half> %max1 @@ -2387,48 +2498,57 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c ; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v5, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v4, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX940-LABEL: v_fminimum3_v4f16: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v5, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v6 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v4, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v8 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v4f16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: 
v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v5, v1, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v4, v0, v0 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0) ret <4 x half> %max1 @@ -2449,48 +2569,57 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v4f16_commute: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; 
GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v4f16_commute: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v5 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v8, v4 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc 
+; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v4f16_commute: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) ret <4 x half> %max1 @@ -2518,52 +2647,67 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v4f16__fabs_all: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 -; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 -; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 -; GFX9-NEXT: v_pk_min_f16 v7, v7, v9 -; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v6, v6, v8 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 -; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v2, v8, v1, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v11 
-; GFX9-NEXT: v_perm_b32 v6, v9, v0, s0 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v6, v6, v10 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| -; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc -; GFX9-NEXT: v_perm_b32 v0, v7, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v4f16__fabs_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0 +; GFX940-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2 +; GFX940-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1 +; GFX940-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3 +; GFX940-NEXT: v_pk_min_f16 v7, v7, v9 +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v6, v6, v8 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5 +; GFX940-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc +; GFX940-NEXT: 
v_perm_b32 v2, v8, v1, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v11 +; GFX940-NEXT: v_perm_b32 v6, v9, v0, s0 +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v6, v6, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5| +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4| +; GFX940-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc +; GFX940-NEXT: v_perm_b32 v0, v7, v0, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v4f16__fabs_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX950-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c) @@ -2587,48 +2731,57 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v4f16__fneg_all: -; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v8, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] -; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v4f16__fneg_all: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; 
GFX940-NEXT: v_mov_b32_e32 v7, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v8, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX940-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v4f16__fneg_all: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 
neg_lo:[1,1,1] neg_hi:[1,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 neg_lo:[0,1,1] neg_hi:[0,1,1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <4 x half> %a %b.fneg = fneg <4 x half> %b %c.fneg = fneg <4 x half> %c @@ -2652,46 +2805,55 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v4f16__inlineimm1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc -; GFX9-NEXT: v_perm_b32 v4, v8, v1, s0 -; GFX9-NEXT: v_pk_min_f16 v4, v4, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v8, v5, v0, s0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_pk_min_f16 v8, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc 
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: v_perm_b32 v1, v7, v1, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v5, v0, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v4f16__inlineimm1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7e00 +; GFX940-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0] +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX940-NEXT: v_perm_b32 v4, v8, v1, s0 +; GFX940-NEXT: v_pk_min_f16 v4, v4, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: v_perm_b32 v8, v5, v0, s0 +; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX940-NEXT: v_pk_min_f16 v8, v8, v2 +; GFX940-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v8 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: v_perm_b32 v1, v7, v1, s0 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc +; GFX940-NEXT: v_perm_b32 v0, v5, v0, s0 
+; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 2.0, 2.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> ) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) ret <4 x half> %max1 @@ -2712,48 +2874,57 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { ; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 op_sel_hi:[1,0] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fminimum3_v4f16__inlineimm2: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v1, v4, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 -; GFX9-NEXT: s_nop 1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v2, v0, v6, s0 -; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 -; GFX9-NEXT: v_perm_b32 v1, v1, v3, s0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_fminimum3_v4f16__inlineimm2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX940-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: v_pk_min_f16 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v1, v4, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v4, v4 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v1, v1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v2, v0, v6, s0 +; GFX940-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v6, v6 +; GFX940-NEXT: v_perm_b32 v1, v1, v3, s0 
+; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc +; GFX940-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fminimum3_v4f16__inlineimm2: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, 4.0, 4.0 op_sel_hi:[1,0,0] +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> ) ret <4 x half> %max1 @@ -3523,30 +3694,38 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b, ; GFX12-NEXT: v_pk_minimum_f16 v1, v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_no_fminimum3_v2f16__multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v5, s0 -; GFX9-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: 
v_perm_b32 v1, v1, v5, s0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX940-LABEL: v_no_fminimum3_v2f16__multi_use: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_pk_min_f16 v3, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 +; GFX940-NEXT: s_mov_b32 s0, 0x5040100 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v0, v1, v5, s0 +; GFX940-NEXT: v_pk_min_f16 v3, v0, v2 +; GFX940-NEXT: v_cmp_o_f16_e32 vcc, v5, v2 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc +; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX940-NEXT: v_perm_b32 v1, v1, v5, s0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_no_fminimum3_v2f16__multi_use: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x half> @llvm.minimum.f16(<2 x half> %a, <2 x half> %b) %max1 = call <2 x half> @llvm.minimum.f16(<2 x half> %max0, <2 x half> %c) %concat = shufflevector <2 x half> %max0, <2 x half> %max1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index e782f53cee608..1d0367db70143 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -572,17 +572,7 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX950-LABEL: v_maximum_v2f16: ; GFX950: ; %bb.0: ; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16: @@ -650,11 +640,17 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nnan: ; GFX10: ; %bb.0: @@ -735,17 +731,7 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX950-LABEL: v_maximum_v2f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, 
v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nsz: @@ -813,11 +799,17 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v2f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v2f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v2f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -918,21 +910,9 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX950-LABEL: s_maximum_v2f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, s1 -; GFX950-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-NEXT: s_lshr_b32 s1, s1, 16 -; GFX950-NEXT: v_pk_max_f16 v1, s0, v1 -; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX950-NEXT: s_lshr_b32 s0, s0, 16 -; GFX950-NEXT: v_mov_b32_e32 v3, s1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX950-NEXT: v_lshl_or_b32 
v0, v1, 16, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND @@ -1061,20 +1041,8 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX950-LABEL: v_maximum_v3f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16: @@ -1156,12 +1124,19 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nnan: ; 
GFX10: ; %bb.0: @@ -1258,20 +1233,8 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX950-LABEL: v_maximum_v3f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nsz: @@ -1353,12 +1316,19 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v3f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v3f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v3f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1473,26 +1443,8 @@ 
define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX950-LABEL: v_maximum_v4f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16: @@ -1590,12 +1542,19 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nnan: ; GFX10: ; %bb.0: @@ -1710,26 +1669,8 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX950-LABEL: v_maximum_v4f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v3, v0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nsz: @@ -1827,12 +1768,19 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maximum_v4f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maximum_v4f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_max_f16 v0, v0, v2 
+; GFX900-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maximum_v4f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -2009,44 +1957,10 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX950-LABEL: v_maximum_v8f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX950-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v7, v2, v6 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX950-NEXT: v_perm_b32 v3, v3, v10, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v6, v1, v5 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX950-NEXT: v_perm_b32 v2, v2, v8, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v5, v0, v4 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX950-NEXT: v_perm_b32 v1, v1, v7, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: 
v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v6, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_maximum3_f16 v2, v2, v6, v6 +; GFX950-NEXT: v_pk_maximum3_f16 v3, v3, v7, v7 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f16: @@ -2414,80 +2328,14 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX950-LABEL: v_maximum_v16f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX950-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX950-NEXT: v_perm_b32 v7, v7, v18, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX950-NEXT: v_perm_b32 v6, v6, v16, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v13, v4, v12 -; GFX950-NEXT: s_nop 0 
-; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX950-NEXT: v_perm_b32 v5, v5, v15, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v12, v3, v11 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX950-NEXT: v_perm_b32 v4, v4, v14, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v11, v2, v10 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX950-NEXT: v_perm_b32 v3, v3, v13, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v12, v17, v11, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v10, v1, v9 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX950-NEXT: v_perm_b32 v2, v2, v12, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_max_f16 v9, v0, v8 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX950-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; 
GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v10, s0 +; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v8, v8 +; GFX950-NEXT: v_pk_maximum3_f16 v1, v1, v9, v9 +; GFX950-NEXT: v_pk_maximum3_f16 v2, v2, v10, v10 +; GFX950-NEXT: v_pk_maximum3_f16 v3, v3, v11, v11 +; GFX950-NEXT: v_pk_maximum3_f16 v4, v4, v12, v12 +; GFX950-NEXT: v_pk_maximum3_f16 v5, v5, v13, v13 +; GFX950-NEXT: v_pk_maximum3_f16 v6, v6, v14, v14 +; GFX950-NEXT: v_pk_maximum3_f16 v7, v7, v15, v15 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 329a85f91c251..f8c2c54af2783 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -462,17 +462,7 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX950-LABEL: v_minimum_v2f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16: @@ -525,11 +515,17 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: 
v_minimum_v2f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nnan: ; GFX10: ; %bb.0: @@ -590,17 +586,7 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX950-LABEL: v_minimum_v2f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nsz: @@ -653,11 +639,17 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v2f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v2f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v2f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v1, v1 +; GFX950-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -731,21 +723,9 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX950-LABEL: s_minimum_v2f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_mov_b32_e32 v0, s1 -; GFX950-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-NEXT: s_lshr_b32 s1, s1, 16 -; GFX950-NEXT: v_pk_min_f16 v1, s0, v1 -; GFX950-NEXT: v_mov_b32_e32 v2, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v0 -; GFX950-NEXT: s_lshr_b32 s0, s0, 16 -; GFX950-NEXT: v_mov_b32_e32 v3, s1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, s0, v3 -; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, s1, s1 ; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX950-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX950-NEXT: ;;#ASMSTART ; GFX950-NEXT: ; use v0 ; GFX950-NEXT: ;;#ASMEND @@ -847,20 +827,8 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX950-LABEL: v_minimum_v3f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 ; GFX950-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16: @@ -922,12 +890,19 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nnan: ; GFX10: ; %bb.0: @@ -997,20 +972,8 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX950-LABEL: v_minimum_v3f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nsz: @@ 
-1072,12 +1035,19 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v3f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v3f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v3f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1158,26 +1128,8 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX950-LABEL: v_minimum_v4f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; 
GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16: @@ -1250,12 +1202,19 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f16__nnan: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f16__nnan: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f16__nnan: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nnan: ; GFX10: ; %bb.0: @@ -1336,26 +1295,8 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX950-LABEL: v_minimum_v4f16__nsz: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX950-NEXT: v_mov_b32_e32 v5, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v3 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v3, v0, v2 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v2 -; GFX950-NEXT: v_perm_b32 v1, v1, v6, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc -; 
GFX950-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v4, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nsz: @@ -1428,12 +1369,19 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minimum_v4f16__nnan_nsz: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minimum_v4f16__nnan_nsz: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX900-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minimum_v4f16__nnan_nsz: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v2, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v3, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz: ; GFX10: ; %bb.0: @@ -1548,44 +1496,10 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX950-LABEL: v_minimum_v8f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX950-NEXT: v_mov_b32_e32 v9, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v7 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v10, v9, v8, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v7, v2, v6 -; GFX950-NEXT: s_nop 0 
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v6 -; GFX950-NEXT: v_perm_b32 v3, v3, v10, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v8, v9, v7, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v6, v1, v5 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v9, v7, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v5 -; GFX950-NEXT: v_perm_b32 v2, v2, v8, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v5, v0, v4 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v6, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v4 -; GFX950-NEXT: v_perm_b32 v1, v1, v7, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v9, v5, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v6, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v4, v4 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v5, v5 +; GFX950-NEXT: v_pk_minimum3_f16 v2, v2, v6, v6 +; GFX950-NEXT: v_pk_minimum3_f16 v3, v3, v7, v7 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f16: @@ -1833,80 +1747,14 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX950-LABEL: v_minimum_v16f16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX950-NEXT: v_mov_b32_e32 v17, 0x7e00 -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v7, v15 -; GFX950-NEXT: s_mov_b32 s0, 0x5040100 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v18, v17, v16, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 
v16, 16, v16 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v7, v15 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v7, v17, v16, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v6, v14 -; GFX950-NEXT: v_perm_b32 v7, v7, v18, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v6, v14 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v6, v17, v15, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v5, v13 -; GFX950-NEXT: v_perm_b32 v6, v6, v16, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v15, v17, v14, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v5, v13 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v13, v4, v12 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v5, v17, v14, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v4, v12 -; GFX950-NEXT: v_perm_b32 v5, v5, v15, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v14, v17, v13, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v4, v12 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v12, v3, v11 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v4, v17, v13, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v3, v11 -; GFX950-NEXT: v_perm_b32 v4, v4, v14, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v3, v11 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v11, v2, v10 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v2, v10 -; GFX950-NEXT: v_perm_b32 v3, v3, v13, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v12, v17, 
v11, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v2, v10 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v10, v1, v9 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v2, v17, v11, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v1, v9 -; GFX950-NEXT: v_perm_b32 v2, v2, v12, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v11, v17, v10, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v1, v9 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: v_pk_min_f16 v9, v0, v8 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v10, vcc -; GFX950-NEXT: v_cmp_o_f16_e32 vcc, v0, v8 -; GFX950-NEXT: v_perm_b32 v1, v1, v11, s0 -; GFX950-NEXT: s_nop 0 -; GFX950-NEXT: v_cndmask_b32_e32 v10, v17, v9, vcc -; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX950-NEXT: v_cmp_o_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:WORD_1 -; GFX950-NEXT: s_nop 1 -; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v9, vcc -; GFX950-NEXT: v_perm_b32 v0, v0, v10, s0 +; GFX950-NEXT: v_pk_minimum3_f16 v0, v0, v8, v8 +; GFX950-NEXT: v_pk_minimum3_f16 v1, v1, v9, v9 +; GFX950-NEXT: v_pk_minimum3_f16 v2, v2, v10, v10 +; GFX950-NEXT: v_pk_minimum3_f16 v3, v3, v11, v11 +; GFX950-NEXT: v_pk_minimum3_f16 v4, v4, v12, v12 +; GFX950-NEXT: v_pk_minimum3_f16 v5, v5, v13, v13 +; GFX950-NEXT: v_pk_minimum3_f16 v6, v6, v14, v14 +; GFX950-NEXT: v_pk_minimum3_f16 v7, v7, v15, v15 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f16: