Skip to content

Commit 91a7aa4

Browse files
tsymalla-AMD
authored and committed
[AMDGPU] Improve abs modifier usage
If a call to the llvm.fabs intrinsic has users in another reachable BB, SelectionDAG will not apply the abs modifier to these users and instead generate a v_and ..., 0x7fffffff instruction. For fneg instructions, the issue is similar. This patch implements `AMDGPUIselLowering::shouldSinkOperands`, which allows CodegenPrepare to call `tryToSinkFreeOperands`. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D150347
1 parent c10138a commit 91a7aa4

File tree

5 files changed

+121
-80
lines changed

5 files changed

+121
-80
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5133,3 +5133,22 @@ bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
51335133
return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
51345134
Ty2 == LLT::scalar(32);
51355135
}
5136+
5137+
/// Whether it is profitable to sink the operands of an
5138+
/// Instruction I to the basic block of I.
5139+
/// This helps using several modifiers (like abs and neg) more often.
5140+
bool AMDGPUTargetLowering::shouldSinkOperands(
5141+
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
5142+
using namespace PatternMatch;
5143+
5144+
for (auto &Op : I->operands()) {
5145+
// Ensure we are not already sinking this operand.
5146+
if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
5147+
continue;
5148+
5149+
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
5150+
Ops.push_back(&Op);
5151+
}
5152+
5153+
return !Ops.empty();
5154+
}

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,9 @@ class AMDGPUTargetLowering : public TargetLowering {
353353

354354
bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
355355
LLT Ty2) const override;
356+
357+
bool shouldSinkOperands(Instruction *I,
358+
SmallVectorImpl<Use *> &Ops) const override;
356359
};
357360

358361
namespace AMDGPUISD {

llvm/test/CodeGen/AMDGPU/andorbitset.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) {
5353
; Make sure there's no verifier error with an undef source.
5454
; SI-LABEL: {{^}}bitset_verifier_error:
5555
; SI-NOT: %bb.1:
56-
; SI: s_bitset0_b32 s{{[0-9]+}}, 31
56+
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
5757
define void @bitset_verifier_error() local_unnamed_addr #0 {
5858
bb:
5959
%i = call float @llvm.fabs.f32(float undef) #0

llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll

Lines changed: 47 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -13,35 +13,28 @@ define void @f(i32 %arg, ptr %ptr) {
1313
; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1414
; ISA-NEXT: v_mov_b32_e32 v7, 0
1515
; ISA-NEXT: s_waitcnt lgkmcnt(0)
16-
; ISA-NEXT: s_cmp_lg_u32 s4, 0
17-
; ISA-NEXT: s_cselect_b32 s6, -1, 0
18-
; ISA-NEXT: s_and_b32 s6, s6, exec_lo
19-
; ISA-NEXT: s_cselect_b32 s6, s5, 0
16+
; ISA-NEXT: s_lshr_b32 s6, s5, 1
2017
; ISA-NEXT: s_lshr_b32 s7, 1, s4
2118
; ISA-NEXT: s_cmp_lg_u32 s4, 0
22-
; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6
23-
; ISA-NEXT: s_cselect_b32 s8, -1, 0
24-
; ISA-NEXT: s_and_b32 s8, s8, exec_lo
25-
; ISA-NEXT: s_cselect_b32 s7, s7, 0
26-
; ISA-NEXT: s_lshr_b32 s5, s5, 1
27-
; ISA-NEXT: s_cmp_lg_u32 s4, 0
28-
; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7
2919
; ISA-NEXT: s_cselect_b32 s4, -1, 0
30-
; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
20+
; ISA-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
3121
; ISA-NEXT: s_and_b32 s4, s4, exec_lo
32-
; ISA-NEXT: s_cselect_b32 s4, s5, 0
33-
; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4
22+
; ISA-NEXT: s_cselect_b32 s4, s6, 0
23+
; ISA-NEXT: s_cselect_b32 s6, s7, 0
24+
; ISA-NEXT: s_cselect_b32 s5, s5, 0
25+
; ISA-NEXT: v_cvt_f32_i32_e32 v3, s4
26+
; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6
27+
; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5
3428
; ISA-NEXT: s_mov_b32 s4, 0
35-
; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
3629
; ISA-NEXT: .LBB0_1: ; %bb14
3730
; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
3831
; ISA-NEXT: v_mov_b32_e32 v6, v7
3932
; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
4033
; ISA-NEXT: s_or_b32 s4, s5, s4
41-
; ISA-NEXT: v_add_f32_e32 v7, v6, v3
42-
; ISA-NEXT: v_add_f32_e32 v7, v7, v5
34+
; ISA-NEXT: v_add_f32_e32 v7, v6, v0
35+
; ISA-NEXT: v_add_f32_e64 v7, v7, |v3|
4336
; ISA-NEXT: v_add_f32_e32 v7, v7, v4
44-
; ISA-NEXT: v_add_f32_e32 v7, v7, v0
37+
; ISA-NEXT: v_add_f32_e32 v7, v7, v5
4538
; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
4639
; ISA-NEXT: s_cbranch_execnz .LBB0_1
4740
; ISA-NEXT: ; %bb.2: ; %bb21
@@ -58,64 +51,58 @@ define void @f(i32 %arg, ptr %ptr) {
5851
; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
5952
; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
6053
; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
54+
; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
55+
; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
6156
; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
6257
; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)
63-
; MIR-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
64-
; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
65-
; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
66-
; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
67-
; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
68-
; MIR-NEXT: $scc = COPY [[COPY5]]
69-
; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY3]], [[S_MOV_B32_]], implicit $scc
70-
; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
71-
; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_1]], [[COPY4]], implicit-def dead $scc
72-
; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
58+
; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
59+
; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
60+
; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
61+
; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
62+
; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc
63+
; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
64+
; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc
7365
; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
7466
; MIR-NEXT: $scc = COPY [[COPY6]]
75-
; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_]], implicit $scc
76-
; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc
77-
; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
78-
; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
79-
; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
80-
; MIR-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
81-
; MIR-NEXT: $scc = COPY [[COPY7]]
82-
; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_]], implicit $scc
83-
; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
84-
; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
85-
; MIR-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
86-
; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY9]], killed [[S_MOV_B32_2]], implicit-def dead $scc
87-
; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
88-
; MIR-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
89-
; MIR-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
90-
; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY10]], [[COPY7]], implicit $exec
91-
; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
67+
; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc
68+
; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
69+
; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]]
70+
; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
71+
; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
72+
; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
73+
; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec
74+
; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
75+
; MIR-NEXT: $scc = COPY [[COPY6]]
76+
; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc
9277
; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
93-
; MIR-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
94-
; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
95-
; MIR-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
96-
; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_1]], implicit $exec
97-
; MIR-NEXT: [[COPY14:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
78+
; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
79+
; MIR-NEXT: $scc = COPY [[COPY6]]
80+
; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc
81+
; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
82+
; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
83+
; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
84+
; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
9885
; MIR-NEXT: {{ $}}
9986
; MIR-NEXT: bb.1.bb14:
10087
; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
10188
; MIR-NEXT: {{ $}}
102-
; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1
103-
; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_4]], %bb.0, %8, %bb.1
104-
; MIR-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY14]]
105-
; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY15]], [[PHI]], implicit-def dead $scc
106-
; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
107-
; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[S_AND_B32_]], 0, 0, implicit $mode, implicit $exec
108-
; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY12]], 0, 0, implicit $mode, implicit $exec
109-
; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY13]], 0, 0, implicit $mode, implicit $exec
110-
; MIR-NEXT: [[COPY16:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
89+
; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1
90+
; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1
91+
; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
92+
; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
93+
; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
94+
; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec
95+
; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
96+
; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
97+
; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
11198
; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
11299
; MIR-NEXT: S_BRANCH %bb.2
113100
; MIR-NEXT: {{ $}}
114101
; MIR-NEXT: bb.2.bb21:
115102
; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
116103
; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
117104
; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
118-
; MIR-NEXT: FLAT_STORE_DWORD [[COPY8]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
105+
; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
119106
; MIR-NEXT: SI_RETURN
120107
bb:
121108
%i = load <2 x i32>, ptr addrspace(4) null, align 4294967296

llvm/test/CodeGen/AMDGPU/fold-fabs.ll

Lines changed: 51 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2-
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 -start-before=amdgpu-late-codegenprepare < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 - < %s | FileCheck -check-prefix=GFX10 %s
33

44
define float @fold_abs_in_branch(float %arg1, float %arg2) {
55
; GFX10-LABEL: fold_abs_in_branch:
@@ -10,10 +10,9 @@ define float @fold_abs_in_branch(float %arg1, float %arg2) {
1010
; GFX10-NEXT: s_mov_b32 s4, exec_lo
1111
; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
1212
; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
13-
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
1413
; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
1514
; GFX10-NEXT: ; %bb.1: ; %if
16-
; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
15+
; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
1716
; GFX10-NEXT: ; %bb.2: ; %exit
1817
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
1918
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -41,16 +40,15 @@ define float @fold_abs_in_branch_multiple_users(float %arg1, float %arg2) {
4140
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4241
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
4342
; GFX10-NEXT: s_mov_b32 s4, exec_lo
44-
; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
45-
; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
46-
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
47-
; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
43+
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
44+
; GFX10-NEXT: v_add_f32_e64 v1, |v0|, |v0|
45+
; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1
4846
; GFX10-NEXT: ; %bb.1: ; %if
49-
; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
47+
; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, |v0|
5048
; GFX10-NEXT: ; %bb.2: ; %exit
5149
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
52-
; GFX10-NEXT: v_add_f32_e32 v1, 2.0, v1
53-
; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
50+
; GFX10-NEXT: v_add_f32_e64 v0, |v0|, 2.0
51+
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
5452
; GFX10-NEXT: s_setpc_b64 s[30:31]
5553
entry:
5654
%0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -77,11 +75,10 @@ define float @fold_abs_in_branch_undef(float %arg1, float %arg2) {
7775
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7876
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
7977
; GFX10-NEXT: v_add_f32_e64 v0, |s4|, |s4|
80-
; GFX10-NEXT: s_bitset0_b32 s4, 31
8178
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v0
8279
; GFX10-NEXT: s_cbranch_vccnz .LBB2_2
8380
; GFX10-NEXT: ; %bb.1: ; %if
84-
; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, s4
81+
; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |s4|
8582
; GFX10-NEXT: .LBB2_2: ; %exit
8683
; GFX10-NEXT: s_setpc_b64 s[30:31]
8784
entry:
@@ -107,11 +104,10 @@ define float @fold_abs_in_branch_poison(float %arg1, float %arg2) {
107104
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108105
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
109106
; GFX10-NEXT: v_add_f32_e64 v0, |s4|, |s4|
110-
; GFX10-NEXT: s_bitset0_b32 s4, 31
111107
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v0
112108
; GFX10-NEXT: s_cbranch_vccnz .LBB3_2
113109
; GFX10-NEXT: ; %bb.1: ; %if
114-
; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, s4
110+
; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |s4|
115111
; GFX10-NEXT: .LBB3_2: ; %exit
116112
; GFX10-NEXT: s_setpc_b64 s[30:31]
117113
entry:
@@ -140,7 +136,6 @@ define float @fold_abs_in_branch_fabs(float %arg1, float %arg2) {
140136
; GFX10-NEXT: s_mov_b32 s4, exec_lo
141137
; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
142138
; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
143-
; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
144139
; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
145140
; GFX10-NEXT: ; %bb.1: ; %if
146141
; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
@@ -175,14 +170,16 @@ define float @fold_abs_in_branch_phi(float %arg1, float %arg2) {
175170
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
176171
; GFX10-NEXT: v_add_f32_e64 v0, |v0|, |v0|
177172
; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
178-
; GFX10-NEXT: s_cbranch_execz .LBB5_2
179-
; GFX10-NEXT: .LBB5_1: ; %l
173+
; GFX10-NEXT: s_cbranch_execz .LBB5_3
174+
; GFX10-NEXT: ; %bb.1: ; %header.preheader
175+
; GFX10-NEXT: ; implicit-def: $vgpr0
176+
; GFX10-NEXT: .LBB5_2: ; %header
180177
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
181178
; GFX10-NEXT: v_mul_f32_e32 v0, 0x40400000, v0
182179
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, -1.0, v0
183180
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
184-
; GFX10-NEXT: s_cbranch_vccnz .LBB5_1
185-
; GFX10-NEXT: .LBB5_2: ; %exit
181+
; GFX10-NEXT: s_cbranch_vccnz .LBB5_2
182+
; GFX10-NEXT: .LBB5_3: ; %Flow1
186183
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
187184
; GFX10-NEXT: s_setpc_b64 s[30:31]
188185
entry:
@@ -209,5 +206,40 @@ exit:
209206
ret float %ret
210207
}
211208

209+
define float @fold_neg_in_branch(float %arg1, float %arg2) {
210+
; GFX10-LABEL: fold_neg_in_branch:
211+
; GFX10: ; %bb.0: ; %entry
212+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
213+
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
214+
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
215+
; GFX10-NEXT: s_mov_b32 s4, exec_lo
216+
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
217+
; GFX10-NEXT: v_mov_b32_e32 v1, v0
218+
; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
219+
; GFX10-NEXT: ; %bb.1: ; %if
220+
; GFX10-NEXT: v_rcp_f32_e64 v1, -v0
221+
; GFX10-NEXT: v_mul_f32_e64 v1, |v0|, v1
222+
; GFX10-NEXT: ; %bb.2: ; %exit
223+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
224+
; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v1
225+
; GFX10-NEXT: s_setpc_b64 s[30:31]
226+
entry:
227+
%0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
228+
%1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2
229+
%2 = fneg reassoc nnan nsz arcp contract afn float %1
230+
%3 = fcmp ule float %1, 1.000000e+00
231+
br i1 %3, label %if, label %exit
232+
233+
if:
234+
%if.fabs = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %1)
235+
%if.3 = fdiv reassoc nnan nsz arcp contract afn float %if.fabs, %2
236+
br label %exit
237+
238+
exit:
239+
%ret = phi float [ %1, %entry ], [ %if.3, %if ]
240+
%ret.2 = fmul reassoc nnan nsz arcp contract afn float %2, %ret
241+
ret float %ret.2
242+
}
243+
212244
declare float @llvm.fabs.f32(float)
213245
declare float @llvm.fmuladd.f32(float, float, float) #0

0 commit comments

Comments
 (0)