Commit 9effe38

Author: Jessica Paquette

[AArch64][GlobalISel] Fold G_XOR into TB(N)Z bit calculation

This ports the existing case for G_XOR from `getTestBitOperand` in
AArch64ISelLowering into GlobalISel.

The idea is to flip between TBZ and TBNZ while walking through G_XORs.

Let's say we have

```
tbz (xor x, c), b
```

Suppose the `b`-th bit in `c` is 1. Then

- If the `b`-th bit in `x` is 1, the `b`-th bit in `(xor x, c)` is 0.
- If the `b`-th bit in `x` is 0, the `b`-th bit in `(xor x, c)` is 1.

So

```
tbz (xor x, c), b == tbnz x, b
```

Now suppose the `b`-th bit in `c` is 0. Then

- If the `b`-th bit in `x` is 1, the `b`-th bit in `(xor x, c)` is 1.
- If the `b`-th bit in `x` is 0, the `b`-th bit in `(xor x, c)` is 0.

So

```
tbz (xor x, c), b == tbz x, b
```

Differential Revision: https://reviews.llvm.org/D73929
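
As a quick sanity check of the identity above, here is a small standalone C++ snippet (not part of this patch; the `tbz`/`tbnz` helpers below are stand-ins for the branch conditions, not LLVM APIs) that exhaustively verifies both cases over 8-bit values:

```
#include <cassert>
#include <cstdint>

// Standalone illustration only; these helpers model the branch conditions.
// tbz "branches" when bit b of v is 0; tbnz "branches" when it is 1.
static bool tbz(uint64_t v, unsigned b) { return ((v >> b) & 1) == 0; }
static bool tbnz(uint64_t v, unsigned b) { return ((v >> b) & 1) == 1; }

int main() {
  for (uint64_t x = 0; x < 256; ++x) {
    for (uint64_t c = 0; c < 256; ++c) {
      for (unsigned b = 0; b < 8; ++b) {
        if ((c >> b) & 1) {
          // Bit b is set in c: the XOR flips bit b, so TBZ becomes TBNZ.
          assert(tbz(x ^ c, b) == tbnz(x, b));
        } else {
          // Bit b is clear in c: the XOR leaves bit b alone, so TBZ stays TBZ.
          assert(tbz(x ^ c, b) == tbz(x, b));
        }
      }
    }
  }
  return 0;
}
```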
Parent: 2252cac

File tree

2 files changed (+208, -5 lines)


llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp

Lines changed: 20 additions & 5 deletions
@@ -991,7 +991,7 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
 }
 
 /// Return a register which can be used as a bit to test in a TB(N)Z.
-static Register getTestBitReg(Register Reg, uint64_t &Bit,
+static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                               MachineRegisterInfo &MRI) {
   assert(Reg.isValid() && "Expected valid register!");
   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
@@ -1018,7 +1018,8 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit,
     switch (Opc) {
     default:
       break;
-    case TargetOpcode::G_AND: {
+    case TargetOpcode::G_AND:
+    case TargetOpcode::G_XOR: {
       TestReg = MI->getOperand(1).getReg();
       Register ConstantReg = MI->getOperand(2).getReg();
       auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
@@ -1066,6 +1067,19 @@ static Register getTestBitReg(Register Reg, uint64_t &Bit,
         Bit = Bit - *C;
       }
       break;
+    case TargetOpcode::G_XOR:
+      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
+      // appropriate.
+      //
+      // e.g. If x' = xor x, c, and the b-th bit is set in c then
+      //
+      // tbz x', b -> tbnz x, b
+      //
+      // Because x' only has the b-th bit set if x does not.
+      if ((*C >> Bit) & 1)
+        Invert = !Invert;
+      NextReg = TestReg;
+      break;
     }
 
     // Check if we found anything worth folding.
@@ -1124,20 +1138,21 @@ bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
   // Try to optimize the TB(N)Z.
   uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value));
   Register TestReg = AndInst->getOperand(1).getReg();
-  TestReg = getTestBitReg(TestReg, Bit, MRI);
+  bool Invert = Pred == CmpInst::Predicate::ICMP_NE;
+  TestReg = getTestBitReg(TestReg, Bit, Invert, MRI);
 
   // Choose the correct TB(N)Z opcode to use.
   unsigned Opc = 0;
   if (Bit < 32) {
     // When the bit is less than 32, we have to use a TBZW even if we're on a 64
     // bit register.
-    Opc = Pred == CmpInst::Predicate::ICMP_EQ ? AArch64::TBZW : AArch64::TBNZW;
+    Opc = Invert ? AArch64::TBNZW : AArch64::TBZW;
     TestReg = narrowExtendRegIfNeeded(TestReg, MIB);
   } else {
     // Same idea for when Bit >= 32. We don't have to narrow here, because if
     // Bit > 32, then the G_CONSTANT must be outside the range of valid 32-bit
     // values. So, we must have a s64.
-    Opc = Pred == CmpInst::Predicate::ICMP_EQ ? AArch64::TBZX : AArch64::TBNZX;
+    Opc = Invert ? AArch64::TBNZX : AArch64::TBZX;
   }
 
   // Construct the branch.
Lines changed: 188 additions & 0 deletions
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
...
---
name: flip_eq
alignment: 4
legalized: true
regBankSelected: true
body: |
  ; CHECK-LABEL: name: flip_eq
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: %copy:gpr64all = COPY $x0
  ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32
  ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
  ; CHECK: TBNZW [[COPY1]], 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0
    %copy:gpr(s64) = COPY $x0

    ; Check bit 3.
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0

    ; 8 has the third bit set.
    %fold_cst:gpr(s64) = G_CONSTANT i64 8

    ; This only has the third bit set if %copy does not. So, to walk through
    ; this, we want to use a TBNZW on %copy.
    %fold_me:gpr(s64) = G_XOR %copy, %fold_cst

    %and:gpr(s64) = G_AND %fold_me, %bit
    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: flip_ne
alignment: 4
legalized: true
regBankSelected: true
body: |
  ; CHECK-LABEL: name: flip_ne
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: %copy:gpr64all = COPY $x0
  ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32
  ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
  ; CHECK: TBZW [[COPY1]], 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0

    ; Same as eq case, but we should get a TBZW instead.

    %copy:gpr(s64) = COPY $x0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %fold_cst:gpr(s64) = G_CONSTANT i64 8
    %fold_me:gpr(s64) = G_XOR %copy, %fold_cst
    %and:gpr(s64) = G_AND %fold_me, %bit
    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: dont_flip_eq
alignment: 4
legalized: true
regBankSelected: true
body: |
  ; CHECK-LABEL: name: dont_flip_eq
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: %copy:gpr64all = COPY $x0
  ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32
  ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
  ; CHECK: TBZW [[COPY1]], 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0
    %copy:gpr(s64) = COPY $x0

    ; Check bit 3.
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0

    ; 7 does not have the third bit set.
    %fold_cst:gpr(s64) = G_CONSTANT i64 7

    ; This only has the third bit set if %copy does. So, to walk through this,
    ; we should have a TBZW on %copy.
    %fold_me:gpr(s64) = G_XOR %fold_cst, %copy

    %and:gpr(s64) = G_AND %fold_me, %bit
    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: dont_flip_ne
alignment: 4
legalized: true
regBankSelected: true
body: |
  ; CHECK-LABEL: name: dont_flip_ne
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: %copy:gpr64all = COPY $x0
  ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32
  ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
  ; CHECK: TBNZW [[COPY1]], 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0

    ; Same as eq case, but we should get a TBNZW instead.

    %copy:gpr(s64) = COPY $x0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %fold_cst:gpr(s64) = G_CONSTANT i64 7
    %fold_me:gpr(s64) = G_XOR %fold_cst, %copy
    %and:gpr(s64) = G_AND %fold_me, %bit
    %cmp:gpr(s32) = G_ICMP intpred(ne), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
...
---
name: xor_chain
alignment: 4
legalized: true
regBankSelected: true
body: |
  ; CHECK-LABEL: name: xor_chain
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.0(0x40000000), %bb.1(0x40000000)
  ; CHECK: %copy:gpr64all = COPY $x0
  ; CHECK: [[COPY:%[0-9]+]]:gpr32all = COPY %copy.sub_32
  ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
  ; CHECK: TBZW [[COPY1]], 3, %bb.1
  ; CHECK: B %bb.0
  ; CHECK: bb.1:
  ; CHECK: RET_ReallyLR
  bb.0:
    successors: %bb.0, %bb.1
    liveins: $x0
    %copy:gpr(s64) = COPY $x0
    %bit:gpr(s64) = G_CONSTANT i64 8
    %zero:gpr(s64) = G_CONSTANT i64 0
    %fold_cst:gpr(s64) = G_CONSTANT i64 8

    ; The G_XORs cancel each other out, so we should get a TBZW.
    %xor1:gpr(s64) = G_XOR %copy, %fold_cst
    %xor2:gpr(s64) = G_XOR %xor1, %fold_cst

    %and:gpr(s64) = G_AND %xor2, %bit
    %cmp:gpr(s32) = G_ICMP intpred(eq), %and(s64), %zero
    %cmp_trunc:gpr(s1) = G_TRUNC %cmp(s32)
    G_BRCOND %cmp_trunc(s1), %bb.1
    G_BR %bb.0
  bb.1:
    RET_ReallyLR
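
Putting the call-site change and the `getTestBitReg` walk together, the effect on opcode selection can be modeled in isolation. The sketch below is a simplified, standalone approximation for illustration only (the `selectTbOpcode` helper and `XorConstants` alias are invented here, not LLVM APIs): it seeds `Invert` from the compare predicate the way `tryOptAndIntoCompareBranch` now does, toggles it once per XOR constant whose tested bit is set, and prints which TB(N)Z flavor would be chosen, matching the `flip_eq`, `dont_flip_eq`, and `xor_chain` tests above.

```
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone model for illustration only; not the LLVM selector code.
// Each entry stands for one "xor x, c" layered on top of the tested value.
using XorConstants = std::vector<uint64_t>;

// ICMP_NE seeds Invert = true (branch when the tested bit is 1); every XOR
// constant with the tested bit set flips the choice once more, mirroring the
// G_XOR case added to getTestBitReg.
static const char *selectTbOpcode(bool IsICmpNe, uint64_t Bit,
                                  const XorConstants &Chain) {
  bool Invert = IsICmpNe;
  for (uint64_t C : Chain)
    if ((C >> Bit) & 1)
      Invert = !Invert;
  return Invert ? "TBNZ" : "TBZ";
}

int main() {
  // flip_eq: eq compare, one XOR with bit 3 set -> TBNZ.
  std::cout << selectTbOpcode(/*IsICmpNe=*/false, /*Bit=*/3, {8}) << "\n";
  // dont_flip_eq: eq compare, XOR constant 7 leaves bit 3 alone -> TBZ.
  std::cout << selectTbOpcode(/*IsICmpNe=*/false, /*Bit=*/3, {7}) << "\n";
  // xor_chain: the two XORs with 8 cancel each other out -> TBZ.
  std::cout << selectTbOpcode(/*IsICmpNe=*/false, /*Bit=*/3, {8, 8}) << "\n";
  return 0;
}
```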
