Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 7fa1a5f

Browse files
author
Evandro Menezes
committed
[AArch64] Use the reciprocal estimation machinery
This patch adds support for estimating the square root and its reciprocal, as well as division and the reciprocal, using the DAG combiner's generic reciprocal estimation machinery. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@268539 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 0e4fcac commit 7fa1a5f

File tree

7 files changed

+339
-3
lines changed

7 files changed

+339
-3
lines changed

lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -970,6 +970,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
970970
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
971971
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
972972
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
973+
case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
974+
case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
973975
}
974976
return nullptr;
975977
}
@@ -4624,6 +4626,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
46244626
// AArch64 Optimization Hooks
46254627
//===----------------------------------------------------------------------===//
46264628

4629+
/// getEstimate - Return the appropriate estimate DAG for either the reciprocal
4630+
/// or the reciprocal square root.
4631+
static SDValue getEstimate(const AArch64Subtarget &ST,
4632+
const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
4633+
const SDValue &Operand, unsigned &ExtraSteps) {
4634+
if (!ST.hasNEON())
4635+
return SDValue();
4636+
4637+
EVT VT = Operand.getValueType();
4638+
4639+
std::string RecipOp;
4640+
RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt";
4641+
RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp;
4642+
RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f";
4643+
4644+
TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
4645+
if (!Recips.isEnabled(RecipOp))
4646+
return SDValue();
4647+
4648+
ExtraSteps = Recips.getRefinementSteps(RecipOp);
4649+
return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
4650+
}
4651+
4652+
/// Reciprocal estimate hook: forwards to getEstimate with the FRECPE opcode.
/// \p ExtraSteps and the returned value are whatever getEstimate produces.
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
                                                DAGCombinerInfo &DCI,
                                                unsigned &ExtraSteps) const {
  const unsigned EstimateOpcode = AArch64ISD::FRECPE;
  return getEstimate(*Subtarget, DCI, EstimateOpcode, Operand, ExtraSteps);
}
4656+
4657+
/// Reciprocal square root estimate hook: always sets \p UseOneConst to true,
/// then forwards to getEstimate with the FRSQRTE opcode. \p ExtraSteps and the
/// returned value are whatever getEstimate produces.
SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
                                                DAGCombinerInfo &DCI,
                                                unsigned &ExtraSteps,
                                                bool &UseOneConst) const {
  const unsigned EstimateOpcode = AArch64ISD::FRSQRTE;
  UseOneConst = true;
  return getEstimate(*Subtarget, DCI, EstimateOpcode, Operand, ExtraSteps);
}
4662+
46274663
//===----------------------------------------------------------------------===//
46284664
// AArch64 Inline Assembly Support
46294665
//===----------------------------------------------------------------------===//

lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,10 @@ enum NodeType : unsigned {
187187
SMULL,
188188
UMULL,
189189

190+
// Reciprocal estimates.
191+
FRECPE,
192+
FRSQRTE,
193+
190194
// NEON Load/Store with post-increment base updates
191195
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
192196
LD3post,
@@ -511,6 +515,11 @@ class AArch64TargetLowering : public TargetLowering {
511515

512516
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
513517
std::vector<SDNode *> *Created) const override;
518+
SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
519+
unsigned &RefinementSteps,
520+
bool &UseOneConstNR) const override;
521+
SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
522+
unsigned &RefinementSteps) const override;
514523
unsigned combineRepeatedFPDivisors() const override;
515524

516525
ConstraintType getConstraintType(StringRef Constraint) const override;

lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
283283
def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
284284
def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
285285

286+
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
287+
def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
288+
286289
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
287290
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
288291
def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -3401,6 +3404,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
34013404
def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
34023405
(FRECPEv1i64 FPR64:$Rn)>;
34033406

3407+
// Select the AArch64frecpe node onto the FRECPE instruction of the matching
// type, for f32, v2f32, v4f32, f64, v1f64 and v2f64. The f64 and v1f64 forms
// both use FRECPEv1i64.
def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
3408+
(FRECPEv1i32 FPR32:$Rn)>;
3409+
def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
3410+
(FRECPEv2f32 V64:$Rn)>;
3411+
def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
3412+
(FRECPEv4f32 FPR128:$Rn)>;
3413+
def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
3414+
(FRECPEv1i64 FPR64:$Rn)>;
3415+
def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
3416+
(FRECPEv1i64 FPR64:$Rn)>;
3417+
def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
3418+
(FRECPEv2f64 FPR128:$Rn)>;
3419+
34043420
def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
34053421
(FRECPXv1i32 FPR32:$Rn)>;
34063422
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3413,6 +3429,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
34133429
def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
34143430
(FRSQRTEv1i64 FPR64:$Rn)>;
34153431

3432+
// Select the AArch64frsqrte node onto the FRSQRTE instruction of the matching
// type, for f32, v2f32, v4f32, f64, v1f64 and v2f64. The f64 and v1f64 forms
// both use FRSQRTEv1i64.
def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
3433+
(FRSQRTEv1i32 FPR32:$Rn)>;
3434+
def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
3435+
(FRSQRTEv2f32 V64:$Rn)>;
3436+
def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
3437+
(FRSQRTEv4f32 FPR128:$Rn)>;
3438+
def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
3439+
(FRSQRTEv1i64 FPR64:$Rn)>;
3440+
def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
3441+
(FRSQRTEv1i64 FPR64:$Rn)>;
3442+
def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
3443+
(FRSQRTEv2f64 FPR128:$Rn)>;
3444+
34163445
// If an integer is about to be converted to a floating point value,
34173446
// just load it on the floating point unit.
34183447
// Here are the patterns for 8 and 16-bits to float.

lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,30 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) {
136136
return "E-m:e-i64:64-i128:128-n32:64-S128";
137137
}
138138

139+
// Helper function to set up the defaults for reciprocals.
140+
static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
141+
{
142+
// For the estimates, convergence is quadratic, so essentially the number of
143+
// digits is doubled after each iteration. ARMv8, the minimum architected
144+
// accuracy of the initial estimate is 2^-8. Therefore, the number of extra
145+
// steps to refine the result for float (23 mantissa bits) and for double
146+
// (52 mantissa bits) are 2 and 3, respectively.
147+
unsigned ExtraStepsF = 2,
148+
ExtraStepsD = ExtraStepsF + 1;
149+
// FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
150+
bool UseRsqrt = ST.isExynosM1();
151+
152+
TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
153+
TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
154+
TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF);
155+
TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD);
156+
157+
TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF);
158+
TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD);
159+
TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF);
160+
TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD);
161+
}
162+
139163
/// TargetMachine ctor - Create an AArch64 architecture model.
140164
///
141165
AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
@@ -149,7 +173,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
149173
: LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS,
150174
Options, RM, CM, OL),
151175
TLOF(createTLOF(getTargetTriple())),
152-
isLittle(LittleEndian) {
176+
Subtarget(TT, CPU, FS, *this, LittleEndian) {
177+
initReciprocals(*this, Subtarget);
153178
initAsmInfo();
154179
}
155180

@@ -189,7 +214,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
189214
// function that reside in TargetOptions.
190215
resetTargetOptions(F);
191216
I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
192-
isLittle);
217+
Subtarget.isLittleEndian());
193218
#ifndef LLVM_BUILD_GLOBAL_ISEL
194219
GISelAccessor *GISel = new GISelAccessor();
195220
#else

lib/Target/AArch64/AArch64TargetMachine.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class AArch64TargetMachine : public LLVMTargetMachine {
4646
}
4747

4848
private:
49-
bool isLittle;
49+
AArch64Subtarget Subtarget;
5050
};
5151

5252
// AArch64leTargetMachine - AArch64 little endian target machine.
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!div,!vec-div | FileCheck %s --check-prefix=FAULT
2+
; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=div,vec-div | FileCheck %s
3+
4+
; Scalar float reciprocal (1.0 / x) under fast-math. The first RUN line expects
; an fdiv; the second expects an frecpe in place of the division.
define float @frecp(float %x) #0 {
5+
%div = fdiv fast float 1.0, %x
6+
ret float %div
7+
8+
; FAULT-LABEL: frecp:
9+
; FAULT-NEXT: BB#0
10+
; FAULT-NEXT: fmov
11+
; FAULT-NEXT: fdiv
12+
13+
; CHECK-LABEL: frecp:
14+
; CHECK-NEXT: BB#0
15+
; CHECK-NEXT: frecpe
16+
; CHECK-NEXT: fmov
17+
}
18+
19+
; <2 x float> reciprocal (splat 1.0 / x) under fast-math. The first RUN line
; expects an fdiv; the second expects an frecpe in place of the division.
define <2 x float> @f2recp(<2 x float> %x) #0 {
20+
%div = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
21+
ret <2 x float> %div
22+
23+
; FAULT-LABEL: f2recp:
24+
; FAULT-NEXT: BB#0
25+
; FAULT-NEXT: fmov
26+
; FAULT-NEXT: fdiv
27+
28+
; CHECK-LABEL: f2recp:
29+
; CHECK-NEXT: BB#0
30+
; CHECK-NEXT: fmov
31+
; CHECK-NEXT: frecpe
32+
}
33+
34+
; <4 x float> reciprocal (splat 1.0 / x) under fast-math. The first RUN line
; expects an fdiv; the second expects an frecpe in place of the division.
define <4 x float> @f4recp(<4 x float> %x) #0 {
35+
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
36+
ret <4 x float> %div
37+
38+
; FAULT-LABEL: f4recp:
39+
; FAULT-NEXT: BB#0
40+
; FAULT-NEXT: fmov
41+
; FAULT-NEXT: fdiv
42+
43+
; CHECK-LABEL: f4recp:
44+
; CHECK-NEXT: BB#0
45+
; CHECK-NEXT: fmov
46+
; CHECK-NEXT: frecpe
47+
}
48+
49+
; Scalar double reciprocal (1.0 / x) under fast-math. The first RUN line expects
; an fdiv; the second expects an frecpe in place of the division.
define double @drecp(double %x) #0 {
50+
%div = fdiv fast double 1.0, %x
51+
ret double %div
52+
53+
; FAULT-LABEL: drecp:
54+
; FAULT-NEXT: BB#0
55+
; FAULT-NEXT: fmov
56+
; FAULT-NEXT: fdiv
57+
58+
; CHECK-LABEL: drecp:
59+
; CHECK-NEXT: BB#0
60+
; CHECK-NEXT: frecpe
61+
; CHECK-NEXT: fmov
62+
}
63+
64+
; <2 x double> reciprocal (splat 1.0 / x) under fast-math. The first RUN line
; expects an fdiv; the second expects an frecpe in place of the division.
define <2 x double> @d2recp(<2 x double> %x) #0 {
65+
%div = fdiv fast <2 x double> <double 1.0, double 1.0>, %x
66+
ret <2 x double> %div
67+
68+
; FAULT-LABEL: d2recp:
69+
; FAULT-NEXT: BB#0
70+
; FAULT-NEXT: fmov
71+
; FAULT-NEXT: fdiv
72+
73+
; CHECK-LABEL: d2recp:
74+
; CHECK-NEXT: BB#0
75+
; CHECK-NEXT: fmov
76+
; CHECK-NEXT: frecpe
77+
}
78+
79+
attributes #0 = { nounwind "unsafe-fp-math"="true" }

0 commit comments

Comments
 (0)