Skip to content

Commit 17743ba

Browse files
committed
[AArch64] Use 0-cycle copy for FPR32, FPR16, FPR8
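This change emits zero-cycle copy instructions for the FPR32, FPR16, and FPR8 register classes on targets that support them: when the subtarget has zero-cycle FPR64 register moves but not zero-cycle FPR32 register moves, the copy is emitted as an FMOVDr on the matching D super-registers; otherwise the existing FMOVSr copy is used. The implementation is similar to what has been done for GPR32. It also adds a regression test with two variants, one for FPR32 and one for FPR16.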
1 parent bb47589 commit 17743ba

File tree

2 files changed

+165
-14
lines changed

2 files changed

+165
-14
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5302,30 +5302,78 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
53025302

53035303
if (AArch64::FPR32RegClass.contains(DestReg) &&
53045304
AArch64::FPR32RegClass.contains(SrcReg)) {
5305-
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5306-
.addReg(SrcReg, getKillRegState(KillSrc));
5305+
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5306+
!Subtarget.hasZeroCycleRegMoveFPR32()) {
5307+
const TargetRegisterInfo *TRI = &getRegisterInfo();
5308+
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
5309+
&AArch64::FPR64RegClass);
5310+
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
5311+
&AArch64::FPR64RegClass);
5312+
// This instruction is reading and writing D registers. This may upset
5313+
// the register scavenger and machine verifier, so we need to indicate
5314+
// that we are reading an undefined value from SrcRegD, but a proper
5315+
// value from SrcReg.
5316+
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5317+
.addReg(SrcRegD, RegState::Undef)
5318+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5319+
} else {
5320+
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5321+
.addReg(SrcReg, getKillRegState(KillSrc));
5322+
}
53075323
return;
53085324
}
53095325

53105326
if (AArch64::FPR16RegClass.contains(DestReg) &&
53115327
AArch64::FPR16RegClass.contains(SrcReg)) {
5312-
DestReg =
5313-
RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
5314-
SrcReg =
5315-
RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
5316-
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5317-
.addReg(SrcReg, getKillRegState(KillSrc));
5328+
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5329+
!Subtarget.hasZeroCycleRegMoveFPR32()) {
5330+
const TargetRegisterInfo *TRI = &getRegisterInfo();
5331+
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
5332+
&AArch64::FPR64RegClass);
5333+
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
5334+
&AArch64::FPR64RegClass);
5335+
// This instruction is reading and writing D registers. This may upset
5336+
// the register scavenger and machine verifier, so we need to indicate
5337+
// that we are reading an undefined value from SrcRegD, but a proper
5338+
// value from SrcReg.
5339+
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5340+
.addReg(SrcRegD, RegState::Undef)
5341+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5342+
} else {
5343+
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
5344+
&AArch64::FPR32RegClass);
5345+
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
5346+
&AArch64::FPR32RegClass);
5347+
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5348+
.addReg(SrcReg, getKillRegState(KillSrc));
5349+
}
53185350
return;
53195351
}
53205352

53215353
if (AArch64::FPR8RegClass.contains(DestReg) &&
53225354
AArch64::FPR8RegClass.contains(SrcReg)) {
5323-
DestReg =
5324-
RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
5325-
SrcReg =
5326-
RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
5327-
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5328-
.addReg(SrcReg, getKillRegState(KillSrc));
5355+
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
5356+
!Subtarget.hasZeroCycleRegMoveFPR32()) {
5357+
const TargetRegisterInfo *TRI = &getRegisterInfo();
5358+
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
5359+
&AArch64::FPR64RegClass);
5360+
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
5361+
&AArch64::FPR64RegClass);
5362+
// This instruction is reading and writing D registers. This may upset
5363+
// the register scavenger and machine verifier, so we need to indicate
5364+
// that we are reading an undefined value from SrcRegD, but a proper
5365+
// value from SrcReg.
5366+
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
5367+
.addReg(SrcRegD, RegState::Undef)
5368+
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
5369+
} else {
5370+
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
5371+
&AArch64::FPR32RegClass);
5372+
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
5373+
&AArch64::FPR32RegClass);
5374+
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
5375+
.addReg(SrcReg, getKillRegState(KillSrc));
5376+
}
53295377
return;
53305378
}
53315379

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines
2+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines
3+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
4+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
5+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines
6+
7+
define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
8+
entry:
9+
; CHECK-LABEL: t:
10+
; NOTCPU-LINUX: fmov s0, s2
11+
; NOTCPU-LINUX: fmov s1, s3
12+
; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
13+
; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
14+
; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
15+
; NOTCPU-LINUX: fmov s0, [[REG1]]
16+
; NOTCPU-LINUX: fmov s1, [[REG2]]
17+
18+
; NOTCPU-APPLE: fmov s0, s2
19+
; NOTCPU-APPLE: fmov s1, s3
20+
; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
21+
; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
22+
; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
23+
; NOTCPU-APPLE: fmov s0, [[REG1]]
24+
; NOTCPU-APPLE: fmov s1, [[REG2]]
25+
26+
; CPU: fmov [[REG2:d[0-9]+]], d3
27+
; CPU: fmov [[REG1:d[0-9]+]], d2
28+
; CPU: fmov d0, d2
29+
; CPU: fmov d1, d3
30+
; CPU-NEXT: bl {{_?foo_float}}
31+
; CPU: fmov d0, [[REG1]]
32+
; CPU: fmov d1, [[REG2]]
33+
34+
; NOTATTR: fmov [[REG2:s[0-9]+]], s3
35+
; NOTATTR: fmov [[REG1:s[0-9]+]], s2
36+
; NOTATTR: fmov s0, s2
37+
; NOTATTR: fmov s1, s3
38+
; NOTATTR-NEXT: bl {{_?foo_float}}
39+
; NOTATTR: fmov s0, [[REG1]]
40+
; NOTATTR: fmov s1, [[REG2]]
41+
42+
; ATTR: fmov d0, d2
43+
; ATTR: fmov d1, d3
44+
; ATTR: fmov [[REG2:d[0-9]+]], d3
45+
; ATTR: fmov [[REG1:d[0-9]+]], d2
46+
; ATTR-NEXT: bl {{_?foo_float}}
47+
; ATTR: fmov d0, [[REG1]]
48+
; ATTR: fmov d1, [[REG2]]
49+
%call = call float @foo_float(float %c, float %d)
50+
%call1 = call float @foo_float(float %c, float %d)
51+
unreachable
52+
}
53+
54+
declare float @foo_float(float, float)
55+
56+
define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
57+
entry:
58+
; CHECK-LABEL: t:
59+
; NOTCPU-LINUX: fmov s0, s2
60+
; NOTCPU-LINUX: fmov s1, s3
61+
; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
62+
; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
63+
; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
64+
; NOTCPU-LINUX: fmov s0, [[REG1]]
65+
; NOTCPU-LINUX: fmov s1, [[REG2]]
66+
67+
; NOTCPU-APPLE: fmov s0, s2
68+
; NOTCPU-APPLE: fmov s1, s3
69+
; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
70+
; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
71+
; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
72+
; NOTCPU-APPLE: fmov s0, [[REG1]]
73+
; NOTCPU-APPLE: fmov s1, [[REG2]]
74+
75+
; CPU: fmov [[REG2:d[0-9]+]], d3
76+
; CPU: fmov [[REG1:d[0-9]+]], d2
77+
; CPU: fmov d0, d2
78+
; CPU: fmov d1, d3
79+
; CPU-NEXT: bl {{_?foo_half}}
80+
; CPU: fmov d0, [[REG1]]
81+
; CPU: fmov d1, [[REG2]]
82+
83+
; NOTATTR: fmov [[REG2:s[0-9]+]], s3
84+
; NOTATTR: fmov [[REG1:s[0-9]+]], s2
85+
; NOTATTR: fmov s0, s2
86+
; NOTATTR: fmov s1, s3
87+
; NOTATTR-NEXT: bl {{_?foo_half}}
88+
; NOTATTR: fmov s0, [[REG1]]
89+
; NOTATTR: fmov s1, [[REG2]]
90+
91+
; ATTR: fmov d0, d2
92+
; ATTR: fmov d1, d3
93+
; ATTR: fmov [[REG2:d[0-9]+]], d3
94+
; ATTR: fmov [[REG1:d[0-9]+]], d2
95+
; ATTR-NEXT: bl {{_?foo_half}}
96+
; ATTR: fmov d0, [[REG1]]
97+
; ATTR: fmov d1, [[REG2]]
98+
%call = call half @foo_half(half %c, half %d)
99+
%call1 = call half @foo_half(half %c, half %d)
100+
unreachable
101+
}
102+
103+
declare half @foo_half(half, half)

0 commit comments

Comments
 (0)