Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Features.td
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,12 @@ def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
// Subtarget feature "zcm": sets the HasZeroCycleRegMove subtarget field to
// "true".
def FeatureZCRegMove
    : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
                       "Has zero-cycle register moves">;

// Subtarget feature "zcm-fpr64": sets the HasZeroCycleRegMoveFPR64 subtarget
// field to "true".
// NOTE(review): AArch64InstrInfo::copyPhysReg appears to emit FMOVDr on the
// D super-registers for FPR32/FPR16/FPR8 copies when this feature is set and
// "zcm-fpr32" is not -- confirm against the current copyPhysReg.
def FeatureZCRegMoveFPR64
    : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
                       "Has zero-cycle register moves for FPR64 registers">;

// Subtarget feature "zcm-fpr32": sets the HasZeroCycleRegMoveFPR32 subtarget
// field to "true".
// NOTE(review): AArch64InstrInfo::copyPhysReg appears to keep FMOVSr for
// FPR32/FPR16/FPR8 copies whenever this feature is set, even if "zcm-fpr64"
// is also set -- confirm against the current copyPhysReg.
def FeatureZCRegMoveFPR32
    : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
                       "Has zero-cycle register moves for FPR32 registers">;

// Subtarget feature "zcz-gp": sets the HasZeroCycleZeroingGP subtarget field
// to "true".
def FeatureZCZeroingGP
    : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
                       "Has zero-cycle zeroing instructions for generic registers">;

Expand Down
76 changes: 62 additions & 14 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5302,30 +5302,78 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,

if (AArch64::FPR32RegClass.contains(DestReg) &&
AArch64::FPR32RegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
!Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::ssub,
&AArch64::FPR64RegClass);
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::ssub,
&AArch64::FPR64RegClass);
// This instruction is reading and writing D registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegD, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
.addReg(SrcRegD, RegState::Undef)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep these blank lines after the return; }'s. They help draw your eye to their early-return-ness.

if (AArch64::FPR16RegClass.contains(DestReg) &&
AArch64::FPR16RegClass.contains(SrcReg)) {
DestReg =
RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
SrcReg =
RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
!Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR64RegClass);
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::hsub,
&AArch64::FPR64RegClass);
// This instruction is reading and writing D registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegD, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
.addReg(SrcRegD, RegState::Undef)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
&AArch64::FPR32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
&AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

if (AArch64::FPR8RegClass.contains(DestReg) &&
AArch64::FPR8RegClass.contains(SrcReg)) {
DestReg =
RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
SrcReg =
RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
if (Subtarget.hasZeroCycleRegMoveFPR64() &&
!Subtarget.hasZeroCycleRegMoveFPR32()) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
MCRegister DestRegD = TRI->getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR64RegClass);
MCRegister SrcRegD = TRI->getMatchingSuperReg(SrcReg, AArch64::bsub,
&AArch64::FPR64RegClass);
// This instruction is reading and writing D registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegD, but a proper
// value from SrcReg.
BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
.addReg(SrcRegD, RegState::Undef)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
} else {
DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
&AArch64::FPR32RegClass);
SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
&AArch64::FPR32RegClass);
BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}

Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AArch64/AArch64Processors.td
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureFuseAES, FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing,
FeatureZCZeroingFPWorkaround]>;

Expand All @@ -325,6 +326,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
Expand All @@ -337,6 +339,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
Expand All @@ -349,6 +352,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
Expand All @@ -361,6 +365,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureFuseCryptoEOR,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
Expand All @@ -378,6 +383,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
Expand All @@ -395,6 +401,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
Expand All @@ -412,6 +419,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
Expand All @@ -429,6 +437,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureFuseLiterals,
FeatureStorePairSuppress,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing]>;

def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
Expand All @@ -445,6 +454,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseCryptoEOR,
FeatureFuseLiterals,
FeatureZCRegMove,
FeatureZCRegMoveFPR64,
FeatureZCZeroing
]>;

Expand Down
103 changes: 103 additions & 0 deletions llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=NOTCPU-LINUX --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=NOTCPU-APPLE --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CPU --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 | FileCheck %s -check-prefixes=NOTATTR --match-full-lines
; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=ATTR --match-full-lines

; Calls @foo_float twice with the same (%c, %d) arguments. The FileCheck
; directives below pin the fmov register copies expected before and after the
; first call, one group per check prefix used by the RUN lines of this file.
define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
entry:
; NOTE(review): no RUN line in this file enables the CHECK prefix, so the
; CHECK-LABEL directive below is never evaluated. It also names `t`, which is
; not a function defined in this file -- confirm whether a per-function label
; was intended.
; CHECK-LABEL: t:
; Prefix of the arm64-linux-gnu RUN line; S-register fmov expected.
; NOTCPU-LINUX: fmov s0, s2
; NOTCPU-LINUX: fmov s1, s3
; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
; NOTCPU-LINUX: fmov s0, [[REG1]]
; NOTCPU-LINUX: fmov s1, [[REG2]]

; Prefix of the arm64-apple-macosx -mcpu=generic RUN line; S-register fmov
; expected.
; NOTCPU-APPLE: fmov s0, s2
; NOTCPU-APPLE: fmov s1, s3
; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
; NOTCPU-APPLE: fmov s0, [[REG1]]
; NOTCPU-APPLE: fmov s1, [[REG2]]

; Prefix of the -mcpu=apple-m1 RUN line; D-register fmov expected.
; CPU: fmov [[REG2:d[0-9]+]], d3
; CPU: fmov [[REG1:d[0-9]+]], d2
; CPU: fmov d0, d2
; CPU: fmov d1, d3
; CPU-NEXT: bl {{_?foo_float}}
; CPU: fmov d0, [[REG1]]
; CPU: fmov d1, [[REG2]]

; Prefix of the -mcpu=apple-m1 -mattr=-zcm-fpr64 RUN line; S-register fmov
; expected.
; NOTATTR: fmov [[REG2:s[0-9]+]], s3
; NOTATTR: fmov [[REG1:s[0-9]+]], s2
; NOTATTR: fmov s0, s2
; NOTATTR: fmov s1, s3
; NOTATTR-NEXT: bl {{_?foo_float}}
; NOTATTR: fmov s0, [[REG1]]
; NOTATTR: fmov s1, [[REG2]]

; Prefix of the -mattr=+zcm-fpr64 RUN line (no -mcpu given); D-register fmov
; expected.
; ATTR: fmov d0, d2
; ATTR: fmov d1, d3
; ATTR: fmov [[REG2:d[0-9]+]], d3
; ATTR: fmov [[REG1:d[0-9]+]], d2
; ATTR-NEXT: bl {{_?foo_float}}
; ATTR: fmov d0, [[REG1]]
; ATTR: fmov d1, [[REG2]]
%call = call float @foo_float(float %c, float %d)
%call1 = call float @foo_float(float %c, float %d)
unreachable
}

declare float @foo_float(float, float)

; Calls @foo_half twice with the same (%c, %d) arguments. The FileCheck
; directives below pin the fmov register copies expected before and after the
; first call, one group per check prefix used by the RUN lines of this file.
define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
entry:
; NOTE(review): no RUN line in this file enables the CHECK prefix, so the
; CHECK-LABEL directive below is never evaluated. It also names `t`, which is
; not a function defined in this file -- confirm whether a per-function label
; was intended.
; CHECK-LABEL: t:
; Prefix of the arm64-linux-gnu RUN line; S-register fmov expected.
; NOTCPU-LINUX: fmov s0, s2
; NOTCPU-LINUX: fmov s1, s3
; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
; NOTCPU-LINUX: fmov s0, [[REG1]]
; NOTCPU-LINUX: fmov s1, [[REG2]]

; Prefix of the arm64-apple-macosx -mcpu=generic RUN line; S-register fmov
; expected.
; NOTCPU-APPLE: fmov s0, s2
; NOTCPU-APPLE: fmov s1, s3
; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
; NOTCPU-APPLE: fmov s0, [[REG1]]
; NOTCPU-APPLE: fmov s1, [[REG2]]

; Prefix of the -mcpu=apple-m1 RUN line; D-register fmov expected.
; CPU: fmov [[REG2:d[0-9]+]], d3
; CPU: fmov [[REG1:d[0-9]+]], d2
; CPU: fmov d0, d2
; CPU: fmov d1, d3
; CPU-NEXT: bl {{_?foo_half}}
; CPU: fmov d0, [[REG1]]
; CPU: fmov d1, [[REG2]]

; Prefix of the -mcpu=apple-m1 -mattr=-zcm-fpr64 RUN line; S-register fmov
; expected.
; NOTATTR: fmov [[REG2:s[0-9]+]], s3
; NOTATTR: fmov [[REG1:s[0-9]+]], s2
; NOTATTR: fmov s0, s2
; NOTATTR: fmov s1, s3
; NOTATTR-NEXT: bl {{_?foo_half}}
; NOTATTR: fmov s0, [[REG1]]
; NOTATTR: fmov s1, [[REG2]]

; Prefix of the -mattr=+zcm-fpr64 RUN line (no -mcpu given); D-register fmov
; expected.
; ATTR: fmov d0, d2
; ATTR: fmov d1, d3
; ATTR: fmov [[REG2:d[0-9]+]], d3
; ATTR: fmov [[REG1:d[0-9]+]], d2
; ATTR-NEXT: bl {{_?foo_half}}
; ATTR: fmov d0, [[REG1]]
; ATTR: fmov d1, [[REG2]]
%call = call half @foo_half(half %c, half %d)
%call1 = call half @foo_half(half %c, half %d)
unreachable
}

declare half @foo_half(half, half)