diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 469c76752c78c..bcc5d438d9afc 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -612,6 +612,12 @@ def FeatureNoSVEFPLD1R : SubtargetFeature<"no-sve-fp-ld1r",
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
                                         "Has zero-cycle register moves">;
 
+def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFPR64", "true",
+                                        "Has zero-cycle register moves for FPR64 registers">;
+
+def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
+                                        "Has zero-cycle register moves for FPR32 registers">;
+
 def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
                                         "Has zero-cycle zeroing instructions for generic registers">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..c3837cfe73d28 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5302,30 +5302,81 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   if (AArch64::FPR32RegClass.contains(DestReg) &&
       AArch64::FPR32RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+    // If only 64-bit FPR moves are zero-cycle, widen the copy to the D
+    // super-registers so that it is emitted as an FMOVDr.
+    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32()) {
+      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
+                                                   &AArch64::FPR64RegClass);
+      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
+                                                  &AArch64::FPR64RegClass);
+      // This instruction is reading and writing D registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegD, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+          .addReg(SrcRegD, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
     return;
   }
 
   if (AArch64::FPR16RegClass.contains(DestReg) &&
       AArch64::FPR16RegClass.contains(SrcReg)) {
-    DestReg =
-        RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
-    SrcReg =
-        RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
-    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+    // If only 64-bit FPR moves are zero-cycle, widen the copy to the D
+    // super-registers so that it is emitted as an FMOVDr.
+    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32()) {
+      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                                   &AArch64::FPR64RegClass);
+      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                                  &AArch64::FPR64RegClass);
+      // This instruction is reading and writing D registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegD, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+          .addReg(SrcRegD, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
     return;
   }
 
   if (AArch64::FPR8RegClass.contains(DestReg) &&
       AArch64::FPR8RegClass.contains(SrcReg)) {
-    DestReg =
-        RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
-    SrcReg =
-        RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
-    BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+    // If only 64-bit FPR moves are zero-cycle, widen the copy to the D
+    // super-registers so that it is emitted as an FMOVDr.
+    if (Subtarget.hasZeroCycleRegMoveFPR64() &&
+        !Subtarget.hasZeroCycleRegMoveFPR32()) {
+      MCRegister DestRegD = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                                   &AArch64::FPR64RegClass);
+      MCRegister SrcRegD = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                                  &AArch64::FPR64RegClass);
+      // This instruction is reading and writing D registers. This may upset
+      // the register scavenger and machine verifier, so we need to indicate
+      // that we are reading an undefined value from SrcRegD, but a proper
+      // value from SrcReg.
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestRegD)
+          .addReg(SrcRegD, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
     return;
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index c7ea6393e2ad3..d8e4fcbcb5fa4 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -312,6 +312,7 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
                                     FeatureFuseAES, FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing,
                                     FeatureZCZeroingFPWorkaround]>;
 
@@ -325,6 +326,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -337,6 +339,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -349,6 +352,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -361,6 +365,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     FeatureFuseCryptoEOR,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -378,6 +383,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -395,6 +401,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -412,6 +419,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -429,6 +437,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     FeatureFuseLiterals,
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMove,
+                                    FeatureZCRegMoveFPR64,
                                     FeatureZCZeroing]>;
 
 def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -445,6 +454,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseLiterals,
                                      FeatureZCRegMove,
+                                     FeatureZCRegMoveFPR64,
                                      FeatureZCZeroing
                                      ]>;
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
new file mode 100644
index 0000000000000..f422f96f33495
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov-fpr.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s -check-prefixes=CHECK,NOTCPU-LINUX --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=CHECK,NOTCPU-APPLE --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=CHECK,CPU --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 | FileCheck %s -check-prefixes=CHECK,NOTATTR --match-full-lines
+; RUN: llc < %s -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 | FileCheck %s -check-prefixes=CHECK,ATTR --match-full-lines
+
+define void @zero_cycle_regmov_FPR32(float %a, float %b, float %c, float %d) {
+entry:
+; CHECK-LABEL: {{_?}}zero_cycle_regmov_FPR32:{{.*}}
+; NOTCPU-LINUX: fmov s0, s2
+; NOTCPU-LINUX: fmov s1, s3
+; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-LINUX-NEXT: bl {{_?foo_float}}
+; NOTCPU-LINUX: fmov s0, [[REG1]]
+; NOTCPU-LINUX: fmov s1, [[REG2]]
+
+; NOTCPU-APPLE: fmov s0, s2
+; NOTCPU-APPLE: fmov s1, s3
+; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-APPLE-NEXT: bl {{_?foo_float}}
+; NOTCPU-APPLE: fmov s0, [[REG1]]
+; NOTCPU-APPLE: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo_float}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo_float}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo_float}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+  %call = call float @foo_float(float %c, float %d)
+  %call1 = call float @foo_float(float %c, float %d)
+  unreachable
+}
+
+declare float @foo_float(float, float)
+
+define void @zero_cycle_regmov_FPR16(half %a, half %b, half %c, half %d) {
+entry:
+; CHECK-LABEL: {{_?}}zero_cycle_regmov_FPR16:{{.*}}
+; NOTCPU-LINUX: fmov s0, s2
+; NOTCPU-LINUX: fmov s1, s3
+; NOTCPU-LINUX: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-LINUX: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-LINUX-NEXT: bl {{_?foo_half}}
+; NOTCPU-LINUX: fmov s0, [[REG1]]
+; NOTCPU-LINUX: fmov s1, [[REG2]]
+
+; NOTCPU-APPLE: fmov s0, s2
+; NOTCPU-APPLE: fmov s1, s3
+; NOTCPU-APPLE: fmov [[REG2:s[0-9]+]], s3
+; NOTCPU-APPLE: fmov [[REG1:s[0-9]+]], s2
+; NOTCPU-APPLE-NEXT: bl {{_?foo_half}}
+; NOTCPU-APPLE: fmov s0, [[REG1]]
+; NOTCPU-APPLE: fmov s1, [[REG2]]
+
+; CPU: fmov [[REG2:d[0-9]+]], d3
+; CPU: fmov [[REG1:d[0-9]+]], d2
+; CPU: fmov d0, d2
+; CPU: fmov d1, d3
+; CPU-NEXT: bl {{_?foo_half}}
+; CPU: fmov d0, [[REG1]]
+; CPU: fmov d1, [[REG2]]
+
+; NOTATTR: fmov [[REG2:s[0-9]+]], s3
+; NOTATTR: fmov [[REG1:s[0-9]+]], s2
+; NOTATTR: fmov s0, s2
+; NOTATTR: fmov s1, s3
+; NOTATTR-NEXT: bl {{_?foo_half}}
+; NOTATTR: fmov s0, [[REG1]]
+; NOTATTR: fmov s1, [[REG2]]
+
+; ATTR: fmov d0, d2
+; ATTR: fmov d1, d3
+; ATTR: fmov [[REG2:d[0-9]+]], d3
+; ATTR: fmov [[REG1:d[0-9]+]], d2
+; ATTR-NEXT: bl {{_?foo_half}}
+; ATTR: fmov d0, [[REG1]]
+; ATTR: fmov d1, [[REG2]]
+  %call = call half @foo_half(half %c, half %d)
+  %call1 = call half @foo_half(half %c, half %d)
+  unreachable
+}
+
+declare half @foo_half(half, half)