[AArch64][SVE] optimisation for SVE load intrinsics with no active lanes #95269
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-transforms

Author: None (Lukacma)

Changes: This patch extends #73964 and adds optimisation of load SVE intrinsics when the predicate is zero.

Patch is 21.18 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/95269.diff

2 Files Affected:
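In short: when the governing predicate of one of these SVE load intrinsics is a constant all-false vector, no lanes are loaded and the result is fully defined as zero, so the call can be replaced with zeroinitializer. A minimal before/after sketch, mirroring the first test case in the new test file (run through opt -S -passes=instcombine):

define <vscale x 16 x i8> @example(ptr %a) {
entry:
  ; The predicate is all-false, so no memory is accessed and every lane of the result is zero.
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
  ret <vscale x 16 x i8> %res
}

; After instcombine the body reduces to:
;   ret <vscale x 16 x i8> zeroinitializer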
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 9f5756fc7e401..1dc644cd6cb30 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -985,6 +985,33 @@ static bool isAllActivePredicate(Value *Pred) {
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
+// Simplify unary operation where predicate has all inactive lanes by replacing
+// instruction with zeroed object
+static std::optional<Instruction *>
+instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
+ if (match(II.getOperand(0), m_ZeroInt())) {
+ Constant *Node;
+ Type *RetTy = II.getType();
+ if (RetTy->isStructTy()) {
+ auto StructT = cast<StructType>(RetTy);
+ auto VecT = StructT->getElementType(0);
+ SmallVector<llvm::Constant *, 4> ZerVec;
+ for (unsigned i = 0; i < StructT->getNumElements(); i++) {
+ ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
+ : ConstantInt::get(VecT, 0));
+ }
+ Node = ConstantStruct::get(StructT, ZerVec);
+ } else if (RetTy->isFPOrFPVectorTy())
+ Node = ConstantFP::get(RetTy, 0.0);
+ else
+ Node = ConstantInt::get(II.getType(), 0);
+
+ IC.replaceInstUsesWith(II, Node);
+ return IC.eraseInstFromFunction(II);
+ }
+ return std::nullopt;
+}
+
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
// svsel(ptrue, x, y) => x
@@ -1398,6 +1425,10 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *PtrOp = II.getOperand(1);
Type *VecTy = II.getType();
+ // Replace by zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
if (isAllActivePredicate(Pred)) {
LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
Load->copyMetadata(II);
@@ -1745,6 +1776,10 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Type *Ty = II.getType();
Value *PassThru = ConstantAggregateZero::get(Ty);
+ // Replace by zero constant when all lanes are inactive
+ if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
+ return II_NA;
+
// Contiguous gather => masked load.
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
// => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
@@ -1971,6 +2006,41 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
switch (IID) {
default:
break;
+
+ case Intrinsic::aarch64_sve_ld1_gather:
+ case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw:
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw:
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_ld1q_gather_index:
+ case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
+ case Intrinsic::aarch64_sve_ld1ro:
+ case Intrinsic::aarch64_sve_ld1rq:
+ case Intrinsic::aarch64_sve_ld1udq:
+ case Intrinsic::aarch64_sve_ld1uwq:
+ case Intrinsic::aarch64_sve_ld2_sret:
+ case Intrinsic::aarch64_sve_ld2q_sret:
+ case Intrinsic::aarch64_sve_ld3_sret:
+ case Intrinsic::aarch64_sve_ld3q_sret:
+ case Intrinsic::aarch64_sve_ld4_sret:
+ case Intrinsic::aarch64_sve_ld4q_sret:
+ case Intrinsic::aarch64_sve_ldff1:
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_ldnf1:
+ case Intrinsic::aarch64_sve_ldnt1:
+ case Intrinsic::aarch64_sve_ldnt1_gather:
+ case Intrinsic::aarch64_sve_ldnt1_gather_index:
+ case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+ case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+ return instCombineSVENoActiveUnaryZero(IC, II);
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
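For the structure-returning variants (the ld2/ld3/ld4 *_sret and *q_sret intrinsics), the combine builds a zero ConstantStruct of the matching vector type rather than a single zero vector. A minimal IR sketch of the effect, mirroring test_ld2_sret in the new test file below:

define <vscale x 8 x i16> @example_ld2(ptr %a) {
entry:
  ; Both result vectors of the two-register load are known to be zero.
  %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
  %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
  ret <vscale x 8 x i16> %1
}

; After instcombine the extractvalue folds away and the body reduces to:
;   ret <vscale x 8 x i16> zeroinitializer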
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll
new file mode 100644
index 0000000000000..2470337f09180
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-comb-no-active-lanes-loads.ll
@@ -0,0 +1,395 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @test_ld1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 4 x i32> @test_ld1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ld1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_sxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_sxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_sxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_sxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ld1_gather_uxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1_gather_uxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+
+define <vscale x 2 x i64> @test_ld1q_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1q_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ld1q.gather.index.nxv2i64(<vscale x 1 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 8 x i16> @test_ld1q_gather_scalar_offset(<vscale x 2 x i64> %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld1q_gather_scalar_offset(
+; CHECK-SAME: <vscale x 2 x i64> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ld1q.gather.scalar.offset.nxv8i16.nxv2i64(<vscale x 1 x i1> zeroinitializer, <vscale x 2 x i64> %a, i64 0)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 16 x i8> @test_ld1q_gather_vector_offset(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1q_gather_vector_offset(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ld1q.gather.vector.offset.nxv16i8(<vscale x 1 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 16 x i8> @test_ld1ro(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1ro(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 16 x i8> @test_ld1rq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ld1rq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 2 x i64> @test_ld1udq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ld1udq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1udq.nxv2i64(<vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i32> @test_ld1uwq(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ld1uwq(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1uwq.nxv4i32(<vscale x 1 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 8 x i16> @test_ld2_sret(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld2_sret(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld2q_sret(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld2q_sret(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld3(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld3(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld3q(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld3q(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld4(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld4(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ld4q(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ld4q(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4q.sret.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ %1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %0, 0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 8 x i16> @test_ldff1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ldff1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ldff1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ldff1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.index.nxv2i64(<vscale x 2 x i1> zeroinitializer, ptr %a, <vscale x 2 x i64> %b)
+ ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_scalar_offset(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_scalar_offset(
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1> zeroinitializer, <vscale x 4 x i32> %a, i64 0)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_sxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_sxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_sxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_sxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_uxtw(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_uxtw(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 4 x i32> @test_ldff1_gather_uxtw_index(ptr %b, <vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldff1_gather_uxtw_index(
+; CHECK-SAME: ptr [[B:%.*]], <vscale x 4 x i32> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 16 x i8> @test_ldnf1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 16 x i8> @test_ldnf1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 16 x i8> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 16 x i8> %0
+}
+
+define <vscale x 8 x i16> @test_ldnt1(ptr %a) #0 {
+; CHECK-LABEL: define <vscale x 8 x i16> @test_ldnt1(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 8 x i16> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> zeroinitializer, ptr %a)
+ ret <vscale x 8 x i16> %0
+}
+
+define <vscale x 4 x i32> @test_ldnt1_gather(ptr %a, <vscale x 4 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 4 x i32> @test_ldnt1_gather(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 4 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 4 x i32> zeroinitializer
+;
+entry:
+ %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %a, <vscale x 4 x i64> %b)
+ ret <vscale x 4 x i32> %0
+}
+
+define <vscale x 2 x i64> @test_ldnt1_gather_index(ptr %a, <vscale x 2 x i64> %b) #0 {
+; CHECK-LABEL: define <vscale x 2 x i64> @test_ldnt1_gather_index(
+; CHECK-SAME: ptr [[A:%.*]], <vscale x 2 x i64> [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret <vscale x 2 x i64> z...
[truncated]
Thank you for splitting the patch, Marian.
I had a look at the loads and it looks fine to me.
;
entry:
  %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1> zeroinitializer, ptr %b, <vscale x 4 x i32> %a)
  ret <vscale x 4 x i32> %0
Nothing to do here, I believe this happens because of another optimisation.
define <vscale x 4 x i32> @test(ptr %b, <vscale x 4 x i32> %a) #0 {
%pg = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> zeroinitializer)
%1 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg, ptr %b, <vscale x 4 x i32> %a)
ret <vscale x 4 x i32> %1
}
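For reference, an assumed end state based on the comment above (the actual output is not shown in this thread): instcombine presumably first folds the convert.from.svbool of an all-false predicate to zeroinitializer, at which point this patch's combine fires and the function above reduces to:

define <vscale x 4 x i32> @test(ptr %b, <vscale x 4 x i32> %a) #0 {
  ; Assumed result after both combines are applied.
  ret <vscale x 4 x i32> zeroinitializer
}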
[AArch64][SVE] optimisation for SVE load intrinsics with no active lanes (llvm#95269): This patch extends llvm#73964 and adds optimisation of load SVE intrinsics when the predicate is zero.