60 changes: 42 additions & 18 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -11,6 +11,8 @@
//
//===----------------------------------------------------------------------===//

#include <utility>

#include "CIRGenCXXABI.h"
#include "CIRGenCall.h"
#include "CIRGenFunction.h"
@@ -2158,7 +2160,7 @@ static mlir::Value buildArmLdrexNon128Intrinsic(unsigned int builtinID,
}
}

mlir::Value buildNeonCall(unsigned int builtinID, CIRGenFunction &cgf,
mlir::Value buildNeonCall(CIRGenBuilderTy &builder,
llvm::SmallVector<mlir::Type> argTypes,
llvm::SmallVectorImpl<mlir::Value> &args,
llvm::StringRef intrinsicName, mlir::Type funcResTy,
@@ -2175,7 +2177,6 @@ mlir::Value buildNeonCall(unsigned int builtinID, CIRGenFunction &cgf,
if (shift > 0)
llvm_unreachable("Argument shift NYI");

CIRGenBuilderTy &builder = cgf.getBuilder();
for (unsigned j = 0; j < argTypes.size(); ++j) {
if (isConstrainedFPIntrinsic) {
assert(!MissingFeatures::buildConstrainedFPCall());
@@ -2205,6 +2206,24 @@ static int64_t getIntValueFromConstOp(mlir::Value val) {
.getSExtValue();
}

/// This function `buildCommonNeonCallPattern0` implements a common way
/// to generate Neon intrinsic calls that have the following pattern:
/// 1. The result of the intrinsic call needs to be cast back to the
///    expression type.
/// 2. Function argument types are given explicitly, not deduced from the
///    actual argument types.
static mlir::Value
buildCommonNeonCallPattern0(CIRGenFunction &cgf, std::string &intrincsName,
llvm::SmallVector<mlir::Type> argTypes,
llvm::SmallVectorImpl<mlir::Value> &ops,
mlir::Type funcResTy, const clang::CallExpr *e) {
CIRGenBuilderTy &builder = cgf.getBuilder();
mlir::Value res =
buildNeonCall(builder, std::move(argTypes), ops, intrincsName, funcResTy,
cgf.getLoc(e->getExprLoc()));
mlir::Type resultType = cgf.ConvertType(e->getType());
return builder.createBitcast(res, resultType);
}
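
// A minimal usage sketch (editorial illustration, not part of this patch):
// a hypothetical builtin case wired through the helper above. The builtin
// and intrinsic names are placeholders; the real call sites are in
// buildCommonNeonBuiltinExpr below.
//
//   case NEON::BI__builtin_neon_vfoo_v: {
//     std::string intrincsName = "llvm.aarch64.neon.foo"; // hypothetical
//     // Argument types are spelled out explicitly (point 2 above); the
//     // helper bitcasts the intrinsic result back to the expression type
//     // (point 1 above).
//     return buildCommonNeonCallPattern0(*this, intrincsName, {vTy, vTy},
//                                        ops, vTy, e);
//   }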

mlir::Value CIRGenFunction::buildCommonNeonBuiltinExpr(
unsigned builtinID, unsigned llvmIntrinsic, unsigned altLLVMIntrinsic,
const char *nameHint, unsigned modifier, const CallExpr *e,
@@ -2269,18 +2288,25 @@ mlir::Value CIRGenFunction::buildCommonNeonBuiltinExpr(
default:
llvm::errs() << getAArch64SIMDIntrinsicString(builtinID) << " ";
llvm_unreachable("NYI");
case NEON::BI__builtin_neon_vqadd_v:
mlir::Value res = buildNeonCall(builtinID, *this, {vTy, vTy}, ops,
(intrinicId != altLLVMIntrinsic)
? "llvm.aarch64.neon.uqadd"
: "llvm.aarch64.neon.sqadd",
vTy, getLoc(e->getExprLoc()));
mlir::Type resultType = ConvertType(e->getType());
// AArch64 intrinsic one-element vector type cast to
// scalar type expected by the builtin
return builder.createBitcast(res, resultType);

case NEON::BI__builtin_neon_vpadd_v:
case NEON::BI__builtin_neon_vpaddq_v: {
std::string intrincsName = mlir::isa<mlir::FloatType>(vTy.getEltType())
? "llvm.aarch64.neon.faddp"
: "llvm.aarch64.neon.addp";
return buildCommonNeonCallPattern0(*this, intrincsName, {vTy, vTy}, ops,
vTy, e);
}
case NEON::BI__builtin_neon_vqadd_v: {
std::string intrincsName = (intrinicId != altLLVMIntrinsic)
? "llvm.aarch64.neon.uqadd"
: "llvm.aarch64.neon.sqadd";
return buildCommonNeonCallPattern0(*this, intrincsName, {vTy, vTy}, ops,
vTy, e);
}
}
return nullptr;
}

@@ -3085,9 +3111,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
// The prototype of builtin_neon_vqrshrun_n can be found at
// https://developer.arm.com/architectures/instruction-sets/intrinsics/
return buildNeonCall(
BuiltinID, *this,
{builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
"llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
builder, {builder.getExtendedElementVectorType(ty, true), SInt32Ty},
Ops, "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
case NEON::BI__builtin_neon_vqshrn_n_v:
llvm_unreachable("NYI");
case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3100,7 +3125,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
case NEON::BI__builtin_neon_vrnda_v:
case NEON::BI__builtin_neon_vrndaq_v: {
assert(!MissingFeatures::buildConstrainedFPCall());
return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
return buildNeonCall(builder, {ty}, Ops, "llvm.round", ty,
getLoc(E->getExprLoc()));
}
case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3123,8 +3148,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
case NEON::BI__builtin_neon_vrndns_f32: {
mlir::Value arg0 = buildScalarExpr(E->getArg(0));
args.push_back(arg0);
return buildNeonCall(NEON::BI__builtin_neon_vrndns_f32, *this,
{arg0.getType()}, args, "llvm.roundeven.f32",
return buildNeonCall(builder, {arg0.getType()}, args, "llvm.roundeven.f32",
getCIRGenModule().FloatTy, getLoc(E->getExprLoc()));
}
case NEON::BI__builtin_neon_vrndph_f16: {
203 changes: 160 additions & 43 deletions clang/test/CIR/CodeGen/AArch64/neon-arith.c
@@ -1,8 +1,14 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -ffreestanding -emit-cir -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.cir
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
// RUN: -fclangir -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
// RUN: -emit-cir -o %t.cir %s
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -ffreestanding -emit-llvm -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.ll

// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
// RUN: -fclangir -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
// RUN: -emit-llvm -o - %s \
// RUN: | opt -S -passes=instcombine,mem2reg,simplifycfg -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
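// The opt pipeline above (instcombine, mem2reg, simplifycfg) folds away the
// alloca/load/store traffic that -O0 CodeGen emits, so the LLVM CHECK lines
// below only need to match the intrinsic call and the returned value.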

// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -28,19 +34,9 @@ float32_t test_vrndns_f32(float32_t a) {
// CIR: [[RET_VAL:%.*]] = cir.load [[RET_P]] : !cir.ptr<!cir.float>, !cir.float
// CIR: cir.return [[RET_VAL]] : !cir.float loc

// LLVM: define dso_local float @test_vrndns_f32(float [[ARG:%.*]])
// LLVM: store float [[ARG]], ptr [[ARG_SAVE:%.*]], align 4
// LLVM: [[P0:%.*]] = load float, ptr [[ARG_SAVE]], align 4,
// LLVM: store float [[P0]], ptr [[P0_SAVE:%.*]], align 4,
// LLVM: [[INTRIN_ARG:%.*]] = load float, ptr [[P0_SAVE]], align 4,
// LLVM: [[INTRIN_RES:%.*]] = call float @llvm.roundeven.f32(float [[INTRIN_ARG]])
// LLVM: store float [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 4,
// LLVM: [[RES_COPY0:%.*]] = load float, ptr [[RES_SAVE0]], align 4,
// LLVM: store float [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 4,
// LLVM: [[RES_COPY1:%.*]] = load float, ptr [[RES_SAVE1]], align 4,
// LLVM: store float [[RES_COPY1]], ptr [[RET_P:%.*]], align 4,
// LLVM: [[RET_VAL:%.*]] = load float, ptr [[RET_P]], align 4,
// LLVM: ret float [[RET_VAL]]
// LLVM: {{.*}}test_vrndns_f32(float{{.*}}[[ARG:%.*]])
// LLVM: [[INTRIN_RES:%.*]] = call float @llvm.roundeven.f32(float [[ARG]])
// LLVM: ret float [[INTRIN_RES]]

float32x2_t test_vrnda_f32(float32x2_t a) {
return vrnda_f32(a);
@@ -62,19 +58,9 @@ float32x2_t test_vrnda_f32(float32x2_t a) {
// CIR: [[RET_VAL:%.*]] = cir.load [[RET_P]] : !cir.ptr<!cir.vector<!cir.float x 2>>, !cir.vector<!cir.float x 2>
// CIR: cir.return [[RET_VAL]] : !cir.vector<!cir.float x 2>

// LLVM: define dso_local <2 x float> @test_vrnda_f32(<2 x float> [[ARG:%.*]])
// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE:%.*]], align 8
// LLVM: [[P0:%.*]] = load <2 x float>, ptr [[ARG_SAVE]], align 8,
// LLVM: store <2 x float> [[P0]], ptr [[P0_SAVE:%.*]], align 8,
// LLVM: [[INTRIN_ARG:%.*]] = load <2 x float>, ptr [[P0_SAVE]], align 8,
// LLVM: [[INTRIN_RES:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[INTRIN_ARG]])
// LLVM: store <2 x float> [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 8,
// LLVM: [[RES_COPY0:%.*]] = load <2 x float>, ptr [[RES_SAVE0]], align 8,
// LLVM: store <2 x float> [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 8,
// LLVM: [[RES_COPY1:%.*]] = load <2 x float>, ptr [[RES_SAVE1]], align 8,
// LLVM: store <2 x float> [[RES_COPY1]], ptr [[RET_P:%.*]], align 8,
// LLVM: [[RET_VAL:%.*]] = load <2 x float>, ptr [[RET_P]], align 8,
// LLVM: ret <2 x float> [[RET_VAL]]
// LLVM: {{.*}}test_vrnda_f32(<2 x float>{{.*}}[[ARG:%.*]])
// LLVM: [[INTRIN_RES:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[ARG]])
// LLVM: ret <2 x float> [[INTRIN_RES]]

float32x4_t test_vrndaq_f32(float32x4_t a) {
return vrndaq_f32(a);
@@ -88,16 +74,147 @@ float32x4_t test_vrndaq_f32(float32x4_t a) {
// CIR: {{%.*}} = cir.llvm.intrinsic "llvm.round" [[INTRIN_ARG_BACK]] : (!cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
// CIR: cir.return {{%.*}} : !cir.vector<!cir.float x 4>

// LLVM: define dso_local <4 x float> @test_vrndaq_f32(<4 x float> [[ARG:%.*]])
// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE:%.*]], align 16
// LLVM: [[P0:%.*]] = load <4 x float>, ptr [[ARG_SAVE]], align 16,
// LLVM: store <4 x float> [[P0]], ptr [[P0_SAVE:%.*]], align 16,
// LLVM: [[INTRIN_ARG:%.*]] = load <4 x float>, ptr [[P0_SAVE]], align 16,
// LLVM: [[INTRIN_RES:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[INTRIN_ARG]])
// LLVM: store <4 x float> [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 16,
// LLVM: [[RES_COPY0:%.*]] = load <4 x float>, ptr [[RES_SAVE0]], align 16,
// LLVM: store <4 x float> [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 16,
// LLVM: [[RES_COPY1:%.*]] = load <4 x float>, ptr [[RES_SAVE1]], align 16,
// LLVM: store <4 x float> [[RES_COPY1]], ptr [[RET_P:%.*]], align 16,
// LLVM: [[RET_VAL:%.*]] = load <4 x float>, ptr [[RET_P]], align 16,
// LLVM: ret <4 x float> [[RET_VAL]]
// LLVM: {{.*}}test_vrndaq_f32(<4 x float>{{.*}}[[ARG:%.*]])
// LLVM: [[INTRIN_RES:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[ARG]])
// LLVM: ret <4 x float> [[INTRIN_RES]]

int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
return vpadd_s8(a, b);
}

// CIR-LABEL: vpadd_s8
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s8i x 8>, !cir.vector<!s8i x 8>) -> !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_s8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
// LLVM: ret <8 x i8> [[RES]]


int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
return vpaddq_s8(a, b);
}

// CIR-LABEL: vpaddq_s8
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s8i x 16>, !cir.vector<!s8i x 16>) -> !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_s8(<16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// LLVM: ret <16 x i8> [[RES]]

uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
return vpadd_u8(a, b);
}

// CIR-LABEL: vpadd_u8
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u8i x 8>, !cir.vector<!u8i x 8>) -> !cir.vector<!u8i x 8>

// LLVM: {{.*}}test_vpadd_u8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
// LLVM: ret <8 x i8> [[RES]]

int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
return vpadd_s16(a, b);
}

// CIR-LABEL: vpadd_s16
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s16i x 4>, !cir.vector<!s16i x 4>) -> !cir.vector<!s16i x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s16i x 4>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_s16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
// LLVM: ret <4 x i16> [[RES]]

int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
return vpaddq_s16(a, b);
}

// CIR-LABEL: vpaddq_s16
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s16i x 8>, !cir.vector<!s16i x 8>) -> !cir.vector<!s16i x 8>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s16i x 8>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_s16(<8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
// LLVM: ret <8 x i16> [[RES]]

uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
return vpadd_u16(a, b);
}

// CIR-LABEL: vpadd_u16
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!u16i x 4>, !cir.vector<!u16i x 4>) -> !cir.vector<!u16i x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!u16i x 4>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_u16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
// LLVM: ret <4 x i16> [[RES]]

int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
return vpadd_s32(a, b);
}

// CIR-LABEL: vpadd_s32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s32i x 2>, !cir.vector<!s32i x 2>) -> !cir.vector<!s32i x 2>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s32i x 2>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_s32(<2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
// LLVM: ret <2 x i32> [[RES]]

int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
return vpaddq_s32(a, b);
}

// CIR-LABEL: vpaddq_s32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!s32i x 4>, !cir.vector<!s32i x 4>) -> !cir.vector<!s32i x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!s32i x 4>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_s32(<4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
// LLVM: ret <4 x i32> [[RES]]

float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
return vpadd_f32(a, b);
}

// CIR-LABEL: vpadd_f32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!cir.float x 2>, !cir.vector<!cir.float x 2>) -> !cir.vector<!cir.float x 2>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!cir.float x 2>), !cir.vector<!s8i x 8>

// LLVM: {{.*}}test_vpadd_f32(<2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// LLVM: ret <2 x float> [[RES]]

float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
return vpaddq_f32(a, b);
}

// CIR-LABEL: vpaddq_f32
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!cir.float x 4>, !cir.vector<!cir.float x 4>) -> !cir.vector<!cir.float x 4>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!cir.float x 4>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_f32(<4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// LLVM: ret <4 x float> [[RES]]

float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
return vpaddq_f64(a, b);
}

// CIR-LABEL: vpaddq_f64
// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
// CIR-SAME: (!cir.vector<!cir.double x 2>, !cir.vector<!cir.double x 2>) -> !cir.vector<!cir.double x 2>
// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector<!cir.double x 2>), !cir.vector<!s8i x 16>

// LLVM: {{.*}}test_vpaddq_f64(<2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]])
// LLVM: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
// LLVM: ret <2 x double> [[RES]]