diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 446bf45747a9..60e45096be41 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include
+
 #include "CIRGenCXXABI.h"
 #include "CIRGenCall.h"
 #include "CIRGenFunction.h"
@@ -2158,7 +2160,7 @@ static mlir::Value buildArmLdrexNon128Intrinsic(unsigned int builtinID,
   }
 }
 
-mlir::Value buildNeonCall(unsigned int builtinID, CIRGenFunction &cgf,
+mlir::Value buildNeonCall(CIRGenBuilderTy &builder,
                           llvm::SmallVector<mlir::Type> argTypes,
                           llvm::SmallVectorImpl<mlir::Value> &args,
                           llvm::StringRef intrinsicName, mlir::Type funcResTy,
@@ -2175,7 +2177,6 @@ mlir::Value buildNeonCall(unsigned int builtinID, CIRGenFunction &cgf,
   if (shift > 0)
     llvm_unreachable("Argument shift NYI");
 
-  CIRGenBuilderTy &builder = cgf.getBuilder();
   for (unsigned j = 0; j < argTypes.size(); ++j) {
     if (isConstrainedFPIntrinsic) {
       assert(!MissingFeatures::buildConstrainedFPCall());
@@ -2205,6 +2206,24 @@ static int64_t getIntValueFromConstOp(mlir::Value val) {
       .getSExtValue();
 }
 
+/// This function `buildCommonNeonCallPattern0` implements a common way
+/// to generate NEON intrinsic calls that follow this pattern:
+/// 1. The result of the intrinsic call is cast back to the expression type.
+/// 2. Function argument types are given explicitly, not deduced from the
+///    actual argument types.
+static mlir::Value
+buildCommonNeonCallPattern0(CIRGenFunction &cgf, std::string &intrinsicName,
+                            llvm::SmallVector<mlir::Type> argTypes,
+                            llvm::SmallVectorImpl<mlir::Value> &ops,
+                            mlir::Type funcResTy, const clang::CallExpr *e) {
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  mlir::Value res =
+      buildNeonCall(builder, std::move(argTypes), ops, intrinsicName,
+                    funcResTy, cgf.getLoc(e->getExprLoc()));
+  mlir::Type resultType = cgf.ConvertType(e->getType());
+  return builder.createBitcast(res, resultType);
+}
+
 mlir::Value CIRGenFunction::buildCommonNeonBuiltinExpr(
     unsigned builtinID, unsigned llvmIntrinsic, unsigned altLLVMIntrinsic,
     const char *nameHint, unsigned modifier, const CallExpr *e,
@@ -2269,18 +2288,25 @@ mlir::Value CIRGenFunction::buildCommonNeonBuiltinExpr(
   default:
     llvm::errs() << getAArch64SIMDIntrinsicString(builtinID) << " ";
     llvm_unreachable("NYI");
-  case NEON::BI__builtin_neon_vqadd_v:
-    mlir::Value res = buildNeonCall(builtinID, *this, {vTy, vTy}, ops,
-                                    (intrinicId != altLLVMIntrinsic)
-                                        ? "llvm.aarch64.neon.uqadd"
-                                        : "llvm.aarch64.neon.sqadd",
-                                    vTy, getLoc(e->getExprLoc()));
-    mlir::Type resultType = ConvertType(e->getType());
-    // AArch64 intrinsic one-element vector type cast to
-    // scalar type expected by the builtin
-    return builder.createBitcast(res, resultType);
+
+  case NEON::BI__builtin_neon_vpadd_v:
+  case NEON::BI__builtin_neon_vpaddq_v: {
+    std::string intrinsicName = mlir::isa(vTy.getEltType())
+                                    ? "llvm.aarch64.neon.faddp"
+                                    : "llvm.aarch64.neon.addp";
+    return buildCommonNeonCallPattern0(*this, intrinsicName, {vTy, vTy}, ops,
+                                       vTy, e);
     break;
   }
+  case NEON::BI__builtin_neon_vqadd_v: {
+    std::string intrinsicName = (intrinicId != altLLVMIntrinsic)
+                                    ? "llvm.aarch64.neon.uqadd"
+                                    : "llvm.aarch64.neon.sqadd";
+    return buildCommonNeonCallPattern0(*this, intrinsicName, {vTy, vTy}, ops,
+                                       vTy, e);
+    break;
+  }
+  }
   return nullptr;
 }
 
@@ -3085,9 +3111,8 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
     // The prototype of builtin_neon_vqrshrun_n can be found at
     // https://developer.arm.com/architectures/instruction-sets/intrinsics/
     return buildNeonCall(
-        BuiltinID, *this,
-        {builder.getExtendedElementVectorType(ty, true), SInt32Ty}, Ops,
-        "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
+        builder, {builder.getExtendedElementVectorType(ty, true), SInt32Ty},
+        Ops, "llvm.aarch64.neon.sqrshrun", ty, getLoc(E->getExprLoc()));
   case NEON::BI__builtin_neon_vqshrn_n_v:
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vrshrn_n_v:
@@ -3100,7 +3125,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vrnda_v:
   case NEON::BI__builtin_neon_vrndaq_v: {
     assert(!MissingFeatures::buildConstrainedFPCall());
-    return buildNeonCall(BuiltinID, *this, {ty}, Ops, "llvm.round", ty,
+    return buildNeonCall(builder, {ty}, Ops, "llvm.round", ty,
                          getLoc(E->getExprLoc()));
   }
   case NEON::BI__builtin_neon_vrndih_f16: {
@@ -3123,8 +3148,7 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vrndns_f32: {
     mlir::Value arg0 = buildScalarExpr(E->getArg(0));
     args.push_back(arg0);
-    return buildNeonCall(NEON::BI__builtin_neon_vrndns_f32, *this,
-                         {arg0.getType()}, args, "llvm.roundeven.f32",
+    return buildNeonCall(builder, {arg0.getType()}, args, "llvm.roundeven.f32",
                          getCIRGenModule().FloatTy, getLoc(E->getExprLoc()));
   }
   case NEON::BI__builtin_neon_vrndph_f16: {
diff --git a/clang/test/CIR/CodeGen/AArch64/neon-arith.c b/clang/test/CIR/CodeGen/AArch64/neon-arith.c
index 52d6d1a0c003..42c1fd389b17 100644
--- a/clang/test/CIR/CodeGen/AArch64/neon-arith.c
+++ b/clang/test/CIR/CodeGen/AArch64/neon-arith.c
@@ -1,8 +1,14 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
-// RUN:   -ffreestanding -emit-cir -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.cir
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
+// RUN:   -fclangir -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
+// RUN:   -emit-cir -o %t.cir %s
 // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
-// RUN:   -ffreestanding -emit-llvm -fno-clangir-call-conv-lowering -target-feature +neon %s -o %t.ll
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -target-feature +neon \
+// RUN:   -fclangir -disable-O0-optnone \
+// RUN:   -flax-vector-conversions=none -fno-clangir-call-conv-lowering \
+// RUN:   -emit-llvm -o - %s \
+// RUN:   | opt -S -passes=instcombine,mem2reg,simplifycfg -o %t.ll
 // RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -28,19 +34,9 @@ float32_t test_vrndns_f32(float32_t a) {
 // CIR: [[RET_VAL:%.*]] = cir.load [[RET_P]] : !cir.ptr, !cir.float
 // CIR: cir.return [[RET_VAL]] : !cir.float loc
 
-// LLVM: define dso_local float @test_vrndns_f32(float [[ARG:%.*]])
-// LLVM: store float [[ARG]], ptr [[ARG_SAVE:%.*]], align 4
-// LLVM: [[P0:%.*]] = load float, ptr [[ARG_SAVE]], align 4,
-// LLVM: store float [[P0]], ptr [[P0_SAVE:%.*]], align 4,
-// LLVM: [[INTRIN_ARG:%.*]] = load float, ptr [[P0_SAVE]], align 4,
-// LLVM: [[INTRIN_RES:%.*]] = call float @llvm.roundeven.f32(float [[INTRIN_ARG]])
-// LLVM: store float [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 4,
-// LLVM: [[RES_COPY0:%.*]] = load float, ptr [[RES_SAVE0]], align 4,
-// LLVM: store float [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 4,
-// LLVM: [[RES_COPY1:%.*]] = load float, ptr [[RES_SAVE1]], align 4,
-// LLVM: store float [[RES_COPY1]], ptr [[RET_P:%.*]], align 4,
-// LLVM: [[RET_VAL:%.*]] = load float, ptr [[RET_P]], align 4,
-// LLVM: ret float [[RET_VAL]]
+// LLVM: {{.*}}test_vrndns_f32(float{{.*}}[[ARG:%.*]])
+// LLVM: [[INTRIN_RES:%.*]] = call float @llvm.roundeven.f32(float [[ARG]])
+// LLVM: ret float [[INTRIN_RES]]
 
 float32x2_t test_vrnda_f32(float32x2_t a) {
   return vrnda_f32(a);
 }
@@ -62,19 +58,9 @@ float32x2_t test_vrnda_f32(float32x2_t a) {
 // CIR: [[RET_VAL:%.*]] = cir.load [[RET_P]] : !cir.ptr>, !cir.vector
 // CIR: cir.return [[RET_VAL]] : !cir.vector
 
-// LLVM: define dso_local <2 x float> @test_vrnda_f32(<2 x float> [[ARG:%.*]])
-// LLVM: store <2 x float> [[ARG]], ptr [[ARG_SAVE:%.*]], align 8
-// LLVM: [[P0:%.*]] = load <2 x float>, ptr [[ARG_SAVE]], align 8,
-// LLVM: store <2 x float> [[P0]], ptr [[P0_SAVE:%.*]], align 8,
-// LLVM: [[INTRIN_ARG:%.*]] = load <2 x float>, ptr [[P0_SAVE]], align 8,
-// LLVM: [[INTRIN_RES:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[INTRIN_ARG]])
-// LLVM: store <2 x float> [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 8,
-// LLVM: [[RES_COPY0:%.*]] = load <2 x float>, ptr [[RES_SAVE0]], align 8,
-// LLVM: store <2 x float> [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 8,
-// LLVM: [[RES_COPY1:%.*]] = load <2 x float>, ptr [[RES_SAVE1]], align 8,
-// LLVM: store <2 x float> [[RES_COPY1]], ptr [[RET_P:%.*]], align 8,
-// LLVM: [[RET_VAL:%.*]] = load <2 x float>, ptr [[RET_P]], align 8,
-// LLVM: ret <2 x float> [[RET_VAL]]
+// LLVM: {{.*}}test_vrnda_f32(<2 x float>{{.*}}[[ARG:%.*]])
+// LLVM: [[INTRIN_RES:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[ARG]])
+// LLVM: ret <2 x float> [[INTRIN_RES]]
 
 float32x4_t test_vrndaq_f32(float32x4_t a) {
   return vrndaq_f32(a);
 }
@@ -88,16 +74,147 @@ float32x4_t test_vrndaq_f32(float32x4_t a) {
 // CIR: {{%.*}} = cir.llvm.intrinsic "llvm.round" [[INTRIN_ARG_BACK]] : (!cir.vector) -> !cir.vector
 // CIR: cir.return {{%.*}} : !cir.vector
 
-// LLVM: define dso_local <4 x float> @test_vrndaq_f32(<4 x float> [[ARG:%.*]])
-// LLVM: store <4 x float> [[ARG]], ptr [[ARG_SAVE:%.*]], align 16
-// LLVM: [[P0:%.*]] = load <4 x float>, ptr [[ARG_SAVE]], align 16,
-// LLVM: store <4 x float> [[P0]], ptr [[P0_SAVE:%.*]], align 16,
-// LLVM: [[INTRIN_ARG:%.*]] = load <4 x float>, ptr [[P0_SAVE]], align 16,
-// LLVM: [[INTRIN_RES:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[INTRIN_ARG]])
-// LLVM: store <4 x float> [[INTRIN_RES]], ptr [[RES_SAVE0:%.*]], align 16,
-// LLVM: [[RES_COPY0:%.*]] = load <4 x float>, ptr [[RES_SAVE0]], align 16,
-// LLVM: store <4 x float> [[RES_COPY0]], ptr [[RES_SAVE1:%.*]], align 16,
-// LLVM: [[RES_COPY1:%.*]] = load <4 x float>, ptr [[RES_SAVE1]], align 16,
-// LLVM: store <4 x float> [[RES_COPY1]], ptr [[RET_P:%.*]], align 16,
-// LLVM: [[RET_VAL:%.*]] = load <4 x float>, ptr [[RET_P]], align 16,
-// LLVM: ret <4 x float> [[RET_VAL]]
+// LLVM: {{.*}}test_vrndaq_f32(<4 x float>{{.*}}[[ARG:%.*]])
+// LLVM: [[INTRIN_RES:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[ARG]])
+// LLVM: ret <4 x float> [[INTRIN_RES]]
+
+int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
+  return vpadd_s8(a, b);
+}
+
+// CIR-LABEL: vpadd_s8
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+
+// LLVM: {{.*}}test_vpadd_s8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// LLVM: ret <8 x i8> [[RES]]
+
+
+int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
+  return vpaddq_s8(a, b);
+}
+
+// CIR-LABEL: vpaddq_s8
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+
+// LLVM: {{.*}}test_vpaddq_s8(<16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// LLVM: ret <16 x i8> [[RES]]
+
+uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
+  return vpadd_u8(a, b);
+}
+
+// CIR-LABEL: vpadd_u8
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+
+// LLVM: {{.*}}test_vpadd_u8(<8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// LLVM: ret <8 x i8> [[RES]]
+
+int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
+  return vpadd_s16(a, b);
+}
+
+// CIR-LABEL: vpadd_s16
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpadd_s16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// LLVM: ret <4 x i16> [[RES]]
+
+int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
+  return vpaddq_s16(a, b);
+}
+
+// CIR-LABEL: vpaddq_s16
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpaddq_s16(<8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// LLVM: ret <8 x i16> [[RES]]
+
+uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
+  return vpadd_u16(a, b);
+}
+
+// CIR-LABEL: vpadd_u16
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpadd_u16(<4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// LLVM: ret <4 x i16> [[RES]]
+
+int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
+  return vpadd_s32(a, b);
+}
+
+// CIR-LABEL: vpadd_s32
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpadd_s32(<2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// LLVM: ret <2 x i32> [[RES]]
+
+int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
+  return vpaddq_s32(a, b);
+}
+
+// CIR-LABEL: vpaddq_s32
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpaddq_s32(<4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// LLVM: ret <4 x i32> [[RES]]
+
+float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
+  return vpadd_f32(a, b);
+}
+
+// CIR-LABEL: vpadd_f32
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpadd_f32(<2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// LLVM: ret <2 x float> [[RES]]
+
+float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
+  return vpaddq_f32(a, b);
+}
+
+// CIR-LABEL: vpaddq_f32
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpaddq_f32(<4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// LLVM: ret <4 x float> [[RES]]
+
+float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
+  return vpaddq_f64(a, b);
+}
+
+// CIR-LABEL: vpaddq_f64
+// CIR: [[RES:%.*]] = cir.llvm.intrinsic "llvm.aarch64.neon.addp" {{%.*}}, {{%.*}} :
+// CIR-SAME: (!cir.vector, !cir.vector) -> !cir.vector
+// CIR: {{%.*}} = cir.cast(bitcast, [[RES]] : !cir.vector), !cir.vector
+
+// LLVM: {{.*}}test_vpaddq_f64(<2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]])
+// LLVM: [[RES:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[A]], <2 x double> [[B]])
+// LLVM: ret <2 x double> [[RES]]
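
Note: the new vpadd/vpaddq tests above exercise AArch64 pairwise addition, where vpadd_f32(a, b) yields { a[0] + a[1], b[0] + b[1] } and the LLVM checks expect a call to @llvm.aarch64.neon.faddp.v2f32. The following is a minimal standalone C sketch of that semantics, not part of the patch or the lit test; the file name, main function, and compile command are illustrative assumptions for an AArch64 toolchain that provides arm_neon.h.

// pairwise_add_demo.c (illustrative only, not part of the patch)
// vpadd_f32(a, b) == { a[0] + a[1], b[0] + b[1] }
#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  float32x2_t a = {1.0f, 2.0f};
  float32x2_t b = {10.0f, 20.0f};
  // Pairwise add: lane 0 sums the lanes of a, lane 1 sums the lanes of b.
  float32x2_t r = vpadd_f32(a, b);
  // Expected output: 3.000000 30.000000
  printf("%f %f\n", vget_lane_f32(r, 0), vget_lane_f32(r, 1));
  return 0;
}

Built with something like clang --target=aarch64-linux-gnu -O2, the printed values match the per-pair sums that the FileCheck patterns above verify at the IR level.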