diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 6c7e3d055456a..9bf8975414951 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -131,6 +131,14 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::IntType::get(getContext(), n, false);
   }
 
+  static unsigned getCIRIntOrFloatBitWidth(mlir::Type eltTy) {
+    if (auto intType = mlir::dyn_cast<cir::IntType>(eltTy))
+      return intType.getWidth();
+    if (auto floatType = mlir::dyn_cast<cir::FPTypeInterface>(eltTy))
+      return floatType.getWidth();
+
+    llvm_unreachable("Unsupported type in getCIRIntOrFloatBitWidth");
+  }
   cir::IntType getSIntNTy(int n) {
     return cir::IntType::get(getContext(), n, true);
   }
@@ -584,6 +592,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::CmpOp::create(*this, loc, getBoolTy(), kind, lhs, rhs);
   }
 
+  cir::VecCmpOp createVecCompare(mlir::Location loc, cir::CmpOpKind kind,
+                                 mlir::Value lhs, mlir::Value rhs) {
+    VectorType vecCast = mlir::cast<VectorType>(lhs.getType());
+    IntType integralTy =
+        getSIntNTy(getCIRIntOrFloatBitWidth(vecCast.getElementType()));
+    VectorType integralVecTy =
+        VectorType::get(context, integralTy, vecCast.getSize());
+    return cir::VecCmpOp::create(*this, loc, integralVecTy, kind, lhs, rhs);
+  }
+
   mlir::Value createIsNaN(mlir::Location loc, mlir::Value operand) {
     return createCompare(loc, cir::CmpOpKind::ne, operand, operand);
   }
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 567c79a27c07b..525d2b00392ff 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -258,6 +258,7 @@ struct MissingFeatures {
   static bool emitBranchThroughCleanup() { return false; }
   static bool emitCheckedInBoundsGEP() { return false; }
   static bool emitCondLikelihoodViaExpectIntrinsic() { return false; }
+  static bool emitConstrainedFPCall() { return false; }
   static bool emitLifetimeMarkers() { return false; }
   static bool emitLValueAlignmentAssumption() { return false; }
   static bool emitNullCheckForDeleteCalls() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 2d6cf30fa2ded..c366b7e61f0f9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -34,18 +34,53 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e,
       .getResult();
 }
 
+// OG (classic codegen) supports unordered comparisons in addition to ordered
+// comparisons as a form of optimization, while CIR doesn't.
+//
+// This means that we can't encode the comparison code of UGT (unordered
+// greater than), at least not at the CIR level.
+//
+// The boolean shouldInvert compensates for this.
+// For example: to get to the comparison code UGT, we pass in
+// emitVectorFCmp(OLE, shouldInvert = true), since OLE is the inverse of UGT.
+//
+// There are several other ways this could be supported:
+// - Register extra CmpOpKinds for the unordered comparison types and build
+//   the translation code to go from CIR -> LLVM dialect. Note that we get
+//   this naturally with shouldInvert, benefiting from existing
+//   infrastructure, albeit having to generate an extra `not` at the CIR
+//   level.
+// - Just add the extra comparison codes to a new VecCmpOpKind instead of
+//   cluttering CmpOpKind.
+// - Add a boolean to VecCmpOp to indicate whether it is doing an unordered
+//   or an ordered comparison.
+// - Just emit the intrinsic call instead of calling this helper; see how the
+//   LLVM lowering handles this.
+static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
+                                  llvm::SmallVector<mlir::Value> &ops,
+                                  mlir::Location loc, cir::CmpOpKind pred,
+                                  bool shouldInvert) {
+  assert(!cir::MissingFeatures::cgFPOptionsRAII());
+  // TODO(cir): Add isSignaling boolean once emitConstrainedFPCall implemented
+  assert(!cir::MissingFeatures::emitConstrainedFPCall());
+  mlir::Value cmp = builder.createVecCompare(loc, pred, ops[0], ops[1]);
+  mlir::Value bitCast = builder.createBitcast(
+      shouldInvert ? builder.createNot(cmp) : cmp, ops[0].getType());
+  return bitCast;
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
-                                               const CallExpr *e) {
+                                               const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
-    cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is");
+    cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_is");
     return {};
   }
   if (builtinID == Builtin::BI__builtin_cpu_supports) {
-    cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports");
+    cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_supports");
     return {};
   }
   if (builtinID == Builtin::BI__builtin_cpu_init) {
-    cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init");
+    cgm.errorNYI(expr->getSourceRange(), "__builtin_cpu_init");
     return {};
   }
 
@@ -66,7 +101,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   getContext().GetBuiltinType(builtinID, error, &iceArguments);
   assert(error == ASTContext::GE_None && "Error while getting builtin type.");
 
-  for (auto [idx, arg] : llvm::enumerate(e->arguments())) {
+  for (auto [idx, arg] : llvm::enumerate(expr->arguments())) {
     ops.push_back(emitScalarOrConstFoldImmArg(iceArguments, idx, arg));
   }
 
@@ -77,15 +112,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   default:
     return {};
   case X86::BI_mm_clflush:
-    return emitIntrinsicCallOp(*this, e, "x86.sse2.clflush", voidTy, ops[0]);
+    return emitIntrinsicCallOp(*this, expr, "x86.sse2.clflush", voidTy, ops[0]);
   case X86::BI_mm_lfence:
-    return emitIntrinsicCallOp(*this, e, "x86.sse2.lfence", voidTy);
+    return emitIntrinsicCallOp(*this, expr, "x86.sse2.lfence", voidTy);
   case X86::BI_mm_pause:
-    return emitIntrinsicCallOp(*this, e, "x86.sse2.pause", voidTy);
+    return emitIntrinsicCallOp(*this, expr, "x86.sse2.pause", voidTy);
   case X86::BI_mm_mfence:
-    return emitIntrinsicCallOp(*this, e, "x86.sse2.mfence", voidTy);
+    return emitIntrinsicCallOp(*this, expr, "x86.sse2.mfence", voidTy);
   case X86::BI_mm_sfence:
-    return emitIntrinsicCallOp(*this, e, "x86.sse.sfence", voidTy);
+    return emitIntrinsicCallOp(*this, expr, "x86.sse.sfence", voidTy);
   case X86::BI_mm_prefetch:
   case X86::BI__rdtsc:
   case X86::BI__builtin_ia32_rdtscp:
@@ -741,10 +776,18 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_cmpunordpd:
   case X86::BI__builtin_ia32_cmpneqps:
   case X86::BI__builtin_ia32_cmpneqpd:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_cmpnltps:
   case X86::BI__builtin_ia32_cmpnltpd:
+    return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()),
+                          cir::CmpOpKind::lt, /*shouldInvert=*/true);
   case X86::BI__builtin_ia32_cmpnleps:
   case X86::BI__builtin_ia32_cmpnlepd:
+    return emitVectorFCmp(builder, ops, getLoc(expr->getExprLoc()),
+                          cir::CmpOpKind::le, /*shouldInvert=*/true);
   case X86::BI__builtin_ia32_cmpordps:
   case X86::BI__builtin_ia32_cmpordpd:
   case X86::BI__builtin_ia32_cmpph128_mask:
@@ -829,7 +872,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
   case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
   case X86::BI__builtin_ia32_prefetchi:
-    cgm.errorNYI(e->getSourceRange(),
+    cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented X86 builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinID));
     return {};
diff --git a/clang/test/CIR/CodeGen/builtin-fcmp-sse.c b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c
new file mode 100644
index 0000000000000..c273d6b3fca0e
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtin-fcmp-sse.c
@@ -0,0 +1,213 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
+
+__m128 test_cmpnleps(__m128 A, __m128 B) {
+  // CIR-LABEL: cir.func dso_local @test_cmpnleps(
+  // CIR: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) {
+  // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["A", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["B", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["__retval"] {alignment = 16 : i64}
+  // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+  // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+  // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+  // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+  // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+  // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+  // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float>
+  // CIR: }
+
+  // LLVM-LABEL: define dso_local <4 x float> @test_cmpnleps(
+  // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+  // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16
+  // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16
+  // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16
+  // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16
+  // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16
+  // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <4 x float> [[TMP6]], [[TMP7]]
+  // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+  // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1)
+  // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float>
+  // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16
+  // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16
+  // LLVM-NEXT: ret <4 x float> [[TMP12]]
+
+  // OGCG-LABEL: define dso_local <4 x float> @test_cmpnleps(
+  // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+  // OGCG-NEXT: [[ENTRY:.*:]]
+  // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+  // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+  // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP0]], [[TMP1]]
+  // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+  // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+  // OGCG-NEXT: ret <4 x float> [[TMP4]]
+  return __builtin_ia32_cmpnleps(A, B);
+}
+
+__m128d test_cmpnlepd(__m128d A, __m128d B) {
+  // CIR-LABEL: cir.func dso_local @test_cmpnlepd(
+  // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) {
+  // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["A", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["B", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["__retval"] {alignment = 16 : i64}
+  // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+  // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+  // CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+  // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+  // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
+  // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+  // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double>
+  // CIR: }
+
+  // LLVM-LABEL: define dso_local <2 x double> @test_cmpnlepd(
+  // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+  // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16
+  // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16
+  // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16
+  // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16
+  // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
+  // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP8:%.*]] = fcmp ole <2 x double> [[TMP6]], [[TMP7]]
+  // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64>
+  // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1)
+  // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double>
+  // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16
+  // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16
+  // LLVM-NEXT: ret <2 x double> [[TMP12]]
+
+  // OGCG-LABEL: define dso_local <2 x double> @test_cmpnlepd(
+  // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+  // OGCG-NEXT: [[ENTRY:.*:]]
+  // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+  // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+  // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP2:%.*]] = fcmp ugt <2 x double> [[TMP0]], [[TMP1]]
+  // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+  // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+  // OGCG-NEXT: ret <2 x double> [[TMP4]]
+  return __builtin_ia32_cmpnlepd(A, B);
+}
+
+__m128 test_cmpnltps(__m128 A, __m128 B) {
+  // CIR-LABEL: cir.func dso_local @test_cmpnltps(
+  // CIR-SAME: %[[ARG0:.*]]: !cir.vector<4 x !cir.float> {{.*}}, %[[ARG1:.*]]: !cir.vector<4 x !cir.float> {{.*}}) -> !cir.vector<4 x !cir.float> inline(never) {
+  // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["A", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["B", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["__retval"] {alignment = 16 : i64}
+  // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+  // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+  // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+  // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+  // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
+  // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
+  // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
+  // CIR: cir.return %[[LOAD_2]] : !cir.vector<4 x !cir.float>
+  // CIR: }
+
+  // LLVM-LABEL: define dso_local <4 x float> @test_cmpnltps(
+  // LLVM-SAME: <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+  // LLVM-NEXT: [[TMP3:%.*]] = alloca <4 x float>, i64 1, align 16
+  // LLVM-NEXT: [[TMP4:%.*]] = alloca <4 x float>, i64 1, align 16
+  // LLVM-NEXT: [[TMP5:%.*]] = alloca <4 x float>, i64 1, align 16
+  // LLVM-NEXT: store <4 x float> [[TMP0]], ptr [[TMP3]], align 16
+  // LLVM-NEXT: store <4 x float> [[TMP1]], ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP6:%.*]] = load <4 x float>, ptr [[TMP3]], align 16
+  // LLVM-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <4 x float> [[TMP6]], [[TMP7]]
+  // LLVM-NEXT: [[TMP9:%.*]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+  // LLVM-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP9]], splat (i32 -1)
+  // LLVM-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <4 x float>
+  // LLVM-NEXT: store <4 x float> [[TMP11]], ptr [[TMP5]], align 16
+  // LLVM-NEXT: [[TMP12:%.*]] = load <4 x float>, ptr [[TMP5]], align 16
+  // LLVM-NEXT: ret <4 x float> [[TMP12]]
+
+  // OGCG-LABEL: define dso_local <4 x float> @test_cmpnltps(
+  // OGCG-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+  // OGCG-NEXT: [[ENTRY:.*:]]
+  // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+  // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+  // OGCG-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <4 x float> [[TMP0]], [[TMP1]]
+  // OGCG-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+  // OGCG-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+  // OGCG-NEXT: ret <4 x float> [[TMP4]]
+  return __builtin_ia32_cmpnltps(A, B);
+}
+
+__m128d test_cmpnltpd(__m128d A, __m128d B) {
+  // CIR-LABEL: cir.func dso_local @test_cmpnltpd(
+  // CIR: %[[ARG0:.*]]: !cir.vector<2 x !cir.double> {{.*}}, %[[ARG1:.*]]: !cir.vector<2 x !cir.double> {{.*}}) -> !cir.vector<2 x !cir.double> inline(never) {
+  // CIR: %[[ALLOCA_0:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["A", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_1:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["B", init] {alignment = 16 : i64}
+  // CIR: %[[ALLOCA_2:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>, ["__retval"] {alignment = 16 : i64}
+  // CIR: cir.store %[[ARG0]], %[[ALLOCA_0]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR: cir.store %[[ARG1]], %[[ALLOCA_1]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+  // CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+  // CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
+  // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+  // CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
+  // CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
+  // CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
+  // CIR: cir.return %[[LOAD_2]] : !cir.vector<2 x !cir.double>
+  // CIR: }
+
+  // LLVM-LABEL: define dso_local <2 x double> @test_cmpnltpd(
+  // LLVM-SAME: <2 x double> [[TMP0:%.*]], <2 x double> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+  // LLVM-NEXT: [[TMP3:%.*]] = alloca <2 x double>, i64 1, align 16
+  // LLVM-NEXT: [[TMP4:%.*]] = alloca <2 x double>, i64 1, align 16
+  // LLVM-NEXT: [[TMP5:%.*]] = alloca <2 x double>, i64 1, align 16
+  // LLVM-NEXT: store <2 x double> [[TMP0]], ptr [[TMP3]], align 16
+  // LLVM-NEXT: store <2 x double> [[TMP1]], ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16
+  // LLVM-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[TMP4]], align 16
+  // LLVM-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP6]], [[TMP7]]
+  // LLVM-NEXT: [[TMP9:%.*]] = sext <2 x i1> [[TMP8]] to <2 x i64>
+  // LLVM-NEXT: [[TMP10:%.*]] = xor <2 x i64> [[TMP9]], splat (i64 -1)
+  // LLVM-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <2 x double>
+  // LLVM-NEXT: store <2 x double> [[TMP11]], ptr [[TMP5]], align 16
+  // LLVM-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[TMP5]], align 16
+  // LLVM-NEXT: ret <2 x double> [[TMP12]]
+
+  // OGCG-LABEL: define dso_local <2 x double> @test_cmpnltpd(
+  // OGCG-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+  // OGCG-NEXT: [[ENTRY:.*:]]
+  // OGCG-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+  // OGCG-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+  // OGCG-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+  // OGCG-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+  // OGCG-NEXT: [[TMP2:%.*]] = fcmp uge <2 x double> [[TMP0]], [[TMP1]]
+  // OGCG-NEXT: [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+  // OGCG-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+  // OGCG-NEXT: ret <2 x double> [[TMP4]]
+  return __builtin_ia32_cmpnltpd(A, B);
+}
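
For reference, a minimal standalone sketch (not part of the patch; all names below are illustrative) of why the shouldInvert trick used by emitVectorFCmp is sound: negating an ordered less-than-or-equal (OLE) comparison yields exactly the unordered-greater-than (UGT) result that __builtin_ia32_cmpnleps needs, NaN operands included.

#include <cassert>
#include <cmath>
#include <limits>

// UGT as LLVM defines it: true if the operands are unordered (either one is
// NaN) or if a > b.
static bool ugt(float a, float b) {
  return std::isnan(a) || std::isnan(b) || a > b;
}

// What emitVectorFCmp(OLE, shouldInvert = true) computes, element by element:
// an ordered a <= b, followed by a `not`.
static bool notOle(float a, float b) { return !(a <= b); }

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float samples[] = {-1.0f, -0.0f, 0.0f, 2.0f, nan};
  for (float a : samples)
    for (float b : samples)
      assert(ugt(a, b) == notOle(a, b)); // the two formulations agree
  return 0;
}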