From 633a986a1e49698e05d109d69fafaa20989aa6c9 Mon Sep 17 00:00:00 2001 From: NagrajMG Date: Mon, 29 Sep 2025 20:08:07 +0530 Subject: [PATCH 01/17] FIxes #156611: Allow PSHUFD/PSHUFLW/PSHUFW intrinsics in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 45 ++- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 245 ++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 287 +++++++++++++++++++ clang/lib/Headers/mmintrin.h | 5 + clang/test/CodeGen/X86/avx2-builtins.c | 6 +- clang/test/CodeGen/X86/avx512bw-builtins.c | 11 +- clang/test/CodeGen/X86/avx512f-builtins.c | 9 +- clang/test/CodeGen/X86/avx512vl-builtins.c | 17 ++ clang/test/CodeGen/X86/avx512vlbw-builtins.c | 50 ++++ clang/test/CodeGen/X86/mmx-builtins.c | 2 +- clang/test/CodeGen/X86/sse2-builtins.c | 6 +- 11 files changed, 663 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 77e599587edc3..e70691a30627a 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -145,6 +145,10 @@ let Features = "mmx", Header = "mmintrin.h", Attributes = [NoThrow, Const] in { def _m_prefetch : X86LibBuiltin<"void(void *)">; } +let Features = "mmx", Attributes = [NoThrow, Const, Constexpr] in { + def pshufw : X86Builtin<"_Vector<4, short>(_Vector<4, short>, _Constant int)">; +} + // PRFCHW let Features = "prfchw", Header = "intrin.h", Attributes = [NoThrow, Const] in { def _m_prefetchw : X86LibBuiltin<"void(void volatile const *)">; @@ -217,10 +221,13 @@ let Features = "sse2", Attributes = [NoThrow] in { def movnti : X86Builtin<"void(int *, int)">; } -let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">; +let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">; + def pshufd : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int)">; def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">; +} + +let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">; def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">; def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">; @@ -584,9 +591,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; - def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">; - def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; - def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; @@ -647,6 +651,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; + + def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; + def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; + def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { @@ -1990,13 +1998,13 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVect } let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; - def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; def psllw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; + def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; + def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">; } let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { @@ -2016,21 +2024,35 @@ let Features = "avx512f", let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def psrlv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; + def pshuflw512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, _Vector<32, short>, unsigned int)">; + def pshuflw512_maskz : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, unsigned int)">; + def pshufhw512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, _Vector<32, short>, unsigned int)">; + def pshufhw512_maskz : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, unsigned int)">; } let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def psrlv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def pshuflw256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, _Vector<16, short>, unsigned short)">; + def pshuflw256_maskz : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, unsigned short)">; + def pshufhw256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, _Vector<16, short>, unsigned short)">; + def pshufhw256_maskz : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, unsigned short)">; } let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def psrlv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; + def pshuflw128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, _Vector<8, short>, unsigned char)">; + def pshuflw128_maskz : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, unsigned char)">; + def pshufhw128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, _Vector<8, short>, unsigned char)">; + def pshufhw128_maskz : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, unsigned char)">; } -let Features = "avx512f", - Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def psrlwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">; def psrldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">; def psrlqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">; + def pshufd512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int, _Vector<16, int>, unsigned short)">; + def pshufd512_maskz : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int, unsigned short)">; + def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { @@ -2047,10 +2069,14 @@ let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, Req let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def psravq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; + def pshufd128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int, _Vector<4, int>, unsigned char)">; + def pshufd128_maskz : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int, unsigned char)">; } let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def psravq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">; + def pshufd256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int, _Vector<8, int>, unsigned char)">; + def pshufd256_maskz : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int, unsigned char)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { @@ -3266,7 +3292,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128> } let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">; def expanddf512_mask : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, unsigned char)">; def expanddi512_mask : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, unsigned char)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 891344d4e6ed0..e0bd5d531db34 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2862,6 +2862,218 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_pshuflw_common(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + const unsigned NumArgs = Call->getNumArgs(); + assert(NumArgs == 2 || NumArgs == 3 || NumArgs == 4); + APSInt K; + Pointer SrcPT; + const bool HasMask = (NumArgs == 3) || (NumArgs == 4); + const bool IsMaskZ = (NumArgs == 3); + if (NumArgs == 4) { + K = popToAPSInt(S, Call->getArg(3)); + SrcPT = S.Stk.pop(); + } else if (NumArgs == 3) { + K = popToAPSInt(S, Call->getArg(2)); + } + + APSInt Imm = popToAPSInt(S, Call->getArg(1)); + const Pointer &Src = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + const unsigned NumElems = Dst.getNumElems(); + const PrimType ElemT = Dst.getFieldDesc()->getPrimType(); + const unsigned ElemBits = 16; + const unsigned LaneElems = 128u / ElemBits; + const unsigned Half = 4; + assert(NumElems % LaneElems == 0 && "pshuflw expects 128-bit lanes"); + const uint8_t Ctl = static_cast(Imm.getZExtValue()); + + for (unsigned i = 0; i != NumElems; ++i) { + const unsigned laneBase = (i / LaneElems) * LaneElems; + const unsigned inLane = i % LaneElems; + + unsigned srcIdx; + if (inLane < Half) { + const unsigned pos = inLane; + const unsigned sel = (Ctl >> (2 * pos)) & 0x3; + srcIdx = laneBase + sel; + } else { + srcIdx = i; + } + + APSInt Chosen; + INT_TYPE_SWITCH(ElemT, { Chosen = Src.elem(srcIdx).toAPSInt(); }); + + if (!HasMask) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Chosen); }); + continue; + } + + const bool Keep = + (i < static_cast(K.getBitWidth())) ? K[i] : false; + + if (Keep) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Chosen); }); + } else if (IsMaskZ) { + APSInt Zero(APInt(Chosen.getBitWidth(), 0)); + Zero.setIsSigned(Chosen.isSigned()); + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Zero); }); + } else { + APSInt PT; + INT_TYPE_SWITCH(ElemT, { PT = SrcPT.elem(i).toAPSInt(); }); + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(i) = static_cast(PT); }); + } + } + + Dst.initializeAllElements(); + return true; +} + +static bool interp__builtin_ia32_pshufhw_common(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + (void)OpPC; + const unsigned NumArgs = Call->getNumArgs(); + assert(NumArgs == 2 || NumArgs == 3 || NumArgs == 4); + + APSInt K; + Pointer SrcPT; + const bool HasMask = (NumArgs == 3) || (NumArgs == 4); + const bool IsMaskZ = (NumArgs == 3); + + if (NumArgs == 4) { + K = popToAPSInt(S, Call->getArg(3)); + SrcPT = S.Stk.pop(); + } else if (NumArgs == 3) { + K = popToAPSInt(S, Call->getArg(2)); + } + + APSInt Imm = popToAPSInt(S, Call->getArg(1)); + const Pointer &Src = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + const unsigned NumElems = Dst.getNumElems(); + const PrimType ElemT = Dst.getFieldDesc()->getPrimType(); + + const unsigned ElemBits = 16; + const unsigned LaneElems = 128u / ElemBits; + const unsigned HalfBase = 4; + assert(NumElems % LaneElems == 0); + + const uint8_t Ctl = static_cast(Imm.getZExtValue()); + + for (unsigned i = 0; i != NumElems; ++i) { + const unsigned laneBase = (i / LaneElems) * LaneElems; + const unsigned inLane = i % LaneElems; + + unsigned srcIdx; + if (inLane >= HalfBase) { + const unsigned pos = inLane - HalfBase; + const unsigned sel = (Ctl >> (2 * pos)) & 0x3; + srcIdx = laneBase + HalfBase + sel; + } else { + srcIdx = i; + } + + APSInt Chosen; + INT_TYPE_SWITCH(ElemT, { Chosen = Src.elem(srcIdx).toAPSInt(); }); + + if (!HasMask) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Chosen); }); + continue; + } + + const bool Keep = + (i < static_cast(K.getBitWidth())) ? K[i] : false; + if (Keep) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Chosen); }); + } else if (IsMaskZ) { + APSInt Zero(APInt(Chosen.getBitWidth(), 0)); + Zero.setIsSigned(Chosen.isSigned()); + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Zero); }); + } else { + APSInt PT; + INT_TYPE_SWITCH(ElemT, { PT = SrcPT.elem(i).toAPSInt(); }); + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(i) = static_cast(PT); }); + } + } + + Dst.initializeAllElements(); + return true; +} + +static bool interp__builtin_ia32_pshufd_common(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + (void)OpPC; + const unsigned NumArgs = Call->getNumArgs(); + assert(NumArgs == 2 || NumArgs == 3 || NumArgs == 4); + + APSInt K; + Pointer SrcPT; + const bool HasMask = (NumArgs == 3) || (NumArgs == 4); + const bool IsMaskZ = (NumArgs == 3); + + if (NumArgs == 4) { + K = popToAPSInt(S, Call->getArg(3)); + SrcPT = S.Stk.pop(); + } else if (NumArgs == 3) { + K = popToAPSInt(S, Call->getArg(2)); + } + + APSInt Imm = popToAPSInt(S, Call->getArg(1)); + const Pointer &Src = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + const unsigned NumElems = Dst.getNumElems(); + const PrimType ElemT = Dst.getFieldDesc()->getPrimType(); + + const unsigned ElemBits = 32; + const unsigned LaneElems = 128u / ElemBits; + assert(NumElems % LaneElems == 0); + + const uint8_t Ctl = static_cast(Imm.getZExtValue()); + + for (unsigned i = 0; i != NumElems; ++i) { + const unsigned laneBase = (i / LaneElems) * LaneElems; + const unsigned inLane = i % LaneElems; + const unsigned sel = (Ctl >> (2 * inLane)) & 0x3; + const unsigned srcIdx = laneBase + sel; + + APSInt Chosen; + INT_TYPE_SWITCH(ElemT, { Chosen = Src.elem(srcIdx).toAPSInt(); }); + + if (!HasMask) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Chosen); }); + continue; + } + + const bool Keep = + (i < static_cast(K.getBitWidth())) ? K[i] : false; + if (Keep) { + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Chosen); }); + } else if (IsMaskZ) { + APSInt Zero(APInt(Chosen.getBitWidth(), 0)); + Zero.setIsSigned(Chosen.isSigned()); + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(i) = static_cast(Zero); }); + } else { + APSInt PT; + INT_TYPE_SWITCH(ElemT, { PT = SrcPT.elem(i).toAPSInt(); }); + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(i) = static_cast(PT); }); + } + } + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_elementwise_triop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref @@ -3417,6 +3629,39 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_elementwise_int_binop(S, OpPC, Call, llvm::APIntOps::mulhs); + case clang::X86::BI__builtin_ia32_pshuflw: + case clang::X86::BI__builtin_ia32_pshuflw256: + case clang::X86::BI__builtin_ia32_pshuflw512: + case clang::X86::BI__builtin_ia32_pshuflw128_mask: + case clang::X86::BI__builtin_ia32_pshuflw256_mask: + case clang::X86::BI__builtin_ia32_pshuflw512_mask: + case clang::X86::BI__builtin_ia32_pshuflw128_maskz: + case clang::X86::BI__builtin_ia32_pshuflw256_maskz: + case clang::X86::BI__builtin_ia32_pshuflw512_maskz: + return interp__builtin_ia32_pshuflw_common(S, OpPC, Call); + + case clang::X86::BI__builtin_ia32_pshufhw: + case clang::X86::BI__builtin_ia32_pshufhw256: + case clang::X86::BI__builtin_ia32_pshufhw512: + case clang::X86::BI__builtin_ia32_pshufhw128_mask: + case clang::X86::BI__builtin_ia32_pshufhw256_mask: + case clang::X86::BI__builtin_ia32_pshufhw512_mask: + case clang::X86::BI__builtin_ia32_pshufhw128_maskz: + case clang::X86::BI__builtin_ia32_pshufhw256_maskz: + case clang::X86::BI__builtin_ia32_pshufhw512_maskz: + return interp__builtin_ia32_pshufhw_common(S, OpPC, Call); + + case clang::X86::BI__builtin_ia32_pshufd: + case clang::X86::BI__builtin_ia32_pshufd256: + case clang::X86::BI__builtin_ia32_pshufd512: + case clang::X86::BI__builtin_ia32_pshufd128_mask: + case clang::X86::BI__builtin_ia32_pshufd256_mask: + case clang::X86::BI__builtin_ia32_pshufd512_mask: + case clang::X86::BI__builtin_ia32_pshufd128_maskz: + case clang::X86::BI__builtin_ia32_pshufd256_maskz: + case clang::X86::BI__builtin_ia32_pshufd512_maskz: + return interp__builtin_ia32_pshufd_common(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_psllv2di: case clang::X86::BI__builtin_ia32_psllv4di: case clang::X86::BI__builtin_ia32_psllv4si: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b706b14945b6d..1ce601d37e0d6 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11869,6 +11869,293 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_pshufw: { + APValue Src; + APSInt Imm; + if (!EvaluateAsRValue(Info, E->getArg(0), Src)) return false; + if (!EvaluateInteger(E->getArg(1), Imm, Info)) return false; + + unsigned N = Src.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(N); + + uint8_t C = static_cast(Imm.getZExtValue()); + for (unsigned i = 0; i != N; ++i) { + unsigned sel = (C >> (2 * i)) & 0x3; + ResultElements.push_back(Src.getVectorElt(sel)); + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + + case clang::X86::BI__builtin_ia32_pshuflw: + case clang::X86::BI__builtin_ia32_pshuflw256: + case clang::X86::BI__builtin_ia32_pshuflw512: + case clang::X86::BI__builtin_ia32_pshuflw128_mask: + case clang::X86::BI__builtin_ia32_pshuflw256_mask: + case clang::X86::BI__builtin_ia32_pshuflw512_mask: + case clang::X86::BI__builtin_ia32_pshuflw128_maskz: + case clang::X86::BI__builtin_ia32_pshuflw256_maskz: + case clang::X86::BI__builtin_ia32_pshuflw512_maskz: { + const unsigned BID = E->getBuiltinCallee(); + + const bool IsMask = + BID == clang::X86::BI__builtin_ia32_pshuflw128_mask || + BID == clang::X86::BI__builtin_ia32_pshuflw256_mask || + BID == clang::X86::BI__builtin_ia32_pshuflw512_mask; + + const bool IsMaskZ = + BID == clang::X86::BI__builtin_ia32_pshuflw128_maskz || + BID == clang::X86::BI__builtin_ia32_pshuflw256_maskz || + BID == clang::X86::BI__builtin_ia32_pshuflw512_maskz; + + const unsigned AIdx = 0, ImmIdx = 1; + const unsigned SrcIdx = 2; + const unsigned KIdx = IsMaskZ ? 2 : 3; + + APValue AVal, SrcVal; + APSInt Imm, K; + if (!EvaluateAsRValue(Info, E->getArg(AIdx), AVal)) return false; + if (!EvaluateInteger(E->getArg(ImmIdx), Imm, Info)) return false; + + const APSInt *KPtr = nullptr; + const APValue *PassThru = nullptr; + bool ZeroInactive = false; + + if (IsMask) { + if (!EvaluateAsRValue(Info, E->getArg(SrcIdx), SrcVal)) return false; + if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; + KPtr = &K; PassThru = &SrcVal; ZeroInactive = false; + } else if (IsMaskZ) { + if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; + KPtr = &K; PassThru = nullptr; ZeroInactive = true; + } + + const auto *VT = E->getType()->getAs(); + if (!VT) return false; + const unsigned NumElts = VT->getNumElements(); + + const unsigned ElemBits = 16; + const unsigned LaneElems = std::min(NumElts, 128u / ElemBits); + const unsigned Half = 4; + const uint8_t Ctl = static_cast(Imm.getZExtValue()); + const bool DestUnsigned = + VT->getElementType()->isUnsignedIntegerOrEnumerationType(); + + auto MakeZero = [&]() -> APValue { + return APValue(APSInt(APInt(ElemBits, 0), DestUnsigned)); + }; + + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned i = 0; i < NumElts; ++i) { + const unsigned laneBase = (i / LaneElems) * LaneElems; + const unsigned inLane = i % LaneElems; + + APValue Chosen; + if (inLane < Half) { + const unsigned pos = inLane; + const unsigned sel = (Ctl >> (2 * pos)) & 0x3; + const unsigned srcIdx = laneBase + sel; + Chosen = AVal.getVectorElt(srcIdx); + } else { + Chosen = AVal.getVectorElt(i); + } + + if (KPtr) { + const bool Keep = (i < KPtr->getBitWidth()) ? (*KPtr)[i] : false; + if (Keep) { + ResultElements.push_back(Chosen); + } else if (ZeroInactive) { + ResultElements.push_back(MakeZero()); + } else { + const APValue &PT = PassThru ? PassThru->getVectorElt(i) + : AVal.getVectorElt(i); + ResultElements.push_back(PT); + } + } else { + ResultElements.push_back(Chosen); + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + + case clang::X86::BI__builtin_ia32_pshufhw: + case clang::X86::BI__builtin_ia32_pshufhw256: + case clang::X86::BI__builtin_ia32_pshufhw512: + case clang::X86::BI__builtin_ia32_pshufhw128_mask: + case clang::X86::BI__builtin_ia32_pshufhw256_mask: + case clang::X86::BI__builtin_ia32_pshufhw512_mask: + case clang::X86::BI__builtin_ia32_pshufhw128_maskz: + case clang::X86::BI__builtin_ia32_pshufhw256_maskz: + case clang::X86::BI__builtin_ia32_pshufhw512_maskz: { + const unsigned BID = E->getBuiltinCallee(); + + const bool IsMask = + BID == clang::X86::BI__builtin_ia32_pshufhw128_mask || + BID == clang::X86::BI__builtin_ia32_pshufhw256_mask || + BID == clang::X86::BI__builtin_ia32_pshufhw512_mask; + + const bool IsMaskZ = + BID == clang::X86::BI__builtin_ia32_pshufhw128_maskz || + BID == clang::X86::BI__builtin_ia32_pshufhw256_maskz || + BID == clang::X86::BI__builtin_ia32_pshufhw512_maskz; + + const unsigned AIdx = 0, ImmIdx = 1; + const unsigned SrcIdx = 2; + const unsigned KIdx = IsMaskZ ? 2 : 3; + + APValue AVal, SrcVal; + APSInt Imm, K; + if (!EvaluateAsRValue(Info, E->getArg(AIdx), AVal)) return false; + if (!EvaluateInteger(E->getArg(ImmIdx), Imm, Info)) return false; + + const APSInt *KPtr = nullptr; + const APValue *PassThru = nullptr; + bool ZeroInactive = false; + if (IsMask) { + if (!EvaluateAsRValue(Info, E->getArg(SrcIdx), SrcVal)) return false; + if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; + KPtr = &K; PassThru = &SrcVal; ZeroInactive = false; + } else if (IsMaskZ) { + if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; + KPtr = &K; PassThru = nullptr; ZeroInactive = true; + } + + const auto *VT = E->getType()->getAs(); + if (!VT) return false; + const unsigned NumElts = VT->getNumElements(); + const unsigned ElemBits = 16; + const unsigned LaneElems = std::min(NumElts, 128u / ElemBits); + const unsigned Half = 4; + const uint8_t Ctl = static_cast(Imm.getZExtValue()); + const bool DestUnsigned = + VT->getElementType()->isUnsignedIntegerOrEnumerationType(); + + auto MakeZero = [&]() -> APValue { + return APValue(APSInt(APInt(ElemBits, 0), DestUnsigned)); + }; + + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned i = 0; i < NumElts; ++i) { + const unsigned laneBase = (i / LaneElems) * LaneElems; + const unsigned inLane = i % LaneElems; + + APValue Chosen; + if (inLane >= Half) { + const unsigned pos = inLane - Half; + const unsigned sel = (Ctl >> (2 * pos)) & 0x3; + const unsigned srcIdx = laneBase + Half + sel; + Chosen = AVal.getVectorElt(srcIdx); + } else { + Chosen = AVal.getVectorElt(i); + } + + if (KPtr) { + const bool Keep = (i < KPtr->getBitWidth()) ? (*KPtr)[i] : false; + if (Keep) { + ResultElements.push_back(Chosen); + } else if (ZeroInactive) { + ResultElements.push_back(MakeZero()); + } else { + const APValue &PT = PassThru ? PassThru->getVectorElt(i) + : AVal.getVectorElt(i); + ResultElements.push_back(PT); + } + } else { + ResultElements.push_back(Chosen); + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + + case clang::X86::BI__builtin_ia32_pshufd: + case clang::X86::BI__builtin_ia32_pshufd256: + case clang::X86::BI__builtin_ia32_pshufd512: + case clang::X86::BI__builtin_ia32_pshufd128_mask: + case clang::X86::BI__builtin_ia32_pshufd256_mask: + case clang::X86::BI__builtin_ia32_pshufd512_mask: + case clang::X86::BI__builtin_ia32_pshufd128_maskz: + case clang::X86::BI__builtin_ia32_pshufd256_maskz: + case clang::X86::BI__builtin_ia32_pshufd512_maskz: { + const unsigned BID = E->getBuiltinCallee(); + + const bool IsMask = + BID == clang::X86::BI__builtin_ia32_pshufd512_mask || + BID == clang::X86::BI__builtin_ia32_pshufd128_mask || + BID == clang::X86::BI__builtin_ia32_pshufd256_mask; + + const bool IsMaskZ = + BID == clang::X86::BI__builtin_ia32_pshufd512_maskz || + BID == clang::X86::BI__builtin_ia32_pshufd128_maskz || + BID == clang::X86::BI__builtin_ia32_pshufd256_maskz; + + const unsigned AIdx = 0, ImmIdx = 1; + const unsigned SrcIdx = 2; + const unsigned KIdx = IsMaskZ ? 2 : 3; + + APValue AVal, SrcVal; + APSInt Imm, K; + if (!EvaluateAsRValue(Info, E->getArg(AIdx), AVal)) return false; + if (!EvaluateInteger(E->getArg(ImmIdx), Imm, Info)) return false; + + const APSInt *KPtr = nullptr; + const APValue *PassThru = nullptr; + bool ZeroInactive = false; + if (IsMask) { + if (!EvaluateAsRValue(Info, E->getArg(SrcIdx), SrcVal)) return false; + if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; + KPtr = &K; PassThru = &SrcVal; ZeroInactive = false; + } else if (IsMaskZ) { + if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; + KPtr = &K; PassThru = nullptr; ZeroInactive = true; + } + + const auto *VT = E->getType()->getAs(); + if (!VT) return false; + const unsigned NumElts = VT->getNumElements(); + const unsigned ElemBits = 32; + const unsigned LaneElems = std::min(NumElts, 128u / ElemBits); + const uint8_t Ctl = static_cast(Imm.getZExtValue()); + const bool DestUnsigned = + VT->getElementType()->isUnsignedIntegerOrEnumerationType(); + + auto MakeZero = [&]() -> APValue { + return APValue(APSInt(APInt(ElemBits, 0), DestUnsigned)); + }; + + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned i = 0; i < NumElts; ++i) { + const unsigned laneBase = (i / LaneElems) * LaneElems; + const unsigned inLane = i % LaneElems; + + const unsigned pos = inLane & 3; + const unsigned sel = (Ctl >> (2 * pos)) & 0x3; + const unsigned srcIdx = laneBase + sel; + APValue Chosen = AVal.getVectorElt(srcIdx); + + if (KPtr) { + const bool Keep = (i < KPtr->getBitWidth()) ? (*KPtr)[i] : false; + if (Keep) { + ResultElements.push_back(Chosen); + } else if (ZeroInactive) { + ResultElements.push_back(MakeZero()); + } else { + const APValue &PT = PassThru ? PassThru->getVectorElt(i) + : AVal.getVectorElt(i); + ResultElements.push_back(PT); + } + } else { + ResultElements.push_back(Chosen); + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: case clang::X86::BI__builtin_ia32_vprotqi: diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 5f617530b6f78..01b5cea02cb1c 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -39,14 +39,19 @@ typedef short __v8hi __attribute__((__vector_size__(16))); typedef char __v16qi __attribute__((__vector_size__(16))); /* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS_MMX \ + __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) + #define __DEFAULT_FN_ATTRS_SSE2 \ __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ __min_vector_width__(128))) #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr +#define __DEFAULT_FN_ATTRS_MMX_CONSTEXPR __DEFAULT_FN_ATTRS_MMX constexpr #else #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 +#define __DEFAULT_FN_ATTRS_MMX_CONSTEXPR __DEFAULT_FN_ATTRS_MMX #endif #define __trunc64(x) \ diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index eff2797e87c75..4299b18243f21 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1109,19 +1109,19 @@ __m256i test_mm256_shuffle_epi32(__m256i a) { // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> return _mm256_shuffle_epi32(a, 15); } - +TEST_CONSTEXPR(match_v8si(_mm256_shuffle_epi32((((__m256i)(__v8si){0,1,2,3,4,5,6,7})), 15), 3,3,0,0, 7,7,4,4)); __m256i test_mm256_shufflehi_epi16(__m256i a) { // CHECK-LABEL: test_mm256_shufflehi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> return _mm256_shufflehi_epi16(a, 107); } - +TEST_CONSTEXPR(match_v16hi(_mm256_shufflehi_epi16((((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})), 107), 0,1,2,3, 7,6,6,5, 8,9,10,11, 15,14,14,13)); __m256i test_mm256_shufflelo_epi16(__m256i a) { // CHECK-LABEL: test_mm256_shufflelo_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> return _mm256_shufflelo_epi16(a, 83); } - +TEST_CONSTEXPR(match_v16hi(_mm256_shufflelo_epi16(((__m256i)(__v16hi){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 83), 3,0,1,1, 4,5,6,7, 11,8,9,9, 12,13,14,15) ); __m256i test_mm256_sign_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_sign_epi8 // CHECK: call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 3f42ac0268978..bd19363c8d948 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1876,13 +1876,15 @@ __m512i test_mm512_shufflehi_epi16(__m512i __A) { // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> return _mm512_shufflehi_epi16(__A, 5); } - +TEST_CONSTEXPR(match_v32hi(_mm512_shufflehi_epi16((((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31})), 5), 0,1,2,3, 5,5,4,4, 8,9,10,11, 13,13,12,12, 16,17,18,19, 21,21,20,20, 24,25,26,27, 29,29,28,28)); __m512i test_mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_shufflehi_epi16 // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_shufflehi_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_shufflehi_epi16((((__m512i)(__v32hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131})), 0xFFFF0000u, (((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31})), 5), 100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115, 16,17,18,19,21,21,20,20, 24,25,26,27,29,29,28,28)); +TEST_CONSTEXPR(match_v32hi(_mm512_mask_shufflehi_epi16(((__m512i)(__v32hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131}), 0x0000FFFFu, ((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}), 5), 0,1,2,3,5,5,4,4, 8,9,10,11,13,13,12,12, 116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131)); __m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_shufflehi_epi16 @@ -1890,12 +1892,15 @@ __m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_shufflehi_epi16(__U, __A, 5); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_shufflehi_epi16(0xAAAAAAAAu, (((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31})), 5), 0,1,0,3,0,5,0,4, 0,9,0,11,0,13,0,12, 0,17,0,19,0,21,0,20, 0,25,0,27,0,29,0,28)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_shufflehi_epi16(0x0000FFFFu, ((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}), 5), 0,1,2,3,5,5,4,4, 8,9,10,11,13,13,12,12, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)); __m512i test_mm512_shufflelo_epi16(__m512i __A) { // CHECK-LABEL: test_mm512_shufflelo_epi16 // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> poison, <32 x i32> return _mm512_shufflelo_epi16(__A, 5); } +TEST_CONSTEXPR( match_v32hi(_mm512_shufflelo_epi16(((__m512i)(__v32hi){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15, 16,17,18,19, 20,21,22,23, 24,25,26,27, 28,29,30,31}), 5), 1,1,0,0, 4,5,6,7, 9,9,8,8, 12,13,14,15, 17,17,16,16, 20,21,22,23, 25,25,24,24, 28,29,30,31)); __m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_shufflelo_epi16 @@ -1903,6 +1908,8 @@ __m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_shufflelo_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_shufflelo_epi16((((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31})), 0xFFFFFFFF, (((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31})), 5), 1,1,0,0, 4,5,6,7, 9,9,8,8, 12,13,14,15, 17,17,16,16, 20,21,22,23, 25,25,24,24, 28,29,30,31)); +TEST_CONSTEXPR(match_v32hi(_mm512_mask_shufflelo_epi16(((__m512i)(__v32hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131}), 0x0000FFFFu, ((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}), 5), 1,1,0,0,4,5,6,7, 9,9,8,8,12,13,14,15, 116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131)); __m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_shufflelo_epi16 @@ -1910,6 +1917,8 @@ __m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_shufflelo_epi16(__U, __A, 5); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_shufflelo_epi16(0xFFFFFFFF, (((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31})), 5), 1,1,0,0, 4,5,6,7, 9,9,8,8, 12,13,14,15, 17,17,16,16, 20,21,22,23, 25,25,24,24, 28,29,30,31)); +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_shufflelo_epi16(0x0000FFFFu, ((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}), 5), 1,1,0,0,4,5,6,7, 9,9,8,8,12,13,14,15, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)); __m512i test_mm512_sllv_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_sllv_epi16 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 84eaad8d99e61..47cb485a84210 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -9073,20 +9073,25 @@ __m512i test_mm512_shuffle_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <16 x i32> return _mm512_shuffle_epi32(__A, 1); } - +TEST_CONSTEXPR(match_v16si(_mm512_shuffle_epi32((((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})), 1), 1,0,0,0, 5,4,4,4, 9,8,8,8, 13,12,12,12)); __m512i test_mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_shuffle_epi32 // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_shuffle_epi32(__W, __U, __A, 1); } - +TEST_CONSTEXPR(match_v16si(_mm512_mask_shuffle_epi32(((__m512i)(__v16si){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}), 0xFFFFu, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 1), 1,0,0,0, 5,4,4,4, 9,8,8,8, 13,12,12,12)); +TEST_CONSTEXPR(match_v16si(_mm512_mask_shuffle_epi32(((__m512i)(__v16si){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}), 0x0000u, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 1), 100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207)); +TEST_CONSTEXPR(match_v16si(_mm512_mask_shuffle_epi32(((__m512i)(__v16si){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}), 0x00FFu, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 1), 1,0,0,0, 5,4,4,4, 200,201,202,203,204,205,206,207)); __m512i test_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_shuffle_epi32 // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_shuffle_epi32(__U, __A, 1); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_shuffle_epi32(0xFFFFu, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 1), 1,0,0,0, 5,4,4,4, 9,8,8,8, 13,12,12,12)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_shuffle_epi32(0x5555u, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 1), 1,0,0,0, 5,0,4,0, 9,0,8,0, 13,0,12,0)); +TEST_CONSTEXPR(match_v16si(_mm512_maskz_shuffle_epi32(0x8001u, ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 1), 1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,12)); __m512d test_mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_expand_pd diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 5282c7ab06dea..88006232c5c99 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -10025,6 +10025,11 @@ __m128i test_mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return _mm_mask_shuffle_epi32(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_v4si(_mm_mask_shuffle_epi32(((__m128i)(__v4si){100,101,102,103}), 0x0Fu, ((__m128i)(__v4si){0,1,2,3}), 1), 1,0,0,0)); +TEST_CONSTEXPR(match_v4si(_mm_mask_shuffle_epi32(((__m128i)(__v4si){100,101,102,103}), 0x0Au, ((__m128i)(__v4si){0,1,2,3}), 1), 100,0,102,0)); +TEST_CONSTEXPR(match_v4si(_mm_mask_shuffle_epi32(((__m128i)(__v4si){100,101,102,103}), 0x05u, ((__m128i)(__v4si){0,1,2,3}), 1), 1,101,0,103)); +TEST_CONSTEXPR(match_v4si(_mm_mask_shuffle_epi32(((__m128i)(__v4si){100,101,102,103}), 0x00u, ((__m128i)(__v4si){0,1,2,3}), 1), 100,101,102,103)); + __m128i test_mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_shuffle_epi32 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> @@ -10032,6 +10037,10 @@ __m128i test_mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A) { return _mm_maskz_shuffle_epi32(__U, __A, 2); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_shuffle_epi32(0x01u, ((__m128i)(__v4si){0,1,2,3}), 2), 2,0,0,0)); +TEST_CONSTEXPR(match_v4si(_mm_maskz_shuffle_epi32(0x0Au, ((__m128i)(__v4si){0,1,2,3}), 2), 0,0,0,0)); +TEST_CONSTEXPR(match_v4si(_mm_maskz_shuffle_epi32(0x0Fu, ((__m128i)(__v4si){0,1,2,3}), 2), 2,0,0,0)); + __m256i test_mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_shuffle_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> @@ -10039,6 +10048,10 @@ __m256i test_mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A) { return _mm256_mask_shuffle_epi32(__W, __U, __A, 2); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_shuffle_epi32(((__m256i)(__v8si){100,101,102,103,104,105,106,107}), 0xF0u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 100,101,102,103, 6,4,4,4)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_shuffle_epi32(((__m256i)(__v8si){100,101,102,103,104,105,106,107}), 0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,102,103, 6,4,106,107)); +TEST_CONSTEXPR(match_v8si(_mm256_mask_shuffle_epi32(((__m256i)(__v8si){100,101,102,103,104,105,106,107}), 0x00u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 100,101,102,103,104,105,106,107)); + __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_shuffle_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> @@ -10046,6 +10059,10 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) { return _mm256_maskz_shuffle_epi32(__U, __A, 2); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4)); +TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4)); + __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_mov_pd // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 6c9c80efcef9d..1fe1ec08ede88 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -3393,6 +3393,13 @@ __m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return _mm_mask_shufflehi_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflehi_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0xF0u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),100,101,102,103,5,5,4,4)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflehi_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0x00u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),100,101,102,103,104,105,106,107)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflehi_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0xFFu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,1,2,3,5,5,4,4)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflehi_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0x0Fu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,1,2,3,104,105,106,107)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflehi_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0x55u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,101,2,103,5,105,4,107)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflehi_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0xAAu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),100,1,102,3,104,5,106,4)); + __m128i test_mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_shufflehi_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> @@ -3400,6 +3407,13 @@ __m128i test_mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A) { return _mm_maskz_shufflehi_epi16(__U, __A, 5); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflehi_epi16(0xF0u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,0,0,0,5,5,4,4)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflehi_epi16(0x00u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,0,0,0,0,0,0,0)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflehi_epi16(0xFFu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,1,2,3,5,5,4,4)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflehi_epi16(0x0Fu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,1,2,3,0,0,0,0)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflehi_epi16(0x55u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,0,2,0,5,0,4,0)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflehi_epi16(0xAAu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,1,0,3,0,5,0,4)); + __m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_shufflelo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> @@ -3407,6 +3421,13 @@ __m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return _mm_mask_shufflelo_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflelo_epi16(((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),0xFF,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),1,1,0,0,4,5,6,7)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflelo_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0x00u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),100,101,102,103,104,105,106,107)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflelo_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0x0Fu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),1,1,0,0,104,105,106,107)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflelo_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0xF0u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),100,101,102,103,4,5,6,7)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflelo_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0xAAu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),100,1,102,0,104,5,106,7)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_shufflelo_epi16(((__m128i)(__v8hi){100,101,102,103,104,105,106,107}),0x55u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),1,101,0,103,4,105,6,107)); + __m128i test_mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_shufflelo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> @@ -3414,6 +3435,12 @@ __m128i test_mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A) { return _mm_maskz_shufflelo_epi16(__U, __A, 5); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflelo_epi16(0xFF,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),1,1,0,0,4,5,6,7)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflelo_epi16(0x0Fu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),1,1,0,0,0,0,0,0)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflelo_epi16(0xF0u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,0,0,0,4,5,6,7)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflelo_epi16(0xAAu,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),0,1,0,0,0,5,0,7)); +TEST_CONSTEXPR(match_v8hi(_mm_maskz_shufflelo_epi16(0x55u,((__m128i)(__v8hi){0,1,2,3,4,5,6,7}),5),1,0,0,0,4,0,6,0)); + __m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_shufflehi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> @@ -3421,6 +3448,12 @@ __m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A) return _mm256_mask_shufflehi_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflehi_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),0xFF00u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),100,101,102,103,104,105,106,107,8,9,10,11,13,13,12,12)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflehi_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),0x0000u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflehi_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),0xFFFFu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflehi_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),0x00FFu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,1,2,3,5,5,4,4,108,109,110,111,112,113,114,115)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflehi_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),0x5555u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,101,2,103,5,105,4,107,8,109,10,111,13,113,12,115)); + __m256i test_mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_shufflehi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> @@ -3428,6 +3461,13 @@ __m256i test_mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A) { return _mm256_maskz_shufflehi_epi16(__U, __A, 5); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflehi_epi16(0x0000u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflehi_epi16(0xFFFFu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflehi_epi16(0x00FFu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,1,2,3,5,5,4,4,0,0,0,0,0,0,0,0)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflehi_epi16(0xFF00u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,0,0,0,0,0,0,0,8,9,10,11,13,13,12,12)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflehi_epi16(0x5555u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,0,2,0,5,0,4,0,8,0,10,0,13,0,12,0)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflehi_epi16(0xAAAAu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,1,0,3,0,5,0,4,0,9,0,11,0,13,0,12)); + __m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_shufflelo_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> @@ -3435,6 +3475,11 @@ __m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A) return _mm256_mask_shufflelo_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflelo_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}),0xFFFF,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflelo_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}),0x000Fu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,104,105,106,107,200,201,202,203,204,205,206,207)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflelo_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}),0x00FFu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,4,5,6,7,200,201,202,203,204,205,206,207)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_shufflelo_epi16(((__m256i)(__v16hi){100,101,102,103,104,105,106,107,200,201,202,203,204,205,206,207}),0xF00Fu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,104,105,106,107,200,201,202,203,12,13,14,15)); + __m256i test_mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_shufflelo_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> @@ -3442,6 +3487,11 @@ __m256i test_mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A) { return _mm256_maskz_shufflelo_epi16(__U, __A, 5); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflelo_epi16(0xFFFF,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflelo_epi16(0x000Fu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflelo_epi16(0x00FFu,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),1,1,0,0,4,5,6,7,0,0,0,0,0,0,0,0)); +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_shufflelo_epi16(0xF0F0u,((__m256i)(__v16hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),5),0,0,0,0,4,5,6,7,0,0,0,0,12,13,14,15)); + void test_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtepi16_storeu_epi8 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 26c5f7315457e..5156d070bcde7 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -584,7 +584,7 @@ __m64 test_mm_shuffle_pi16(__m64 a) { // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_shuffle_pi16(a, 3); } - +TEST_CONSTEXPR(match_v4hi(_mm_shuffle_pi16(((__m64)(__v4hi){0,1,2,3}), 3), 3,0,0,0)); __m64 test_mm_sign_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi8 // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128( diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index 84b90c09444c2..83cb4a63f4e3f 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -1299,7 +1299,7 @@ __m128i test_mm_shuffle_epi32(__m128i A) { // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer return _mm_shuffle_epi32(A, 0); } - +TEST_CONSTEXPR(match_v4si(_mm_shuffle_epi32(((__m128i)(__v4si){0,1,2,3}), 0), 0,0,0,0)); __m128d test_mm_shuffle_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_shuffle_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> @@ -1311,13 +1311,13 @@ __m128i test_mm_shufflehi_epi16(__m128i A) { // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> return _mm_shufflehi_epi16(A, 0); } - +TEST_CONSTEXPR(match_v8hi(_mm_shufflehi_epi16(((__m128i)(__v8hi){0,1,2,3,4,5,6,7}), 0), 0,1,2,3, 4,4,4,4)); __m128i test_mm_shufflelo_epi16(__m128i A) { // CHECK-LABEL: test_mm_shufflelo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> return _mm_shufflelo_epi16(A, 0); } - +TEST_CONSTEXPR(match_v8hi(_mm_shufflelo_epi16(((__m128i)(__v8hi){0,1,2,3,4,5,6,7}), 0), 0,0,0,0, 4,5,6,7)); __m128i test_mm_sll_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_sll_epi16 // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) From de765b0958a52f76b3405aafedc34374af771ea4 Mon Sep 17 00:00:00 2001 From: NagrajMG Date: Tue, 30 Sep 2025 00:49:21 +0530 Subject: [PATCH 02/17] [X86] Allow PSHUFD/PSHUFLW/PSHUFW intrinsics in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 20 +- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 293 ++++------------- clang/lib/AST/ExprConstant.cpp | 394 ++++++----------------- clang/lib/Headers/mmintrin.h | 5 - 4 files changed, 171 insertions(+), 541 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index e70691a30627a..b320842c5486e 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1025,6 +1025,7 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pmuldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">; def pmuludq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">; + def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">; } let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<512>] in { @@ -2024,35 +2025,20 @@ let Features = "avx512f", let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def psrlv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; - def pshuflw512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, _Vector<32, short>, unsigned int)">; - def pshuflw512_maskz : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, unsigned int)">; - def pshufhw512_mask : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, _Vector<32, short>, unsigned int)">; - def pshufhw512_maskz : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int, unsigned int)">; } let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def psrlv16hi : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def pshuflw256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, _Vector<16, short>, unsigned short)">; - def pshuflw256_maskz : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, unsigned short)">; - def pshufhw256_mask : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, _Vector<16, short>, unsigned short)">; - def pshufhw256_maskz : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int, unsigned short)">; } let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def psrlv8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; - def pshuflw128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, _Vector<8, short>, unsigned char)">; - def pshuflw128_maskz : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, unsigned char)">; - def pshufhw128_mask : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, _Vector<8, short>, unsigned char)">; - def pshufhw128_maskz : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int, unsigned char)">; } let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def psrlwi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, int)">; def psrldi512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, int)">; def psrlqi512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, int)">; - def pshufd512_mask : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int, _Vector<16, int>, unsigned short)">; - def pshufd512_maskz : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int, unsigned short)">; - def pshufd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Constant int)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { @@ -2069,14 +2055,10 @@ let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, Req let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def psravq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; - def pshufd128_mask : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int, _Vector<4, int>, unsigned char)">; - def pshufd128_maskz : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Constant int, unsigned char)">; } let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def psravq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">; - def pshufd256_mask : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int, _Vector<8, int>, unsigned char)">; - def pshufd256_maskz : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int, unsigned char)">; } let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index e0bd5d531db34..e7ec8beb2ba81 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2862,214 +2862,64 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_pshuflw_common(InterpState &S, CodePtr OpPC, - const CallExpr *Call) { - const unsigned NumArgs = Call->getNumArgs(); - assert(NumArgs == 2 || NumArgs == 3 || NumArgs == 4); - APSInt K; - Pointer SrcPT; - const bool HasMask = (NumArgs == 3) || (NumArgs == 4); - const bool IsMaskZ = (NumArgs == 3); - if (NumArgs == 4) { - K = popToAPSInt(S, Call->getArg(3)); - SrcPT = S.Stk.pop(); - } else if (NumArgs == 3) { - K = popToAPSInt(S, Call->getArg(2)); - } - - APSInt Imm = popToAPSInt(S, Call->getArg(1)); - const Pointer &Src = S.Stk.pop(); - const Pointer &Dst = S.Stk.peek(); - const unsigned NumElems = Dst.getNumElems(); - const PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - const unsigned ElemBits = 16; - const unsigned LaneElems = 128u / ElemBits; - const unsigned Half = 4; - assert(NumElems % LaneElems == 0 && "pshuflw expects 128-bit lanes"); - const uint8_t Ctl = static_cast(Imm.getZExtValue()); - - for (unsigned i = 0; i != NumElems; ++i) { - const unsigned laneBase = (i / LaneElems) * LaneElems; - const unsigned inLane = i % LaneElems; - - unsigned srcIdx; - if (inLane < Half) { - const unsigned pos = inLane; - const unsigned sel = (Ctl >> (2 * pos)) & 0x3; - srcIdx = laneBase + sel; - } else { - srcIdx = i; - } - - APSInt Chosen; - INT_TYPE_SWITCH(ElemT, { Chosen = Src.elem(srcIdx).toAPSInt(); }); - - if (!HasMask) { - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Chosen); }); - continue; - } - - const bool Keep = - (i < static_cast(K.getBitWidth())) ? K[i] : false; - - if (Keep) { - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Chosen); }); - } else if (IsMaskZ) { - APSInt Zero(APInt(Chosen.getBitWidth(), 0)); - Zero.setIsSigned(Chosen.isSigned()); - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Zero); }); - } else { - APSInt PT; - INT_TYPE_SWITCH(ElemT, { PT = SrcPT.elem(i).toAPSInt(); }); - INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(i) = static_cast(PT); }); - } - } +enum class Half { None, Low, High }; - Dst.initializeAllElements(); - return true; -} - -static bool interp__builtin_ia32_pshufhw_common(InterpState &S, CodePtr OpPC, - const CallExpr *Call) { - (void)OpPC; - const unsigned NumArgs = Call->getNumArgs(); - assert(NumArgs == 2 || NumArgs == 3 || NumArgs == 4); - - APSInt K; - Pointer SrcPT; - const bool HasMask = (NumArgs == 3) || (NumArgs == 4); - const bool IsMaskZ = (NumArgs == 3); - - if (NumArgs == 4) { - K = popToAPSInt(S, Call->getArg(3)); - SrcPT = S.Stk.pop(); - } else if (NumArgs == 3) { - K = popToAPSInt(S, Call->getArg(2)); - } - - APSInt Imm = popToAPSInt(S, Call->getArg(1)); - const Pointer &Src = S.Stk.pop(); +static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, const CallExpr *Call, + Half whichHalf) { + assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); + APSInt controlImm = popToAPSInt(S, Call->getArg(1)); + const Pointer &src = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); - const unsigned NumElems = Dst.getNumElems(); - const PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - - const unsigned ElemBits = 16; - const unsigned LaneElems = 128u / ElemBits; - const unsigned HalfBase = 4; - assert(NumElems % LaneElems == 0); + const unsigned numElts = Dst.getNumElems(); + const PrimType elemTy = Dst.getFieldDesc()->getPrimType(); - const uint8_t Ctl = static_cast(Imm.getZExtValue()); + // Only i16/i32 supported + const unsigned elemBits = static_cast(primSize(elemTy) * 8); + if (elemBits != 16 && elemBits != 32) return false; - for (unsigned i = 0; i != NumElems; ++i) { - const unsigned laneBase = (i / LaneElems) * LaneElems; - const unsigned inLane = i % LaneElems; + // Lane: 64b for MMX, 128b otherwise + const unsigned totalBits = numElts * elemBits; + const unsigned laneBits = (totalBits == 64) ? 64u : 128u; + const unsigned laneElts = laneBits / elemBits; + assert(laneElts && (numElts % laneElts == 0)); - unsigned srcIdx; - if (inLane >= HalfBase) { - const unsigned pos = inLane - HalfBase; - const unsigned sel = (Ctl >> (2 * pos)) & 0x3; - srcIdx = laneBase + HalfBase + sel; - } else { - srcIdx = i; - } + const uint8_t ctl = static_cast(controlImm.getZExtValue()); - APSInt Chosen; - INT_TYPE_SWITCH(ElemT, { Chosen = Src.elem(srcIdx).toAPSInt(); }); + for (unsigned idx = 0; idx != numElts; idx++) { + const unsigned laneBase = (idx / laneElts) * laneElts; + const unsigned laneIdx = idx % laneElts; - if (!HasMask) { - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Chosen); }); - continue; - } + unsigned srcIdx = idx; - const bool Keep = - (i < static_cast(K.getBitWidth())) ? K[i] : false; - if (Keep) { - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Chosen); }); - } else if (IsMaskZ) { - APSInt Zero(APInt(Chosen.getBitWidth(), 0)); - Zero.setIsSigned(Chosen.isSigned()); - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Zero); }); - } else { - APSInt PT; - INT_TYPE_SWITCH(ElemT, { PT = SrcPT.elem(i).toAPSInt(); }); - INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(i) = static_cast(PT); }); - } - } - - Dst.initializeAllElements(); - return true; -} - -static bool interp__builtin_ia32_pshufd_common(InterpState &S, CodePtr OpPC, - const CallExpr *Call) { - (void)OpPC; - const unsigned NumArgs = Call->getNumArgs(); - assert(NumArgs == 2 || NumArgs == 3 || NumArgs == 4); - - APSInt K; - Pointer SrcPT; - const bool HasMask = (NumArgs == 3) || (NumArgs == 4); - const bool IsMaskZ = (NumArgs == 3); - - if (NumArgs == 4) { - K = popToAPSInt(S, Call->getArg(3)); - SrcPT = S.Stk.pop(); - } else if (NumArgs == 3) { - K = popToAPSInt(S, Call->getArg(2)); - } - - APSInt Imm = popToAPSInt(S, Call->getArg(1)); - const Pointer &Src = S.Stk.pop(); - const Pointer &Dst = S.Stk.peek(); - - const unsigned NumElems = Dst.getNumElems(); - const PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - - const unsigned ElemBits = 32; - const unsigned LaneElems = 128u / ElemBits; - assert(NumElems % LaneElems == 0); - - const uint8_t Ctl = static_cast(Imm.getZExtValue()); - - for (unsigned i = 0; i != NumElems; ++i) { - const unsigned laneBase = (i / LaneElems) * LaneElems; - const unsigned inLane = i % LaneElems; - const unsigned sel = (Ctl >> (2 * inLane)) & 0x3; - const unsigned srcIdx = laneBase + sel; - - APSInt Chosen; - INT_TYPE_SWITCH(ElemT, { Chosen = Src.elem(srcIdx).toAPSInt(); }); - - if (!HasMask) { - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Chosen); }); - continue; + if (elemBits == 32) { + // PSHUFD: 4×i32 per lane + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } else { // 16-bit shuffles + if (laneElts == 4) { + // MMX: permute all 4×i16 + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } else { + // 128b lanes: shuffle 4×i16 half + constexpr unsigned halfSize = 4; + if (whichHalf == Half::Low && laneIdx < halfSize) { + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } else if (whichHalf == Half::High && laneIdx >= halfSize) { + const unsigned rel = laneIdx - halfSize; + const unsigned sel = (ctl >> (2 * rel)) & 0x3; + srcIdx = laneBase + halfSize + sel; + } else if (whichHalf == Half::None) { + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } + } } - const bool Keep = - (i < static_cast(K.getBitWidth())) ? K[i] : false; - if (Keep) { - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Chosen); }); - } else if (IsMaskZ) { - APSInt Zero(APInt(Chosen.getBitWidth(), 0)); - Zero.setIsSigned(Chosen.isSigned()); - INT_TYPE_SWITCH_NO_BOOL(ElemT, - { Dst.elem(i) = static_cast(Zero); }); - } else { - APSInt PT; - INT_TYPE_SWITCH(ElemT, { PT = SrcPT.elem(i).toAPSInt(); }); - INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(i) = static_cast(PT); }); - } + INT_TYPE_SWITCH_NO_BOOL(elemTy, { Dst.elem(idx) = src.elem(srcIdx); }); } - Dst.initializeAllElements(); return true; } @@ -3629,39 +3479,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_elementwise_int_binop(S, OpPC, Call, llvm::APIntOps::mulhs); - case clang::X86::BI__builtin_ia32_pshuflw: - case clang::X86::BI__builtin_ia32_pshuflw256: - case clang::X86::BI__builtin_ia32_pshuflw512: - case clang::X86::BI__builtin_ia32_pshuflw128_mask: - case clang::X86::BI__builtin_ia32_pshuflw256_mask: - case clang::X86::BI__builtin_ia32_pshuflw512_mask: - case clang::X86::BI__builtin_ia32_pshuflw128_maskz: - case clang::X86::BI__builtin_ia32_pshuflw256_maskz: - case clang::X86::BI__builtin_ia32_pshuflw512_maskz: - return interp__builtin_ia32_pshuflw_common(S, OpPC, Call); - - case clang::X86::BI__builtin_ia32_pshufhw: - case clang::X86::BI__builtin_ia32_pshufhw256: - case clang::X86::BI__builtin_ia32_pshufhw512: - case clang::X86::BI__builtin_ia32_pshufhw128_mask: - case clang::X86::BI__builtin_ia32_pshufhw256_mask: - case clang::X86::BI__builtin_ia32_pshufhw512_mask: - case clang::X86::BI__builtin_ia32_pshufhw128_maskz: - case clang::X86::BI__builtin_ia32_pshufhw256_maskz: - case clang::X86::BI__builtin_ia32_pshufhw512_maskz: - return interp__builtin_ia32_pshufhw_common(S, OpPC, Call); - - case clang::X86::BI__builtin_ia32_pshufd: - case clang::X86::BI__builtin_ia32_pshufd256: - case clang::X86::BI__builtin_ia32_pshufd512: - case clang::X86::BI__builtin_ia32_pshufd128_mask: - case clang::X86::BI__builtin_ia32_pshufd256_mask: - case clang::X86::BI__builtin_ia32_pshufd512_mask: - case clang::X86::BI__builtin_ia32_pshufd128_maskz: - case clang::X86::BI__builtin_ia32_pshufd256_maskz: - case clang::X86::BI__builtin_ia32_pshufd512_maskz: - return interp__builtin_ia32_pshufd_common(S, OpPC, Call); - case clang::X86::BI__builtin_ia32_psllv2di: case clang::X86::BI__builtin_ia32_psllv4di: case clang::X86::BI__builtin_ia32_psllv4si: @@ -3892,6 +3709,24 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); + case X86::BI__builtin_ia32_pshufw: + return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::None); + + case X86::BI__builtin_ia32_pshuflw: + case X86::BI__builtin_ia32_pshuflw256: + case X86::BI__builtin_ia32_pshuflw512: + return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::Low); + + case X86::BI__builtin_ia32_pshufhw: + case X86::BI__builtin_ia32_pshufhw256: + case X86::BI__builtin_ia32_pshufhw512: + return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::High); + + case X86::BI__builtin_ia32_pshufd: + case X86::BI__builtin_ia32_pshufd256: + case X86::BI__builtin_ia32_pshufd512: + return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::None); + case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: case X86::BI__builtin_ia32_kandsi: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 1ce601d37e0d6..876a80446fd3c 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11615,6 +11615,78 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, return true; } +static constexpr unsigned noHalf = ~0u; + +static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, + unsigned elemBits, unsigned halfBase, + APValue &Out) { + // Expect (vec, imm8) + APValue vec; + APSInt imm; + if (!EvaluateAsRValue(Info, Call->getArg(0), vec)) return false; + if (!EvaluateInteger(Call->getArg(1), imm, Info)) return false; + + const auto *vt = Call->getType()->getAs(); + if (!vt) return false; + const unsigned nElts = vt->getNumElements(); + + // Lane geometry: MMX pshufw is a single 64-bit lane; others use 128-bit lanes. + const unsigned totalBits = nElts * elemBits; + const unsigned laneBits = (totalBits == 64) ? 64u : 128u; + const unsigned laneElts = laneBits / elemBits; + if (!laneElts || (nElts % laneElts) != 0) return false; + + const uint8_t ctl = static_cast(imm.getZExtValue()); + + SmallVector ResultElements; + ResultElements.reserve(nElts); + + for (unsigned idx = 0; idx != nElts; idx++) { + const unsigned laneBase = (idx / laneElts) * laneElts; + const unsigned laneIdx = idx % laneElts; + + unsigned srcIdx = idx; + + if (elemBits == 32) { + // PSHUFD: permute 4×i32 per 128-bit lane + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } else { + // elemBits == 16 (PSHUFLW / PSHUFHW / PSHUFW) + if (laneElts == 4) { + // MMX PSHUFW: permute entire 64-bit lane (4×i16) + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } else { + // SSE/AVX/AVX-512: 128-bit lane has 8×i16. Permute a 4×i16 half. + constexpr unsigned halfSize = 4; + if (halfBase == 0) { + // PSHUFLW: permute low half (words 0..3) + if (laneIdx < halfSize) { + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } + } else if (halfBase == halfSize) { + // PSHUFHW: permute high half (words 4..7) + if (laneIdx >= halfSize) { + const unsigned rel = laneIdx - halfSize; + const unsigned sel = (ctl >> (2 * rel)) & 0x3; + srcIdx = laneBase + halfBase + sel; + } + } else { + const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; + srcIdx = laneBase + sel; + } + } + } + + ResultElements.push_back(vec.getVectorElt(srcIdx)); + } + + Out = APValue(ResultElements.data(), ResultElements.size()); + return true; +} + bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!IsConstantEvaluatedBuiltinCall(E)) return ExprEvaluatorBaseTy::VisitCallExpr(E); @@ -11868,294 +11940,6 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } - - case X86::BI__builtin_ia32_pshufw: { - APValue Src; - APSInt Imm; - if (!EvaluateAsRValue(Info, E->getArg(0), Src)) return false; - if (!EvaluateInteger(E->getArg(1), Imm, Info)) return false; - - unsigned N = Src.getVectorLength(); - SmallVector ResultElements; - ResultElements.reserve(N); - - uint8_t C = static_cast(Imm.getZExtValue()); - for (unsigned i = 0; i != N; ++i) { - unsigned sel = (C >> (2 * i)) & 0x3; - ResultElements.push_back(Src.getVectorElt(sel)); - } - return Success(APValue(ResultElements.data(), ResultElements.size()), E); - } - - case clang::X86::BI__builtin_ia32_pshuflw: - case clang::X86::BI__builtin_ia32_pshuflw256: - case clang::X86::BI__builtin_ia32_pshuflw512: - case clang::X86::BI__builtin_ia32_pshuflw128_mask: - case clang::X86::BI__builtin_ia32_pshuflw256_mask: - case clang::X86::BI__builtin_ia32_pshuflw512_mask: - case clang::X86::BI__builtin_ia32_pshuflw128_maskz: - case clang::X86::BI__builtin_ia32_pshuflw256_maskz: - case clang::X86::BI__builtin_ia32_pshuflw512_maskz: { - const unsigned BID = E->getBuiltinCallee(); - - const bool IsMask = - BID == clang::X86::BI__builtin_ia32_pshuflw128_mask || - BID == clang::X86::BI__builtin_ia32_pshuflw256_mask || - BID == clang::X86::BI__builtin_ia32_pshuflw512_mask; - - const bool IsMaskZ = - BID == clang::X86::BI__builtin_ia32_pshuflw128_maskz || - BID == clang::X86::BI__builtin_ia32_pshuflw256_maskz || - BID == clang::X86::BI__builtin_ia32_pshuflw512_maskz; - - const unsigned AIdx = 0, ImmIdx = 1; - const unsigned SrcIdx = 2; - const unsigned KIdx = IsMaskZ ? 2 : 3; - - APValue AVal, SrcVal; - APSInt Imm, K; - if (!EvaluateAsRValue(Info, E->getArg(AIdx), AVal)) return false; - if (!EvaluateInteger(E->getArg(ImmIdx), Imm, Info)) return false; - - const APSInt *KPtr = nullptr; - const APValue *PassThru = nullptr; - bool ZeroInactive = false; - - if (IsMask) { - if (!EvaluateAsRValue(Info, E->getArg(SrcIdx), SrcVal)) return false; - if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; - KPtr = &K; PassThru = &SrcVal; ZeroInactive = false; - } else if (IsMaskZ) { - if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; - KPtr = &K; PassThru = nullptr; ZeroInactive = true; - } - - const auto *VT = E->getType()->getAs(); - if (!VT) return false; - const unsigned NumElts = VT->getNumElements(); - - const unsigned ElemBits = 16; - const unsigned LaneElems = std::min(NumElts, 128u / ElemBits); - const unsigned Half = 4; - const uint8_t Ctl = static_cast(Imm.getZExtValue()); - const bool DestUnsigned = - VT->getElementType()->isUnsignedIntegerOrEnumerationType(); - - auto MakeZero = [&]() -> APValue { - return APValue(APSInt(APInt(ElemBits, 0), DestUnsigned)); - }; - - SmallVector ResultElements; - ResultElements.reserve(NumElts); - - for (unsigned i = 0; i < NumElts; ++i) { - const unsigned laneBase = (i / LaneElems) * LaneElems; - const unsigned inLane = i % LaneElems; - - APValue Chosen; - if (inLane < Half) { - const unsigned pos = inLane; - const unsigned sel = (Ctl >> (2 * pos)) & 0x3; - const unsigned srcIdx = laneBase + sel; - Chosen = AVal.getVectorElt(srcIdx); - } else { - Chosen = AVal.getVectorElt(i); - } - - if (KPtr) { - const bool Keep = (i < KPtr->getBitWidth()) ? (*KPtr)[i] : false; - if (Keep) { - ResultElements.push_back(Chosen); - } else if (ZeroInactive) { - ResultElements.push_back(MakeZero()); - } else { - const APValue &PT = PassThru ? PassThru->getVectorElt(i) - : AVal.getVectorElt(i); - ResultElements.push_back(PT); - } - } else { - ResultElements.push_back(Chosen); - } - } - return Success(APValue(ResultElements.data(), ResultElements.size()), E); - } - - case clang::X86::BI__builtin_ia32_pshufhw: - case clang::X86::BI__builtin_ia32_pshufhw256: - case clang::X86::BI__builtin_ia32_pshufhw512: - case clang::X86::BI__builtin_ia32_pshufhw128_mask: - case clang::X86::BI__builtin_ia32_pshufhw256_mask: - case clang::X86::BI__builtin_ia32_pshufhw512_mask: - case clang::X86::BI__builtin_ia32_pshufhw128_maskz: - case clang::X86::BI__builtin_ia32_pshufhw256_maskz: - case clang::X86::BI__builtin_ia32_pshufhw512_maskz: { - const unsigned BID = E->getBuiltinCallee(); - - const bool IsMask = - BID == clang::X86::BI__builtin_ia32_pshufhw128_mask || - BID == clang::X86::BI__builtin_ia32_pshufhw256_mask || - BID == clang::X86::BI__builtin_ia32_pshufhw512_mask; - - const bool IsMaskZ = - BID == clang::X86::BI__builtin_ia32_pshufhw128_maskz || - BID == clang::X86::BI__builtin_ia32_pshufhw256_maskz || - BID == clang::X86::BI__builtin_ia32_pshufhw512_maskz; - - const unsigned AIdx = 0, ImmIdx = 1; - const unsigned SrcIdx = 2; - const unsigned KIdx = IsMaskZ ? 2 : 3; - - APValue AVal, SrcVal; - APSInt Imm, K; - if (!EvaluateAsRValue(Info, E->getArg(AIdx), AVal)) return false; - if (!EvaluateInteger(E->getArg(ImmIdx), Imm, Info)) return false; - - const APSInt *KPtr = nullptr; - const APValue *PassThru = nullptr; - bool ZeroInactive = false; - if (IsMask) { - if (!EvaluateAsRValue(Info, E->getArg(SrcIdx), SrcVal)) return false; - if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; - KPtr = &K; PassThru = &SrcVal; ZeroInactive = false; - } else if (IsMaskZ) { - if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; - KPtr = &K; PassThru = nullptr; ZeroInactive = true; - } - - const auto *VT = E->getType()->getAs(); - if (!VT) return false; - const unsigned NumElts = VT->getNumElements(); - const unsigned ElemBits = 16; - const unsigned LaneElems = std::min(NumElts, 128u / ElemBits); - const unsigned Half = 4; - const uint8_t Ctl = static_cast(Imm.getZExtValue()); - const bool DestUnsigned = - VT->getElementType()->isUnsignedIntegerOrEnumerationType(); - - auto MakeZero = [&]() -> APValue { - return APValue(APSInt(APInt(ElemBits, 0), DestUnsigned)); - }; - - SmallVector ResultElements; - ResultElements.reserve(NumElts); - - for (unsigned i = 0; i < NumElts; ++i) { - const unsigned laneBase = (i / LaneElems) * LaneElems; - const unsigned inLane = i % LaneElems; - - APValue Chosen; - if (inLane >= Half) { - const unsigned pos = inLane - Half; - const unsigned sel = (Ctl >> (2 * pos)) & 0x3; - const unsigned srcIdx = laneBase + Half + sel; - Chosen = AVal.getVectorElt(srcIdx); - } else { - Chosen = AVal.getVectorElt(i); - } - - if (KPtr) { - const bool Keep = (i < KPtr->getBitWidth()) ? (*KPtr)[i] : false; - if (Keep) { - ResultElements.push_back(Chosen); - } else if (ZeroInactive) { - ResultElements.push_back(MakeZero()); - } else { - const APValue &PT = PassThru ? PassThru->getVectorElt(i) - : AVal.getVectorElt(i); - ResultElements.push_back(PT); - } - } else { - ResultElements.push_back(Chosen); - } - } - return Success(APValue(ResultElements.data(), ResultElements.size()), E); - } - - case clang::X86::BI__builtin_ia32_pshufd: - case clang::X86::BI__builtin_ia32_pshufd256: - case clang::X86::BI__builtin_ia32_pshufd512: - case clang::X86::BI__builtin_ia32_pshufd128_mask: - case clang::X86::BI__builtin_ia32_pshufd256_mask: - case clang::X86::BI__builtin_ia32_pshufd512_mask: - case clang::X86::BI__builtin_ia32_pshufd128_maskz: - case clang::X86::BI__builtin_ia32_pshufd256_maskz: - case clang::X86::BI__builtin_ia32_pshufd512_maskz: { - const unsigned BID = E->getBuiltinCallee(); - - const bool IsMask = - BID == clang::X86::BI__builtin_ia32_pshufd512_mask || - BID == clang::X86::BI__builtin_ia32_pshufd128_mask || - BID == clang::X86::BI__builtin_ia32_pshufd256_mask; - - const bool IsMaskZ = - BID == clang::X86::BI__builtin_ia32_pshufd512_maskz || - BID == clang::X86::BI__builtin_ia32_pshufd128_maskz || - BID == clang::X86::BI__builtin_ia32_pshufd256_maskz; - - const unsigned AIdx = 0, ImmIdx = 1; - const unsigned SrcIdx = 2; - const unsigned KIdx = IsMaskZ ? 2 : 3; - - APValue AVal, SrcVal; - APSInt Imm, K; - if (!EvaluateAsRValue(Info, E->getArg(AIdx), AVal)) return false; - if (!EvaluateInteger(E->getArg(ImmIdx), Imm, Info)) return false; - - const APSInt *KPtr = nullptr; - const APValue *PassThru = nullptr; - bool ZeroInactive = false; - if (IsMask) { - if (!EvaluateAsRValue(Info, E->getArg(SrcIdx), SrcVal)) return false; - if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; - KPtr = &K; PassThru = &SrcVal; ZeroInactive = false; - } else if (IsMaskZ) { - if (!EvaluateInteger(E->getArg(KIdx), K, Info)) return false; - KPtr = &K; PassThru = nullptr; ZeroInactive = true; - } - - const auto *VT = E->getType()->getAs(); - if (!VT) return false; - const unsigned NumElts = VT->getNumElements(); - const unsigned ElemBits = 32; - const unsigned LaneElems = std::min(NumElts, 128u / ElemBits); - const uint8_t Ctl = static_cast(Imm.getZExtValue()); - const bool DestUnsigned = - VT->getElementType()->isUnsignedIntegerOrEnumerationType(); - - auto MakeZero = [&]() -> APValue { - return APValue(APSInt(APInt(ElemBits, 0), DestUnsigned)); - }; - - SmallVector ResultElements; - ResultElements.reserve(NumElts); - - for (unsigned i = 0; i < NumElts; ++i) { - const unsigned laneBase = (i / LaneElems) * LaneElems; - const unsigned inLane = i % LaneElems; - - const unsigned pos = inLane & 3; - const unsigned sel = (Ctl >> (2 * pos)) & 0x3; - const unsigned srcIdx = laneBase + sel; - APValue Chosen = AVal.getVectorElt(srcIdx); - - if (KPtr) { - const bool Keep = (i < KPtr->getBitWidth()) ? (*KPtr)[i] : false; - if (Keep) { - ResultElements.push_back(Chosen); - } else if (ZeroInactive) { - ResultElements.push_back(MakeZero()); - } else { - const APValue &PT = PassThru ? PassThru->getVectorElt(i) - : AVal.getVectorElt(i); - ResultElements.push_back(PT); - } - } else { - ResultElements.push_back(Chosen); - } - } - return Success(APValue(ResultElements.data(), ResultElements.size()), E); - } - case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: case clang::X86::BI__builtin_ia32_vprotqi: @@ -12374,6 +12158,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + + case X86::BI__builtin_ia32_pshufw: { + APValue R; + if (!evalPshufBuiltin(Info, E, /*ElemBits=*/16, /*HalfBaseElems=*/noHalf, R)) return false; + return Success(R, E); + } + + case X86::BI__builtin_ia32_pshuflw: + case X86::BI__builtin_ia32_pshuflw256: + case X86::BI__builtin_ia32_pshuflw512: { + APValue R; + if (!evalPshufBuiltin(Info, E, /*ElemBits=*/16, /*HalfBaseElems=*/0, R)) + return false; + return Success(R, E); + } + + case X86::BI__builtin_ia32_pshufhw: + case X86::BI__builtin_ia32_pshufhw256: + case X86::BI__builtin_ia32_pshufhw512: { + APValue R; + if (!evalPshufBuiltin(Info, E, /*ElemBits=*/16, /*HalfBaseElems=*/4, R)) + return false; + return Success(R, E); + } + + case X86::BI__builtin_ia32_pshufd: + case X86::BI__builtin_ia32_pshufd256: + case X86::BI__builtin_ia32_pshufd512: { + APValue R; + if (!evalPshufBuiltin(Info, E, /*ElemBits=*/32, /*HalfBaseElems=*/noHalf, R)) + return false; + return Success(R, E); + } + case Builtin::BI__builtin_elementwise_clzg: case Builtin::BI__builtin_elementwise_ctzg: { APValue SourceLHS; diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 01b5cea02cb1c..5f617530b6f78 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -39,19 +39,14 @@ typedef short __v8hi __attribute__((__vector_size__(16))); typedef char __v16qi __attribute__((__vector_size__(16))); /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS_MMX \ - __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) - #define __DEFAULT_FN_ATTRS_SSE2 \ __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ __min_vector_width__(128))) #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr -#define __DEFAULT_FN_ATTRS_MMX_CONSTEXPR __DEFAULT_FN_ATTRS_MMX constexpr #else #define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 -#define __DEFAULT_FN_ATTRS_MMX_CONSTEXPR __DEFAULT_FN_ATTRS_MMX #endif #define __trunc64(x) \ From ba1f202062338399690f2361c9846599e015198e Mon Sep 17 00:00:00 2001 From: NagrajMG Date: Tue, 30 Sep 2025 01:29:05 +0530 Subject: [PATCH 03/17] [X86] Allow PSHUFD/PSHUFLW/PSHUFW intrinsics in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 4 ---- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 3 --- clang/lib/AST/ExprConstant.cpp | 6 ------ 3 files changed, 13 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index b320842c5486e..20661aa3971a7 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -145,10 +145,6 @@ let Features = "mmx", Header = "mmintrin.h", Attributes = [NoThrow, Const] in { def _m_prefetch : X86LibBuiltin<"void(void *)">; } -let Features = "mmx", Attributes = [NoThrow, Const, Constexpr] in { - def pshufw : X86Builtin<"_Vector<4, short>(_Vector<4, short>, _Constant int)">; -} - // PRFCHW let Features = "prfchw", Header = "intrin.h", Attributes = [NoThrow, Const] in { def _m_prefetchw : X86LibBuiltin<"void(void volatile const *)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index e7ec8beb2ba81..99c0d15b516a2 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3709,9 +3709,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); - case X86::BI__builtin_ia32_pshufw: - return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::None); - case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 876a80446fd3c..2186eb0a5e956 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12159,12 +12159,6 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } - case X86::BI__builtin_ia32_pshufw: { - APValue R; - if (!evalPshufBuiltin(Info, E, /*ElemBits=*/16, /*HalfBaseElems=*/noHalf, R)) return false; - return Success(R, E); - } - case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { From 4ccbbbcd48e5a2c293782ecd8291e9c047ce4bd0 Mon Sep 17 00:00:00 2001 From: NagrajMG Date: Wed, 1 Oct 2025 00:30:01 +0530 Subject: [PATCH 04/17] [X86] Allow PSHUFD/PSHUFLW/PSHUFW intrinsics in constexpr --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 64 ++++++++++---------- clang/lib/AST/ExprConstant.cpp | 77 ++++++++++++------------ 2 files changed, 70 insertions(+), 71 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 99c0d15b516a2..99dd2d97befc3 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2867,58 +2867,58 @@ enum class Half { None, Low, High }; static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, const CallExpr *Call, Half whichHalf) { assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); - APSInt controlImm = popToAPSInt(S, Call->getArg(1)); - const Pointer &src = S.Stk.pop(); + APSInt ControlImm = popToAPSInt(S, Call->getArg(1)); + const Pointer &Src = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); - const unsigned numElts = Dst.getNumElems(); - const PrimType elemTy = Dst.getFieldDesc()->getPrimType(); + unsigned NumElems = Dst.getNumElems(); + PrimType ElemT = Dst.getFieldDesc()->getPrimType(); // Only i16/i32 supported - const unsigned elemBits = static_cast(primSize(elemTy) * 8); - if (elemBits != 16 && elemBits != 32) return false; + unsigned ElemBits = static_cast(primSize(ElemT) * 8); + if (ElemBits != 16 && ElemBits != 32) return false; // Lane: 64b for MMX, 128b otherwise - const unsigned totalBits = numElts * elemBits; - const unsigned laneBits = (totalBits == 64) ? 64u : 128u; - const unsigned laneElts = laneBits / elemBits; - assert(laneElts && (numElts % laneElts == 0)); + unsigned TotalBits = NumElems * ElemBits; + unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; + unsigned LaneElts = LaneBits / ElemBits; + assert(LaneElts && (NumElems % LaneElts == 0)); - const uint8_t ctl = static_cast(controlImm.getZExtValue()); + uint8_t ctl = static_cast(ControlImm.getZExtValue()); - for (unsigned idx = 0; idx != numElts; idx++) { - const unsigned laneBase = (idx / laneElts) * laneElts; - const unsigned laneIdx = idx % laneElts; + for (unsigned idx = 0; idx != NumElems; idx++) { + unsigned LaneBase = (idx / LaneElts) * LaneElts; + unsigned LaneIdx = idx % LaneElts; - unsigned srcIdx = idx; + unsigned SrcIdx = idx; - if (elemBits == 32) { + if (ElemBits == 32) { // PSHUFD: 4×i32 per lane - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } else { // 16-bit shuffles - if (laneElts == 4) { + if (LaneElts == 4) { // MMX: permute all 4×i16 - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } else { // 128b lanes: shuffle 4×i16 half - constexpr unsigned halfSize = 4; - if (whichHalf == Half::Low && laneIdx < halfSize) { - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; - } else if (whichHalf == Half::High && laneIdx >= halfSize) { - const unsigned rel = laneIdx - halfSize; - const unsigned sel = (ctl >> (2 * rel)) & 0x3; - srcIdx = laneBase + halfSize + sel; + constexpr unsigned HalfSize = 4; + if (whichHalf == Half::Low && LaneIdx < HalfSize) { + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; + } else if (whichHalf == Half::High && LaneIdx >= HalfSize) { + unsigned rel = LaneIdx - HalfSize; + unsigned sel = (ctl >> (2 * rel)) & 0x3; + SrcIdx = LaneBase + HalfSize + sel; } else if (whichHalf == Half::None) { - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } } } - INT_TYPE_SWITCH_NO_BOOL(elemTy, { Dst.elem(idx) = src.elem(srcIdx); }); + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(idx) = Src.elem(SrcIdx); }); } Dst.initializeAllElements(); return true; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 2186eb0a5e956..6932c7a54202d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11615,72 +11615,71 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, return true; } -static constexpr unsigned noHalf = ~0u; static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, - unsigned elemBits, unsigned halfBase, + unsigned ElemBits, unsigned HalfBase, APValue &Out) { // Expect (vec, imm8) - APValue vec; - APSInt imm; - if (!EvaluateAsRValue(Info, Call->getArg(0), vec)) return false; - if (!EvaluateInteger(Call->getArg(1), imm, Info)) return false; + APValue Vec; + APSInt Imm; + if (!EvaluateAsRValue(Info, Call->getArg(0), Vec)) return false; + if (!EvaluateInteger(Call->getArg(1), Imm, Info)) return false; - const auto *vt = Call->getType()->getAs(); - if (!vt) return false; - const unsigned nElts = vt->getNumElements(); + const auto *VT = Call->getType()->getAs(); + if (!VT) return false; + unsigned NumElts = VT->getNumElements(); // Lane geometry: MMX pshufw is a single 64-bit lane; others use 128-bit lanes. - const unsigned totalBits = nElts * elemBits; - const unsigned laneBits = (totalBits == 64) ? 64u : 128u; - const unsigned laneElts = laneBits / elemBits; - if (!laneElts || (nElts % laneElts) != 0) return false; + unsigned TotalBits = NumElts * ElemBits; + unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; + unsigned LaneElts = LaneBits / ElemBits; + if (!LaneElts || (NumElts % LaneElts) != 0) return false; - const uint8_t ctl = static_cast(imm.getZExtValue()); + uint8_t ctl = static_cast(Imm.getZExtValue()); SmallVector ResultElements; - ResultElements.reserve(nElts); + ResultElements.reserve(NumElts); - for (unsigned idx = 0; idx != nElts; idx++) { - const unsigned laneBase = (idx / laneElts) * laneElts; - const unsigned laneIdx = idx % laneElts; + for (unsigned idx = 0; idx != NumElts; idx++) { + unsigned LaneBase = (idx / LaneElts) * LaneElts; + unsigned LaneIdx = idx % LaneElts; - unsigned srcIdx = idx; + unsigned SrcIdx = idx; - if (elemBits == 32) { + if (ElemBits == 32) { // PSHUFD: permute 4×i32 per 128-bit lane - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } else { // elemBits == 16 (PSHUFLW / PSHUFHW / PSHUFW) - if (laneElts == 4) { + if (LaneElts == 4) { // MMX PSHUFW: permute entire 64-bit lane (4×i16) - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } else { // SSE/AVX/AVX-512: 128-bit lane has 8×i16. Permute a 4×i16 half. - constexpr unsigned halfSize = 4; - if (halfBase == 0) { + constexpr unsigned HalfSize = 4; + if (HalfBase == 0) { // PSHUFLW: permute low half (words 0..3) - if (laneIdx < halfSize) { - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + if (LaneIdx < HalfSize) { + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } - } else if (halfBase == halfSize) { + } else if (HalfBase == HalfSize) { // PSHUFHW: permute high half (words 4..7) - if (laneIdx >= halfSize) { - const unsigned rel = laneIdx - halfSize; - const unsigned sel = (ctl >> (2 * rel)) & 0x3; - srcIdx = laneBase + halfBase + sel; + if (LaneIdx >= HalfSize) { + unsigned rel = LaneIdx - HalfSize; + unsigned sel = (ctl >> (2 * rel)) & 0x3; + SrcIdx = LaneBase + HalfBase + sel; } } else { - const unsigned sel = (ctl >> (2 * laneIdx)) & 0x3; - srcIdx = laneBase + sel; + unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + sel; } } } - ResultElements.push_back(vec.getVectorElt(srcIdx)); + ResultElements.push_back(Vec.getVectorElt(SrcIdx)); } Out = APValue(ResultElements.data(), ResultElements.size()); @@ -12181,7 +12180,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: { APValue R; - if (!evalPshufBuiltin(Info, E, /*ElemBits=*/32, /*HalfBaseElems=*/noHalf, R)) + if (!evalPshufBuiltin(Info, E, /*ElemBits=*/32, /*HalfBaseElems=*/~0u, R)) return false; return Success(R, E); } From 1d54eebfbe772f1b5a337f0be79a4a453c1285ea Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Wed, 1 Oct 2025 22:40:26 +0530 Subject: [PATCH 05/17] Refactor interp__builtin_ia32_pshuf for readability --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 6532547c6a934..1368ede2e8893 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2775,45 +2775,41 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, enum class Half { None, Low, High }; -static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, const CallExpr *Call, - Half whichHalf) { +static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, + const CallExpr *Call, Half whichHalf) { assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); APSInt ControlImm = popToAPSInt(S, Call->getArg(1)); const Pointer &Src = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); unsigned NumElems = Dst.getNumElems(); - PrimType ElemT = Dst.getFieldDesc()->getPrimType(); + PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - // Only i16/i32 supported unsigned ElemBits = static_cast(primSize(ElemT) * 8); - if (ElemBits != 16 && ElemBits != 32) return false; + if (ElemBits != 16 && ElemBits != 32) + return false; - // Lane: 64b for MMX, 128b otherwise unsigned TotalBits = NumElems * ElemBits; - unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; - unsigned LaneElts = LaneBits / ElemBits; + unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; + unsigned LaneElts = LaneBits / ElemBits; assert(LaneElts && (NumElems % LaneElts == 0)); uint8_t ctl = static_cast(ControlImm.getZExtValue()); for (unsigned idx = 0; idx != NumElems; idx++) { unsigned LaneBase = (idx / LaneElts) * LaneElts; - unsigned LaneIdx = idx % LaneElts; + unsigned LaneIdx = idx % LaneElts; - unsigned SrcIdx = idx; + unsigned SrcIdx = idx; if (ElemBits == 32) { - // PSHUFD: 4×i32 per lane unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + sel; - } else { // 16-bit shuffles + } else { if (LaneElts == 4) { - // MMX: permute all 4×i16 unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + sel; } else { - // 128b lanes: shuffle 4×i16 half constexpr unsigned HalfSize = 4; if (whichHalf == Half::Low && LaneIdx < HalfSize) { unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; From ffa0287b67bcfe32b91f4e99e4a0a85ce4771488 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Wed, 1 Oct 2025 22:48:09 +0530 Subject: [PATCH 06/17] Refactor evalPshufBuiltin for clarity and consistency --- clang/lib/AST/ExprConstant.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 6932c7a54202d..d15888398f89b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11615,25 +11615,26 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, return true; } - static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, unsigned ElemBits, unsigned HalfBase, APValue &Out) { - // Expect (vec, imm8) APValue Vec; APSInt Imm; - if (!EvaluateAsRValue(Info, Call->getArg(0), Vec)) return false; - if (!EvaluateInteger(Call->getArg(1), Imm, Info)) return false; + if (!EvaluateAsRValue(Info, Call->getArg(0), Vec)) + return false; + if (!EvaluateInteger(Call->getArg(1), Imm, Info)) + return false; const auto *VT = Call->getType()->getAs(); - if (!VT) return false; + if (!VT) + return false; unsigned NumElts = VT->getNumElements(); - // Lane geometry: MMX pshufw is a single 64-bit lane; others use 128-bit lanes. unsigned TotalBits = NumElts * ElemBits; - unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; - unsigned LaneElts = LaneBits / ElemBits; - if (!LaneElts || (NumElts % LaneElts) != 0) return false; + unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; + unsigned LaneElts = LaneBits / ElemBits; + if (!LaneElts || (NumElts % LaneElts) != 0) + return false; uint8_t ctl = static_cast(Imm.getZExtValue()); @@ -11642,31 +11643,25 @@ static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, for (unsigned idx = 0; idx != NumElts; idx++) { unsigned LaneBase = (idx / LaneElts) * LaneElts; - unsigned LaneIdx = idx % LaneElts; + unsigned LaneIdx = idx % LaneElts; - unsigned SrcIdx = idx; + unsigned SrcIdx = idx; if (ElemBits == 32) { - // PSHUFD: permute 4×i32 per 128-bit lane unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + sel; } else { - // elemBits == 16 (PSHUFLW / PSHUFHW / PSHUFW) if (LaneElts == 4) { - // MMX PSHUFW: permute entire 64-bit lane (4×i16) unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + sel; } else { - // SSE/AVX/AVX-512: 128-bit lane has 8×i16. Permute a 4×i16 half. constexpr unsigned HalfSize = 4; if (HalfBase == 0) { - // PSHUFLW: permute low half (words 0..3) if (LaneIdx < HalfSize) { unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + sel; } } else if (HalfBase == HalfSize) { - // PSHUFHW: permute high half (words 4..7) if (LaneIdx >= HalfSize) { unsigned rel = LaneIdx - HalfSize; unsigned sel = (ctl >> (2 * rel)) & 0x3; From dc672486d6f73d72305a683ec4cde1f540a4a4bd Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Wed, 1 Oct 2025 23:56:20 +0530 Subject: [PATCH 07/17] Simplify lane element calculation logic Refactor lane element calculation for clarity. --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 1368ede2e8893..18ead940b80e9 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2788,10 +2788,8 @@ static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, unsigned ElemBits = static_cast(primSize(ElemT) * 8); if (ElemBits != 16 && ElemBits != 32) return false; - - unsigned TotalBits = NumElems * ElemBits; - unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; - unsigned LaneElts = LaneBits / ElemBits; + + unsigned LaneElts = 128u / ElemBits; assert(LaneElts && (NumElems % LaneElts == 0)); uint8_t ctl = static_cast(ControlImm.getZExtValue()); From cb30a3206c076181b39100ebd3922994257b1912 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 00:01:24 +0530 Subject: [PATCH 08/17] Simplify calculation of LaneElts in ExprConstant.cpp --- clang/lib/AST/ExprConstant.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index d15888398f89b..8e726335d1187 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11630,9 +11630,7 @@ static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, return false; unsigned NumElts = VT->getNumElements(); - unsigned TotalBits = NumElts * ElemBits; - unsigned LaneBits = (TotalBits == 64) ? 64u : 128u; - unsigned LaneElts = LaneBits / ElemBits; + unsigned LaneElts = 128u / ElemBits; if (!LaneElts || (NumElts % LaneElts) != 0) return false; From e83fad7735bbb9479e0bb1a450f8b813d8458b9c Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 00:51:05 +0530 Subject: [PATCH 09/17] [X86] Allow PSHUFD/PSHUFLW/PSHUFW intrinsics in constexpr --- clang/lib/AST/ExprConstant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 8e726335d1187..83b7fa084db08 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11630,7 +11630,9 @@ static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, return false; unsigned NumElts = VT->getNumElements(); - unsigned LaneElts = 128u / ElemBits; + unsigned TotalBits = NumElts * ElemBits; + unsigned LaneBits = 128u; + unsigned LaneElts = LaneBits / ElemBits; if (!LaneElts || (NumElts % LaneElts) != 0) return false; From 8588fede0c244f0b604fb5c07be8c776c165b955 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 14:49:42 +0530 Subject: [PATCH 10/17] Update InterpBuiltin.cpp --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 18ead940b80e9..aff5092813558 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2788,7 +2788,7 @@ static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, unsigned ElemBits = static_cast(primSize(ElemT) * 8); if (ElemBits != 16 && ElemBits != 32) return false; - + unsigned LaneElts = 128u / ElemBits; assert(LaneElts && (NumElems % LaneElts == 0)); From 37c6da9940ffa140eee5433cbdd4dccf8ea4f516 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 14:51:39 +0530 Subject: [PATCH 11/17] Fix formatting of LaneBits declaration --- clang/lib/AST/ExprConstant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 83b7fa084db08..4ee92bf9a7653 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11631,7 +11631,7 @@ static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, unsigned NumElts = VT->getNumElements(); unsigned TotalBits = NumElts * ElemBits; - unsigned LaneBits = 128u; + unsigned LaneBits = 128u; unsigned LaneElts = LaneBits / ElemBits; if (!LaneElts || (NumElts % LaneElts) != 0) return false; From f89422ebf8673b8fd8c61630bfb88590d70c0183 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 19:43:57 +0530 Subject: [PATCH 12/17] Simplified interp__builtin_ia32_pshuf function --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 50 +++++++++--------------- 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index aff5092813558..acb9d80c14dee 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2773,10 +2773,8 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, return true; } -enum class Half { None, Low, High }; - static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, - const CallExpr *Call, Half whichHalf) { + const CallExpr *Call, bool IsShufHW) { assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); APSInt ControlImm = popToAPSInt(S, Call->getArg(1)); const Pointer &Src = S.Stk.pop(); @@ -2792,38 +2790,28 @@ static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, unsigned LaneElts = 128u / ElemBits; assert(LaneElts && (NumElems % LaneElts == 0)); - uint8_t ctl = static_cast(ControlImm.getZExtValue()); - - for (unsigned idx = 0; idx != NumElems; idx++) { - unsigned LaneBase = (idx / LaneElts) * LaneElts; - unsigned LaneIdx = idx % LaneElts; + uint8_t Ctl = static_cast(ControlImm.getZExtValue()); - unsigned SrcIdx = idx; + for (unsigned Idx = 0; Idx != NumElems; Idx++) { + unsigned LaneBase = (Idx / LaneElts) * LaneElts; + unsigned LaneIdx = Idx % LaneElts; + unsigned SrcIdx = Idx; + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; if (ElemBits == 32) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; + SrcIdx = LaneBase + Sel; } else { - if (LaneElts == 4) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; - } else { - constexpr unsigned HalfSize = 4; - if (whichHalf == Half::Low && LaneIdx < HalfSize) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; - } else if (whichHalf == Half::High && LaneIdx >= HalfSize) { - unsigned rel = LaneIdx - HalfSize; - unsigned sel = (ctl >> (2 * rel)) & 0x3; - SrcIdx = LaneBase + HalfSize + sel; - } else if (whichHalf == Half::None) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; - } + unsigned HalfSize = 4; + if (!IsShufHW && LaneIdx < HalfSize) { + SrcIdx = LaneBase + Sel; + } else if (IsShufHW && LaneIdx >= HalfSize) { + unsigned Rel = LaneIdx - HalfSize; + Sel = (Ctl >> (2 * Rel)) & 0x3; + SrcIdx = LaneBase + HalfSize + Sel; } } - INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(idx) = Src.elem(SrcIdx); }); + INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(Idx) = Src.elem(SrcIdx); }); } Dst.initializeAllElements(); return true; @@ -3665,17 +3653,17 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_pshuflw: case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::Low); + return interp__builtin_ia32_pshuf(S, OpPC, Call, false); case X86::BI__builtin_ia32_pshufhw: case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::High); + return interp__builtin_ia32_pshuf(S, OpPC, Call, true); case X86::BI__builtin_ia32_pshufd: case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, Half::None); + return interp__builtin_ia32_pshuf(S, OpPC, Call, false); case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: From 5b73c1e1a98a28f44f6c5c87853370059c15a527 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 20:15:15 +0530 Subject: [PATCH 13/17] Refactor selection logic in InterpBuiltin.cpp Refactor selection logic for clarity and efficiency. --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index acb9d80c14dee..73d718d21f9f8 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2796,17 +2796,19 @@ static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, unsigned LaneBase = (Idx / LaneElts) * LaneElts; unsigned LaneIdx = Idx % LaneElts; unsigned SrcIdx = Idx; - unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; if (ElemBits == 32) { + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + Sel; } else { - unsigned HalfSize = 4; - if (!IsShufHW && LaneIdx < HalfSize) { + constexpr unsigned HalfSize = 4; + bool InHigh = LaneIdx >= HalfSize; + if (!IsShufHW && !InHigh) { + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + Sel; - } else if (IsShufHW && LaneIdx >= HalfSize) { + } else if (IsShufHW && InHigh) { unsigned Rel = LaneIdx - HalfSize; - Sel = (Ctl >> (2 * Rel)) & 0x3; + unsigned Sel = (Ctl >> (2 * Rel)) & 0x3; SrcIdx = LaneBase + HalfSize + Sel; } } From 689994af9c376b1af895a070b48ee6a481802f16 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 20:26:04 +0530 Subject: [PATCH 14/17] Refactor evalPshufBuiltin parameters for clarity --- clang/lib/AST/ExprConstant.cpp | 57 ++++++++++++++-------------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4ee92bf9a7653..936591ae916ec 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11616,8 +11616,7 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, } static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, - unsigned ElemBits, unsigned HalfBase, - APValue &Out) { + bool IsShufHW, APValue &Out) { APValue Vec; APSInt Imm; if (!EvaluateAsRValue(Info, Call->getArg(0), Vec)) @@ -11628,49 +11627,39 @@ static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, const auto *VT = Call->getType()->getAs(); if (!VT) return false; + + QualType ElemT = VT->getElementType(); + unsigned ElemBits = Info.Ctx.getTypeSize(ElemT); unsigned NumElts = VT->getNumElements(); - unsigned TotalBits = NumElts * ElemBits; unsigned LaneBits = 128u; unsigned LaneElts = LaneBits / ElemBits; if (!LaneElts || (NumElts % LaneElts) != 0) return false; - uint8_t ctl = static_cast(Imm.getZExtValue()); + uint8_t Ctl = static_cast(Imm.getZExtValue()); SmallVector ResultElements; ResultElements.reserve(NumElts); - for (unsigned idx = 0; idx != NumElts; idx++) { - unsigned LaneBase = (idx / LaneElts) * LaneElts; - unsigned LaneIdx = idx % LaneElts; - - unsigned SrcIdx = idx; + for (unsigned Idx = 0; Idx != NumElts; Idx++) { + unsigned LaneBase = (Idx / LaneElts) * LaneElts; + unsigned LaneIdx = Idx % LaneElts; + unsigned SrcIdx = Idx; if (ElemBits == 32) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + Sel; } else { - if (LaneElts == 4) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; - } else { - constexpr unsigned HalfSize = 4; - if (HalfBase == 0) { - if (LaneIdx < HalfSize) { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; - } - } else if (HalfBase == HalfSize) { - if (LaneIdx >= HalfSize) { - unsigned rel = LaneIdx - HalfSize; - unsigned sel = (ctl >> (2 * rel)) & 0x3; - SrcIdx = LaneBase + HalfBase + sel; - } - } else { - unsigned sel = (ctl >> (2 * LaneIdx)) & 0x3; - SrcIdx = LaneBase + sel; - } + constexpr unsigned HalfSize = 4; + bool InHigh = LaneIdx >= HalfSize; + if (!IsShufHW && !InHigh) { + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; + SrcIdx = LaneBase + Sel; + } else if (IsShufHW && InHigh) { + unsigned Rel = LaneIdx - HalfSize; + unsigned Sel = (Ctl >> (2 * Rel)) & 0x3; + SrcIdx = LaneBase + HalfSize + Sel; } } @@ -12157,7 +12146,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { APValue R; - if (!evalPshufBuiltin(Info, E, /*ElemBits=*/16, /*HalfBaseElems=*/0, R)) + if (!evalPshufBuiltin(Info, E, false, R)) return false; return Success(R, E); } @@ -12166,7 +12155,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: { APValue R; - if (!evalPshufBuiltin(Info, E, /*ElemBits=*/16, /*HalfBaseElems=*/4, R)) + if (!evalPshufBuiltin(Info, E, true, R)) return false; return Success(R, E); } @@ -12175,7 +12164,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_pshufd256: case X86::BI__builtin_ia32_pshufd512: { APValue R; - if (!evalPshufBuiltin(Info, E, /*ElemBits=*/32, /*HalfBaseElems=*/~0u, R)) + if (!evalPshufBuiltin(Info, E, false, R)) return false; return Success(R, E); } From a6b108e7ed9fa7183e50b68d50ce261b1c2301fd Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 20:37:34 +0530 Subject: [PATCH 15/17] Hoisted selection variable declaration in InterpBuiltin.cpp --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 73d718d21f9f8..0efc577c56849 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2796,19 +2796,18 @@ static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, unsigned LaneBase = (Idx / LaneElts) * LaneElts; unsigned LaneIdx = Idx % LaneElts; unsigned SrcIdx = Idx; - + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; + if (ElemBits == 32) { - unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + Sel; } else { constexpr unsigned HalfSize = 4; bool InHigh = LaneIdx >= HalfSize; if (!IsShufHW && !InHigh) { - unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + Sel; } else if (IsShufHW && InHigh) { unsigned Rel = LaneIdx - HalfSize; - unsigned Sel = (Ctl >> (2 * Rel)) & 0x3; + Sel = (Ctl >> (2 * Rel)) & 0x3; SrcIdx = LaneBase + HalfSize + Sel; } } From 6ea8e940933582eb5d882bfe710e8277b93d0b08 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Thu, 2 Oct 2025 20:39:57 +0530 Subject: [PATCH 16/17] Hoisted selection variable declaration in ExprConstant --- clang/lib/AST/ExprConstant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 936591ae916ec..d0a8e7fcdae50 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11646,19 +11646,18 @@ static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call, unsigned LaneBase = (Idx / LaneElts) * LaneElts; unsigned LaneIdx = Idx % LaneElts; unsigned SrcIdx = Idx; + unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; if (ElemBits == 32) { - unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + Sel; } else { constexpr unsigned HalfSize = 4; bool InHigh = LaneIdx >= HalfSize; if (!IsShufHW && !InHigh) { - unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; SrcIdx = LaneBase + Sel; } else if (IsShufHW && InHigh) { unsigned Rel = LaneIdx - HalfSize; - unsigned Sel = (Ctl >> (2 * Rel)) & 0x3; + Sel = (Ctl >> (2 * Rel)) & 0x3; SrcIdx = LaneBase + HalfSize + Sel; } } From 34bc66d57798256f53c677655465dfce1a25a503 Mon Sep 17 00:00:00 2001 From: Nagraj Gaonkar Date: Fri, 3 Oct 2025 19:49:45 +0530 Subject: [PATCH 17/17] Remove blank line in InterpBuiltin.cpp Removed unnecessary blank line in the bytecode interpreter. --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 7762f5a3f6dc8..6053237b1a261 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2797,7 +2797,6 @@ static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, unsigned LaneIdx = Idx % LaneElts; unsigned SrcIdx = Idx; unsigned Sel = (Ctl >> (2 * LaneIdx)) & 0x3; - if (ElemBits == 32) { SrcIdx = LaneBase + Sel; } else {