diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 8332eac2a890c..14fd169360dee 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -280,8 +280,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
   def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
   def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
   def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
-  def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
-  def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
 }
 
 let Features = "sse2",
@@ -300,6 +298,9 @@ let Features = "sse2",
 
   def psrawi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
   def psradi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">;
+
+  def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
+  def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
 }
 
 let Features = "sse3", Attributes = [NoThrow] in {
@@ -607,12 +608,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
       : X86Builtin<
             "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
   def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
-  def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
   def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
   def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
   def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
   def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
-  def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
   def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
   def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
   def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
@@ -646,10 +645,12 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
   def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
   def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
 
   def psrlwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
   def psrldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
   def psrlqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
+  def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
 
   def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
   def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
@@ -2090,6 +2091,9 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
       : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
 
   def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
+}
+
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
"avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index ff50e6dbfb44e..3b725296960b6 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3320,6 +3320,38 @@ static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_x86_byteshift( + InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID, + llvm::function_ref + Fn) { + assert(Call->getNumArgs() == 2); + + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + uint64_t Shift = ImmAPS.getZExtValue() & 0xff; + + const Pointer &Src = S.Stk.pop(); + if (!Src.getFieldDesc()->isPrimitiveArray()) + return false; + + unsigned NumElems = Src.getNumElems(); + const Pointer &Dst = S.Stk.peek(); + PrimType ElemT = Src.getFieldDesc()->getPrimType(); + + for (unsigned Lane = 0; Lane != NumElems; Lane += 16) { + for (unsigned I = 0; I != 16; ++I) { + unsigned Base = Lane + I; + APSInt Result = APSInt(Fn(Src, Lane, I, Shift)); + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(Base) = static_cast(Result); }); + } + } + + Dst.initializeAllElements(); + + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) @@ -4390,6 +4422,39 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_vec_set_v4di: return interp__builtin_vec_set(S, OpPC, Call, BuiltinID); + case X86::BI__builtin_ia32_pslldqi128_byteshift: + case X86::BI__builtin_ia32_pslldqi256_byteshift: + case X86::BI__builtin_ia32_pslldqi512_byteshift: + // These SLLDQ intrinsics always operate on byte elements (8 bits). + // The lane width is hardcoded to 16 to match the SIMD register size, + // but the algorithm processes one byte per iteration, + // so APInt(8, ...) is correct and intentional. + return interp__builtin_x86_byteshift( + S, OpPC, Call, BuiltinID, + [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) { + if (I < Shift) { + return APInt(8, 0); + } + return APInt(8, Src.elem(Lane + I - Shift)); + }); + + case X86::BI__builtin_ia32_psrldqi128_byteshift: + case X86::BI__builtin_ia32_psrldqi256_byteshift: + case X86::BI__builtin_ia32_psrldqi512_byteshift: + // These SRLDQ intrinsics always operate on byte elements (8 bits). + // The lane width is hardcoded to 16 to match the SIMD register size, + // but the algorithm processes one byte per iteration, + // so APInt(8, ...) is correct and intentional. 
+    return interp__builtin_x86_byteshift(
+        S, OpPC, Call, BuiltinID,
+        [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) {
+          if (I + Shift < 16) {
+            return APInt(8, Src.elem<uint8_t>(Lane + I + Shift));
+          }
+
+          return APInt(8, 0);
+        });
+
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 2bd4476b6f3a4..ae698788aaa20 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12906,6 +12906,66 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(Elems.data(), NumElems), E);
   }
+
+  case X86::BI__builtin_ia32_pslldqi128_byteshift:
+  case X86::BI__builtin_ia32_pslldqi256_byteshift:
+  case X86::BI__builtin_ia32_pslldqi512_byteshift: {
+    assert(E->getNumArgs() == 2);
+
+    APValue Src;
+    APSInt Imm;
+    if (!EvaluateAsRValue(Info, E->getArg(0), Src) ||
+        !EvaluateInteger(E->getArg(1), Imm, Info))
+      return false;
+
+    unsigned VecLen = Src.getVectorLength();
+    unsigned Shift = Imm.getZExtValue() & 0xff;
+
+    SmallVector<APValue> ResultElements;
+    for (unsigned Lane = 0; Lane != VecLen; Lane += 16) {
+      for (unsigned I = 0; I != 16; ++I) {
+        if (I < Shift) {
+          APSInt Zero(8, /*isUnsigned=*/true);
+          Zero = 0;
+          ResultElements.push_back(APValue(Zero));
+        } else {
+          ResultElements.push_back(Src.getVectorElt(Lane + I - Shift));
+        }
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+
+  case X86::BI__builtin_ia32_psrldqi128_byteshift:
+  case X86::BI__builtin_ia32_psrldqi256_byteshift:
+  case X86::BI__builtin_ia32_psrldqi512_byteshift: {
+    assert(E->getNumArgs() == 2);
+
+    APValue Src;
+    APSInt Imm;
+    if (!EvaluateAsRValue(Info, E->getArg(0), Src) ||
+        !EvaluateInteger(E->getArg(1), Imm, Info))
+      return false;
+
+    unsigned VecLen = Src.getVectorLength();
+    unsigned Shift = Imm.getZExtValue() & 0xff;
+
+    SmallVector<APValue> ResultElements;
+    for (unsigned Lane = 0; Lane != VecLen; Lane += 16) {
+      for (unsigned I = 0; I != 16; ++I) {
+        if (I + Shift < 16) {
+          ResultElements.push_back(Src.getVectorElt(Lane + I + Shift));
+        } else {
+          APSInt Zero(8, /*isUnsigned=*/true);
+          Zero = 0;
+          ResultElements.push_back(APValue(Zero));
+        }
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   }
 }
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 5c52d84de56f5..de4cb2fd0b055 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1236,6 +1236,8 @@ __m256i test_mm256_slli_si256(__m256i a) {
   // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32>
   return _mm256_slli_si256(a, 3);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_slli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29));
+TEST_CONSTEXPR(match_v32qi(_mm256_slli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sllv_epi32
@@ -1379,6 +1381,8 @@ __m256i test_mm256_srli_si256(__m256i a) {
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32>
   return _mm256_srli_si256(a, 3);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_srli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 0, 0, 0));
+TEST_CONSTEXPR(match_v32qi(_mm256_srli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_srlv_epi32
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index d07e40a2b7071..be2cd480f7558 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -2045,6 +2045,8 @@ __m512i test_mm512_bslli_epi128(__m512i __A) {
   // CHECK: shufflevector <64 x i8> zeroinitializer, <64 x i8> %{{.*}}, <64 x i32>
   return _mm512_bslli_epi128(__A, 5);
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_bslli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 5), 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 0, 0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 0, 0, 0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59));
+TEST_CONSTEXPR(match_v64qi(_mm512_bslli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m512i test_mm512_srlv_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_srlv_epi16
@@ -2223,6 +2225,9 @@ __m512i test_mm512_bsrli_epi128(__m512i __A) {
   // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> zeroinitializer, <64 x i32>
   return _mm512_bsrli_epi128(__A, 5);
 }
+TEST_CONSTEXPR(match_v64qi(_mm512_bsrli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 5), 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 0, 0, 0, 0, 0, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 0, 0, 0, 0, 0, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 0, 0, 0, 0, 0));
+TEST_CONSTEXPR(match_v64qi(_mm512_bsrli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
+
 
 __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
   // CHECK-LABEL: test_mm512_mask_mov_epi16
   // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 8e4fb86112c56..916236a1614e8 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1418,6 +1418,8 @@ __m128i test_mm_slli_si128(__m128i A) {
   // CHECK: shufflevector <16 x i8> zeroinitializer, <16 x i8> %{{.*}}, <16 x i32>
   return _mm_slli_si128(A, 5);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_slli_si128(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), 5), 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11));
+TEST_CONSTEXPR(match_v16qi(_mm_slli_si128(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m128i test_mm_slli_si128_2(__m128i A) {
   // CHECK-LABEL: test_mm_slli_si128_2
@@ -1570,6 +1572,8 @@ __m128i test_mm_srli_si128(__m128i A) {
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32>
   return _mm_srli_si128(A, 5);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_srli_si128(((__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), 5), 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0));
+TEST_CONSTEXPR(match_v16qi(_mm_srli_si128(((__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
 
 __m128i test_mm_srli_si128_2(__m128i A) {
   // CHECK-LABEL: test_mm_srli_si128_2
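
Illustrative usage sketch (not part of the patch; the C++ mode and header usage are assumptions): with these builtins now handled by both the bytecode interpreter and the AST constant evaluator, the byte-shift intrinsics can appear in constant expressions, for example:

  #include <immintrin.h>

  // Hypothetical example, compiled as C++: the 3-byte left shift is folded
  // at compile time instead of emitting a pshufb/pslldq at runtime.
  constexpr __m128i kInput =
      (__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
  constexpr __m128i kShifted = _mm_slli_si128(kInput, 3);
  // kShifted holds {0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}.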