diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 4e25a445bbaa80..6eb3bc531efd79 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -18066,8 +18066,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_psadbw: case INS_vpermps: case INS_vpermpd: + case INS_vpermpd_reg: case INS_vpermd: case INS_vpermq: + case INS_vpermq_reg: case INS_vperm2i128: case INS_vperm2f128: case INS_vextractf128: @@ -18086,6 +18088,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency += PERFSCORE_LATENCY_3C; break; + case INS_vpermw: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency += PERFSCORE_LATENCY_6C; + break; + case INS_pextrb: case INS_pextrd: case INS_pextrw: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3d2748659ef185..5b4e8edbe0271c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -23176,7 +23176,7 @@ GenTree* Compiler::gtNewSimdShuffleNode( #if defined(TARGET_XARCH) uint8_t control = 0; bool crossLane = false; - bool needsZero = varTypeIsSmallInt(simdBaseType); + bool needsZero = varTypeIsSmallInt(simdBaseType) && (simdSize != 64); uint64_t value = 0; simd_t vecCns = {}; simd_t mskCns = {}; @@ -23310,6 +23310,61 @@ GenTree* Compiler::gtNewSimdShuffleNode( retNode = gtNewSimdHWIntrinsicNode(type, op1, cnsNode, NI_AVX2_Permute4x64, simdBaseJitType, simdSize); } } + else if (simdSize == 64) + { + if (elementSize == 4) + { + for (uint32_t i = 0; i < elementCount; i++) + { + vecCns.u32[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize); + } + + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512F_PermuteVar16x32, simdBaseJitType, simdSize); + } + else if (elementSize == 2) + { + for (uint32_t i = 0; i < elementCount; i++) + { + vecCns.u16[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize); + } + + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512BW_PermuteVar32x16, simdBaseJitType, simdSize); + } + else + { + assert(elementSize == 8); + + for (uint32_t i = 0; i < elementCount; i++) + { + vecCns.u64[i] = (uint8_t)(vecCns.u8[i * elementSize] / elementSize); + } + + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = vecCns; + + // swap the operands to match the encoding requirements + retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, NI_AVX512F_Permute8x64, simdBaseJitType, simdSize); + } + assert(retNode != nullptr); + + // TODO-XArch-AVX512: Switch to VPERMI2* + if (needsZero) + { + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = mskCns; + retNode = gtNewSimdBinOpNode(GT_AND, type, op2, retNode, simdBaseJitType, simdSize); + } + + return retNode; + } else { if (needsZero && compOpportunisticallyDependsOn(InstructionSet_SSSE3)) @@ -23356,13 +23411,11 @@ GenTree* Compiler::gtNewSimdShuffleNode( if (needsZero) { - assert(!compIsaSupportedDebugOnly(InstructionSet_SSSE3)); - - op2 = gtNewVconNode(type); - op2->AsVecCon()->gtSimd16Val = mskCns.v128[0]; + assert((simdSize == 32) || !compIsaSupportedDebugOnly(InstructionSet_SSSE3)); - GenTree* zero = gtNewZeroConNode(type); - retNode = gtNewSimdCndSelNode(type, op2, retNode, zero, simdBaseJitType, simdSize); + op2 = gtNewVconNode(type); + op2->AsVecCon()->gtSimdVal = mskCns; + retNode = gtNewSimdBinOpNode(GT_AND, type, op2, retNode, simdBaseJitType, simdSize); } return retNode; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 3c9ac8f2d1ddb8..b0e4444fb85f1e 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -853,6 +853,8 @@ HARDWARE_INTRINSIC(AVX512F, Min, HARDWARE_INTRINSIC(AVX512F, Multiply, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F, MultiplyLow, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512F, Or, 64, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_vporq, INS_vporq, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, Permute8x64, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq_reg, INS_vpermq_reg, INS_invalid, INS_vpermpd_reg}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(AVX512F, PermuteVar16x32, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermd, INS_vpermd, INS_invalid, INS_invalid, INS_vpermps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX512F, ShiftLeftLogical, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pslld, INS_pslld, INS_psllq, INS_psllq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, ShiftRightArithmetic, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psrad, INS_invalid, INS_vpsraq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, ShiftRightLogical, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_psrld, INS_psrld, INS_psrlq, INS_psrlq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) @@ -902,6 +904,7 @@ HARDWARE_INTRINSIC(AVX512BW, MultiplyHighRoundScale, HARDWARE_INTRINSIC(AVX512BW, MultiplyLow, 64, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX512BW, PackSignedSaturate, 64, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX512BW, PackUnsignedSaturate, 64, 2, {INS_invalid, INS_packuswb, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512BW, PermuteVar32x16, 64, 2, {INS_invalid, INS_invalid, INS_vpermw, INS_vpermw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX512BW, ShiftLeftLogical, 64, 2, {INS_invalid, INS_invalid, INS_psllw, INS_psllw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512BW, ShiftLeftLogical128BitLane, 64, 2, {INS_pslldq, INS_pslldq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512BW, ShiftRightArithmetic, 64, 2, {INS_invalid, INS_invalid, INS_psraw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_MaybeIMM|HW_Flag_NoJmpTableIMM|HW_Flag_FullRangeIMM) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 5f80754bb55ea4..043f86e8c5c1ed 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2211,9 +2211,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_Shuffle: case NI_Vector256_Shuffle: + case NI_Vector512_Shuffle: { assert((sig->numArgs == 2) || (sig->numArgs == 3)); - assert((simdSize == 16) || (simdSize == 32)); + assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64)); GenTree* indices = impStackTop(0).val; @@ -2269,6 +2270,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } } } + else if (simdSize == 64) + { + if (varTypeIsByte(simdBaseType)) + { + // TYP_BYTE, TYP_UBYTE need AVX512_VBMI. + break; + } + } else { assert(simdSize == 16); diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index e67de8a9aaa49f..377b48e9cdc068 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -620,6 +620,8 @@ INST3(vmovdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_ INST3(vpabsq, "pabsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX) // Packed absolute value of 64-bit integers INST3(vpandq, "pandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs INST3(vpandnq, "pandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs +INST3(vpermq_reg, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x36), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register +INST3(vpermpd_reg, "permpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x16), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute 64-bit of input register INST3(vpmaxsq, "pmaxsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3D), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit signed integers INST3(vpmaxuq, "pmaxuq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x3F), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed maximum 64-bit unsigned integers INST3(vpminsq, "pminsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x39), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed minimum 64-bit signed integers @@ -640,6 +642,7 @@ INST3(kortestd, "kortestd", IUM_WR, BAD_CODE, BAD_ INST3(kortestq, "kortestq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W1 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) INST3(vmovdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX) INST3(vmovdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | REX_W1_EVEX | Encoding_EVEX) +INST3(vpermw, "permw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8D), INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Permute Packed Doublewords Elements INST3(vpcmpb, "pcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) INST3(vpcmpw, "pcmpw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), INS_TT_FULL, Input_16Bit | REX_W1_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) INST3(vpcmpub, "pcmpub", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3E), INS_TT_FULL_MEM, Input_8Bit | REX_W0_EVEX | Encoding_EVEX | INS_Flags_IsMskSrcSrcEvexInstruction) diff --git a/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.cs b/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.cs new file mode 100644 index 00000000000000..042df982dfb083 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.cs @@ -0,0 +1,47 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using Xunit; + +public class Program +{ + [Fact] + public static int TestEntryPoint() + { + + Vector256 v256Shuffle = Vector256.Create(100, 101, 102, 103, 104, 105, 106, 107); + Vector256 v256ShuffleExpectedResult = Vector256.Create(107, 105, 0, 101, 106, 104, 0, 100); + Vector256 v256ShuffleActualResult = Vector256Shuffle(v256Shuffle); + if(v256ShuffleExpectedResult != v256ShuffleActualResult) + { + return 1; + } + + Vector512 v512Shuffle = Vector512.Create(100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115); + Vector512 v512ShuffleExpectedResult = Vector512.Create(115, 113, 111, 0, 107, 105, 103, 101, 114, 112, 110, 108, 0, 104, 102, 100); + Vector512 v512ShuffleActualResult = Vector512Shuffle(v512Shuffle); + if (v512ShuffleExpectedResult != v512ShuffleActualResult) + { + return 1; + } + return 100; + } + + + [MethodImpl(MethodImplOptions.NoInlining)] + public static Vector256 Vector256Shuffle(Vector256 v1) + { + return Vector256.Shuffle(v1, Vector256.Create(7, 5, 132, 1, 6, 4, -3, 0)); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static Vector512 Vector512Shuffle(Vector512 v1) + { + return Vector512.Shuffle(v1, Vector512.Create(15, 13, 11, 99, 7, 5, 3, 1, 14, 12, 10, 8, -11, 4, 2, 0)); + } +} diff --git a/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.csproj b/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.csproj new file mode 100644 index 00000000000000..501217e4d86892 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/GitHub_85129/GitHub_85129.csproj @@ -0,0 +1,9 @@ + + + None + True + + + + +