From 6b2df1725cef43e6fba37daed114a911fb63d5a7 Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Mon, 20 Mar 2023 14:01:45 -0700
Subject: [PATCH 01/44] Enable EVEX feature: embedded broadcast

Embedded broadcast is enabled in Vector256.Add() for a limited set of cases:
1. Vector256.Add(Vec, Vector256.Create(DCon));
2. Vector256<float> VecCns = Vector256.Create(DCon);
   Vector256.Add(Vec, VecCns);
3. Vector256.Add(Vec, Vector256.Create(LCL_VAR));
4. Vector256<float> VecCns = Vector256.Create(LCL_VAR);
   Vector256.Add(Vec, VecCns);

Note: Cases 2 and 4 can only be optimized when DOTNET_TieredCompilation=0.
---
 src/coreclr/jit/codegeninterface.h |   4 +
 src/coreclr/jit/codegenxarch.cpp   |   2 +-
 src/coreclr/jit/emit.h             |  32 ++++++-
 src/coreclr/jit/emitxarch.cpp      |  57 +++++++++---
 src/coreclr/jit/emitxarch.h        |  29 +++++--
 src/coreclr/jit/gentree.cpp        |   4 +
 src/coreclr/jit/gentree.h          |  40 +++++++++
 src/coreclr/jit/instr.cpp          |  28 +++++-
 src/coreclr/jit/instr.h            |   3 +
 src/coreclr/jit/instrsxarch.h      |   2 +-
 src/coreclr/jit/lowerxarch.cpp     | 134 +++++++++++++++++++++++++++++
 11 files changed, 311 insertions(+), 24 deletions(-)

diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h
index 1f8f4c6b0c7fe4..c408eae8b5a0f7 100644
--- a/src/coreclr/jit/codegeninterface.h
+++ b/src/coreclr/jit/codegeninterface.h
@@ -764,6 +764,10 @@ class CodeGenInterface
     virtual const char* siStackVarName(size_t offs, size_t size, unsigned reg, unsigned stkOffs) = 0;
 #endif // LATE_DISASM
+
+#if defined(TARGET_XARCH)
+    bool IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op);
+#endif
 };
 
 #endif // _CODEGEN_INTERFACE_H_
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 151fa1403abdeb..fce83922e0ab27 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7769,7 +7769,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
 #endif
     }
 
-    GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0);
+    GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0, /*isEB*/ false);
 }
 
 //-----------------------------------------------------------------------------------------
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 1d49eb69070061..d869e6604d70b4 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -781,6 +781,9 @@ class emitter
         unsigned _idCallRegPtr : 1; // IL indirect calls: addr in reg
         unsigned _idCallAddr : 1;   // IL indirect calls: can make a direct call to iiaAddr
         unsigned _idNoGC : 1;       // Some helpers don't get recorded in GC tables
+#if defined(TARGET_XARCH)
+        unsigned _idEmbBroadcast : 1;
+#endif // TARGET_XARCH
 
 #ifdef TARGET_ARM64
         opSize _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16
@@ -830,8 +833,10 @@ class emitter
 #define ID_EXTRA_BITFIELD_BITS (16)
 #elif defined(TARGET_ARM64)
 #define ID_EXTRA_BITFIELD_BITS (18)
-#elif defined(TARGET_XARCH) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
+#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)
 #define ID_EXTRA_BITFIELD_BITS (14)
+#elif defined(TARGET_XARCH)
+#define ID_EXTRA_BITFIELD_BITS (15)
 #else
 #error Unsupported or unset target architecture
 #endif
@@ -1529,6 +1534,19 @@ class emitter
         _idNoGC = val;
     }
 
+#ifdef TARGET_XARCH
+    bool idIsEmbBroadcast() const
+    {
+        return _idEmbBroadcast != 0;
+    }
+    void idSetEmbBroadcast()
+    {
+        assert(_idEmbBroadcast == 0);
+        _idEmbBroadcast = 1;
+        assert(_idEmbBroadcast == 1);
+    }
+#endif
+
 #ifdef TARGET_ARMARCH
     bool idIsLclVar() const
     {
@@ -3874,6 +3892,18 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
         }
     }
 
+        case INS_addps:
+        {
+            if (!id->idIsEmbBroadcast())
+            {
+                return defaultSize;
+            }
+            else
+            {
+                return EA_4BYTE;
+            }
+        }
+
         default:
         {
             return defaultSize;
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index ce54be61350e79..0c15e30a71f78a 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1231,6 +1231,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const
 #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL
 #define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL
 #define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL
+#define EMBEDDED_BROADCAST_BIT 0x0000001000000000ULL
 
 //------------------------------------------------------------------------
 // AddEvexPrefix: Add default EVEX prefix with only LL' bits set.
@@ -1268,6 +1269,17 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at
     return code;
 }
 
+emitter::code_t emitter::AddEmbeddedBroadcast(const instrDesc* id, code_t code)
+{
+    if (id->idIsEmbBroadcast())
+    {
+        // Should have already added the EVEX prefix
+        assert(hasEvexPrefix(code));
+        code |= EMBEDDED_BROADCAST_BIT;
+    }
+    return code;
+}
+
 // Returns true if this instruction requires a VEX prefix
 // All AVX instructions require a VEX prefix
 bool emitter::TakesVexPrefix(instruction ins) const
@@ -6778,8 +6790,13 @@ void emitter::emitIns_R_AR_R(instruction ins,
     emitCurIGsize += sz;
 }
 
-void emitter::emitIns_R_R_C(
-    instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs)
+void emitter::emitIns_R_R_C(instruction ins,
+                            emitAttr attr,
+                            regNumber reg1,
+                            regNumber reg2,
+                            CORINFO_FIELD_HANDLE fldHnd,
+                            int offs,
+                            bool isEmbBroadcast)
 {
     assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
@@ -6797,6 +6814,12 @@ void emitter::emitIns_R_R_C(
     id->idReg1(reg1);
     id->idReg2(reg2);
     id->idAddr()->iiaFieldHnd = fldHnd;
+#if defined(TARGET_XARCH)
+    if (isEmbBroadcast)
+    {
+        id->idSetEmbBroadcast();
+    }
+#endif // TARGET_XARCH
 
     UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins));
     id->idCodeSize(sz);
@@ -6829,7 +6852,8 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg,
     emitCurIGsize += sz;
 }
 
-void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs)
+void emitter::emitIns_R_R_S(
+    instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, bool isEmbBroadcast)
 {
     assert(IsAvx512OrPriorInstruction(ins));
     assert(IsThreeOperandAVXInstruction(ins));
@@ -6842,6 +6866,12 @@ void emitter::emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regN
     id->idReg2(reg2);
     id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs);
 
+#if defined(TARGET_XARCH)
+    if (isEmbBroadcast)
+    {
+        id->idSetEmbBroadcast();
+    }
+#endif // TARGET_XARCH
 #ifdef DEBUG
     id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs;
 #endif
@@ -8151,12 +8181,17 @@ void emitter::emitIns_SIMD_R_R_A(
 //    fldHnd    -- The CORINFO_FIELD_HANDLE used for the memory address
 //    offs      -- The offset added to the memory address from fldHnd
 //
-void emitter::emitIns_SIMD_R_R_C(
-    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs)
+void emitter::emitIns_SIMD_R_R_C(instruction ins,
+                                 emitAttr attr,
+                                 regNumber targetReg,
+                                 regNumber op1Reg,
+                                 CORINFO_FIELD_HANDLE fldHnd,
+                                 int offs,
+                                 bool isEmbBroadcast)
{
     if (UseSimdEncoding())
     {
-        emitIns_R_R_C(ins, attr, 
targetReg, op1Reg, fldHnd, offs); + emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs, isEmbBroadcast); } else { @@ -8214,11 +8249,11 @@ void emitter::emitIns_SIMD_R_R_R( // offs -- The offset added to the memory address from varx // void emitter::emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs) + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, bool isEmbBroadcast) { if (UseSimdEncoding()) { - emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs); + emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs, isEmbBroadcast); } else { @@ -8395,7 +8430,7 @@ void emitter::emitIns_SIMD_R_R_R_C(instruction ins, assert((op2Reg != targetReg) || (op1Reg == targetReg)); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs); + emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs, /*isEB*/ false); } //------------------------------------------------------------------------ @@ -8488,7 +8523,7 @@ void emitter::emitIns_SIMD_R_R_R_S( assert((op2Reg != targetReg) || (op1Reg == targetReg)); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs); + emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs, false); } //------------------------------------------------------------------------ @@ -16905,6 +16940,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); + code = AddEmbeddedBroadcast(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -17147,6 +17183,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); + code = AddEmbeddedBroadcast(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index c360efdd52ff7e..158d9c0d23614d 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -294,6 +294,7 @@ bool hasEvexPrefix(code_t code) return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; } code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); +code_t AddEmbeddedBroadcast(const instrDesc* id, code_t code); //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required. 
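(Reviewer aside, hedged: the new EMBEDDED_BROADCAST_BIT mask introduced above is just the
EVEX.b bit. Assuming the emitter's existing convention of keeping the 4-byte EVEX prefix
62 P0 P1 P2 in the upper half of code_t, as the neighboring LBIT/LPRIMEBIT masks already
do, its position can be sanity-checked in isolation:)

    #include <cstdint>

    // EVEX P2 payload layout: z | L'L | b | V' | aaa  (bit 7 .. bit 0).
    // P2 occupies bits 39..32 of code_t, so EVEX.b is bit 36 overall.
    constexpr uint64_t EMBEDDED_BROADCAST_BIT = 0x0000001000000000ULL;
    static_assert((EMBEDDED_BROADCAST_BIT >> 32) == (1u << 4),
                  "EVEX.b must be bit 4 of the P2 payload byte");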
@@ -585,10 +586,16 @@ void emitIns_R_AR_R(instruction ins, int scale, int offs); -void emitIns_R_R_C( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs); +void emitIns_R_R_C(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + bool isEmbBroadcast); -void emitIns_R_R_S(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs); +void emitIns_R_R_S( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, bool isEmbBroadcast); void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); @@ -689,10 +696,18 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_C( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs); +void emitIns_SIMD_R_R_AR( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset); +void emitIns_SIMD_R_R_C(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + CORINFO_FIELD_HANDLE fldHnd, + int offs, + bool isEmbBroadcast); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); -void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs); +void emitIns_SIMD_R_R_S( + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, bool isEmbBroadcast); void emitIns_SIMD_R_R_A_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); @@ -847,7 +862,7 @@ inline bool emitIsUncondJump(instrDesc* jmp) // inline bool HasEmbeddedBroadcast(const instrDesc* id) const { - return false; + return id->idIsEmbBroadcast(); } inline bool HasHighSIMDReg(const instrDesc* id) const; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 9bd689dac50dea..8a9d00403fd947 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21641,6 +21641,10 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, { float cnsVal = static_cast(op1->AsDblCon()->DconValue()); +#if defined(TARGET_XARCH) + vecCon->SetCreatedFromScalar(); +#endif // TARGET_XARCH + for (unsigned i = 0; i < (simdSize / 4); i++) { vecCon->gtSimdVal.f32[i] = cnsVal; diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index a050f2d6e2bb44..7bbd9ef7e192a4 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -555,6 +555,11 @@ enum GenTreeFlags : unsigned int GTF_MDARRLEN_NONFAULTING = 0x20000000, // GT_MDARR_LENGTH -- An MD array length operation that cannot fault. Same as GT_IND_NONFAULTING. GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING. + + GTF_SIMDASHW_OP = 0x80000000, // GT_HWINTRINSIC -- Indicates that the structHandle should be gotten from gtGetStructHandleForSIMD + // rather than from gtGetStructHandleForHWSIMD. + GTF_SIMD_ADD_EB = 0x80000000, // GT_HWINTRINSIC -- Indicate if this node will enable the embedded broadcast feature. 
+ GTF_VECCON_FROMSCALAR = 0x80000000 // GT_VECCON -- Indicate the vector constant is created from the same scalar. }; inline constexpr GenTreeFlags operator ~(GenTreeFlags a) @@ -2017,6 +2022,40 @@ struct GenTree ClearRegOptional(); } + bool WithEmbeddedBroadcast() + { + return ((gtFlags & GTF_SIMD_ADD_EB) != 0); + } + + void SetEmbeddedBroadcast() + { + gtFlags |= GTF_SIMD_ADD_EB; + assert(WithEmbeddedBroadcast()); + } + + void ClearEmbeddedBroadcast() + { + gtFlags &= ~GTF_SIMD_ADD_EB; + assert(!WithEmbeddedBroadcast()); + } + + bool IsCreatedFromScalar() + { + return ((gtFlags & GTF_VECCON_FROMSCALAR) != 0); + } + + void SetCreatedFromScalar() + { + gtFlags |= GTF_VECCON_FROMSCALAR; + assert(IsCreatedFromScalar()); + } + + void ClearCreatedFromScalar() + { + gtFlags &= ~GTF_VECCON_FROMSCALAR; + assert(!IsCreatedFromScalar()); + } + bool CanCSE() const { return ((gtFlags & GTF_DONT_CSE) == 0); @@ -6317,6 +6356,7 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic // struct GenTreeVecCon : public GenTree { + union { simd8_t gtSimd8Val; simd12_t gtSimd12Val; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index b942ddd6d878d9..7281436e3dcf3c 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1096,6 +1096,20 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT } } +#if defined(TARGET_XARCH) +bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) +{ + bool IsEmbBroadcast = false; + if ((op->IsCnsFltOrDbl() || op->IsCnsIntOrI() || (op->OperIs(GT_LCL_VAR) && op->TypeGet() == TYP_FLOAT)) && + op->isContained() && GetEmitter()->UseEvexEncoding()) + { + insFlags flags = instInfo[ins]; + IsEmbBroadcast = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; + } + return IsEmbBroadcast; +} +#endif // TARGET_XARCH + //------------------------------------------------------------------------ // inst_RV_RV_TT: Generates an instruction that takes 2 operands: // a register operand and an operand that may be in memory or register @@ -1118,15 +1132,21 @@ void CodeGen::inst_RV_RV_TT( // TODO-XArch-CQ: Commutative operations can have op1 be contained // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - OperandDesc op2Desc = genOperandDesc(op2); + OperandDesc op2Desc = genOperandDesc(op2); + bool IsEmbBroadcast = false; +#if defined(TARGET_XARCH) + IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); +#endif // TARGET_XARCH switch (op2Desc.GetKind()) { case OperandKind::ClsVar: - emit->emitIns_SIMD_R_R_C(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0); + { + emit->emitIns_SIMD_R_R_C(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0, IsEmbBroadcast); break; - + } case OperandKind::Local: - emit->emitIns_SIMD_R_R_S(ins, size, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset()); + emit->emitIns_SIMD_R_R_S(ins, size, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset(), + IsEmbBroadcast); break; case OperandKind::Indir: diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index df8658195f8da9..906ac381449a7f 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -180,6 +180,9 @@ enum insFlags : uint64_t KInstruction = 1ULL << 41, + // EVEX feature: embedded broadcast + INS_Flags_EmbeddedBroadcastSupported = 1ULL << 42, + // TODO-Cleanup: Remove this flag and its usage from TARGET_XARCH INS_FLAGS_DONT_CARE = 0x00ULL, }; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 
0bb638bfea69bd..6afef063004e5b 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -191,7 +191,7 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868,
 INST3(FIRST_SSE_INSTRUCTION, "FIRST_SSE_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 // SSE
-INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed singles
+INST3(addps, "addps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x58), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed singles
 INST3(addss, "addss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles
 INST3(andnps, "andnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles
 INST3(andps, "andps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 16120f1279d540..ffd8f4936f9649 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1630,7 +1630,70 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_FMA_MultiplyAddScalar:
             LowerFusedMultiplyAdd(node);
             break;
+#if defined(TARGET_XARCH)
+        case NI_AVX2_Add:
+        case NI_AVX_Add:
+        {
+            if (!(comp->canUseEvexEncoding()) || !(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)))
+            {
+                // The changes below are EVEX-exclusive; if EVEX is not available, break early and lower as usual.
+                break;
+            }
+            GenTree*  op1          = node->Op(1);
+            GenTree*  op2          = node->Op(2);
+            var_types simdBaseType = node->GetSimdBaseType();
+            // Look for a pre-processed scalar operand first.
+            if (op1->OperIs(GT_LCL_VAR) && op1->TypeGet() == TYP_FLOAT)
+            {
+                node->Op(1) = op2;
+                node->Op(2) = op1;
+                node->SetEmbeddedBroadcast();
+                // RUIHAN: should we make op2 contained here, or in the containment check later?
+                MakeSrcContained(node, node->Op(2));
+            }
+            else if (op2->OperIs(GT_LCL_VAR) && op2->TypeGet() == TYP_FLOAT)
+            {
+                node->SetEmbeddedBroadcast();
+                MakeSrcContained(node, node->Op(2));
+            }
+            // RUIHAN: no need to consider the case where both op1 and op2 are CnsVec;
+            // in that situation, the two constant vectors will have been merged beforehand.
+            // It is also unclear whether op1 needs to be checked for CnsVec at all,
+            // since a CnsVec operand seems to be swapped into op2 already.
+            if (!node->WithEmbeddedBroadcast() && (op1->IsCnsVec() || op2->IsCnsVec()))
+            {
+                // If no Create(LCL_VAR) was pre-processed before lowering Add,
+                // look for EmbBroadcast opportunities when one of the operands is a CnsVec.
+                GenTree* VecCns = op2->IsCnsVec() ? op2 : op1;
+                if (VecCns == op1)
+                {
+                    node->Op(1) = op2;
+                    node->Op(2) = op1;
+                }
+                if (VecCns->IsCreatedFromScalar())
+                {
+                    switch (simdBaseType)
+                    {
+                        case TYP_FLOAT:
+                        {
+                            float    cns       = static_cast<float>(VecCns->AsVecCon()->gtSimd32Val.f32[0]);
+                            GenTree* ScalarCns = comp->gtNewDconNode(cns, simdBaseType);
+                            BlockRange().Remove(op2);
+                            BlockRange().InsertAfter(node->Op(1), ScalarCns);
+                            node->Op(2) = ScalarCns;
+                            node->SetEmbeddedBroadcast();
+                            MakeSrcContained(node, node->Op(2));
+                        }
+                        break;
+                        default:
+                            break;
+                    }
+                }
+            }
+            break;
+        }
+#endif
         default:
             break;
     }
@@ -2366,6 +2429,47 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
             //   var tmp1 = Vector128.CreateScalarUnsafe(op1);
             //   return Avx2.BroadcastScalarToVector256(tmp1);
 
+            // try to handle the case: Vector256.Add(vec, Vector256.Create(x))
+            if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
+            {
+                if (op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT))
+                {
+                    LIR::Use Use;
+                    bool     foundUse                  = BlockRange().TryGetUse(node, &Use);
+                    GenTree* NodeMayEnableEmbBroadcast = nullptr;
+                    if (foundUse)
+                    {
+                        NodeMayEnableEmbBroadcast = Use.User();
+                    }
+                    if (foundUse && NodeMayEnableEmbBroadcast->OperIs(GT_HWINTRINSIC))
+                    {
+                        NamedIntrinsic EBId = NodeMayEnableEmbBroadcast->AsHWIntrinsic()->GetHWIntrinsicId();
+                        if (EBId == NI_AVX_Add || EBId == NI_AVX2_Add)
+                        {
+                            GenTree* EBOp1 = NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(1);
+                            GenTree* EBOp2 = NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(2);
+                            if (EBOp1 == node)
+                            {
+                                BlockRange().Remove(EBOp1);
+                                const unsigned op1LclNum = op1->AsLclVar()->GetLclNum();
+                                // RUIHAN: what type of reason should we put here?
+                                comp->lvaSetVarDoNotEnregister(
+                                    op1LclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
+                                NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(1) = op1;
+                            }
+                            else if (EBOp2 == node)
+                            {
+                                BlockRange().Remove(EBOp2);
+                                const unsigned op1LclNum = op1->AsLclVar()->GetLclNum();
+                                comp->lvaSetVarDoNotEnregister(
+                                    op1LclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
+                                NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(2) = op1;
+                            }
+                            return op1->gtNext;
+                        }
+                    }
+                }
+            }
             tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16);
             LowerNode(tmp1);
 
@@ -6983,6 +7087,36 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
             break;
         }
 
+        case NI_AVX_Add:
+        case NI_AVX2_Add:
+        {
+            if (parentNode->WithEmbeddedBroadcast())
+            {
+                supportsGeneralLoads = true;
+            }
+            else
+            {
+                assert(!supportsSIMDScalarLoads);
+
+                if (!comp->canUseVexEncoding())
+                {
+                    assert(!supportsUnalignedSIMDLoads);
+                    supportsAlignedSIMDLoads = true;
+                }
+                else
+                {
+                    supportsAlignedSIMDLoads = !comp->opts.MinOpts();
+                    supportsUnalignedSIMDLoads = true;
+                }
+
+                const unsigned expectedSize = genTypeSize(parentNode->TypeGet());
+                const unsigned operandSize = genTypeSize(childNode->TypeGet());
+
+                supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
+            }
+            break;
+        }
+
         default:
         {
             assert(!supportsSIMDScalarLoads);

From a8c7d8240f0a4f84ba8c7fa060b3dfefcd84bbcc Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Fri, 14 Apr 2023 11:31:22 -0700
Subject: [PATCH 02/44] Remove some irrelevant changes from the previous main.
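(Series context, hedged illustration: what the embedded broadcast folding in patch 01
buys, expressed with native intrinsics rather than the managed API. The managed pattern
is Vector256.Add(vec, Vector256.Create(scalar)); the function below is a sketch, not
JIT output.)

    #include <immintrin.h>

    __m256 add_broadcast(__m256 vec, const float* scalar)
    {
        // Pre-EVEX shape: materialize the splat, then add (two instructions):
        //   vbroadcastss ymm2, dword ptr [mem]
        //   vaddps       ymm0, ymm1, ymm2
        __m256 splat = _mm256_broadcast_ss(scalar);
        return _mm256_add_ps(vec, splat);
        // With EVEX embedded broadcast, the two fold into one:
        //   vaddps ymm0, ymm1, dword ptr [mem]{1to8}
    }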
---
 src/coreclr/jit/gentree.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 7bbd9ef7e192a4..d8917f534b077a 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -556,9 +556,8 @@ enum GenTreeFlags : unsigned int
 
     GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING.
 
-    GTF_SIMDASHW_OP = 0x80000000, // GT_HWINTRINSIC -- Indicates that the structHandle should be gotten from gtGetStructHandleForSIMD
-                                  // rather than from gtGetStructHandleForHWSIMD.
     GTF_SIMD_ADD_EB = 0x80000000, // GT_HWINTRINSIC -- Indicate if this node will enable the embedded broadcast feature.
+
     GTF_VECCON_FROMSCALAR = 0x80000000 // GT_VECCON -- Indicate the vector constant is created from the same scalar.
 };

From 73fb02ffd0c0259778ee1b420485034c19df728d Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Mon, 17 Apr 2023 16:48:09 -0700
Subject: [PATCH 03/44] Enable containment of the Broadcast intrinsic to
 improve the embedded broadcast enabling work.

---
 src/coreclr/jit/gentree.cpp            |  17 +++
 src/coreclr/jit/gentree.h              |  25 +---
 src/coreclr/jit/hwintrinsic.h          |   9 ++
 src/coreclr/jit/hwintrinsiclistxarch.h |   4 +-
 src/coreclr/jit/instr.cpp              |  78 +++++++++++-
 src/coreclr/jit/lowerxarch.cpp         | 166 +++++--------------------
 6 files changed, 137 insertions(+), 162 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 8a9d00403fd947..ddce95bd54ee6f 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -19137,6 +19137,11 @@ bool GenTree::isContainableHWIntrinsic() const
             return true;
         }
 
+        case NI_AVX2_BroadcastScalarToVector256:
+        {
+            return true;
+        }
+
         default:
         {
             return false;
@@ -19272,6 +19277,18 @@ bool GenTree::isEvexCompatibleHWIntrinsic()
 #endif
 }
 
+//------------------------------------------------------------------------
+// isEmbBroadcastHWIntrinsic: Checks if the intrinsic is an embedded broadcast compatible intrinsic.
+//
+// Return Value:
+//     true if the intrinsic node's lowering instruction is embedded broadcast compatible.
+//
+bool GenTree::isEmbBroadcastHWIntrinsic()
+{
+    assert(gtOper == GT_HWINTRINSIC);
+    return HWIntrinsicInfo::IsEmbBroadcastCompatible(AsHWIntrinsic()->GetHWIntrinsicId());
+}
+
 GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types      type,
                                                        NamedIntrinsic hwIntrinsicID,
                                                        CorInfoType    simdBaseJitType,
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index d8917f534b077a..a39cf261f55259 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -556,8 +556,6 @@ enum GenTreeFlags : unsigned int
 
     GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING.
 
-    GTF_SIMD_ADD_EB = 0x80000000, // GT_HWINTRINSIC -- Indicate if this node will enable the embedded broadcast feature.
-
     GTF_VECCON_FROMSCALAR = 0x80000000 // GT_VECCON -- Indicate the vector constant is created from the same scalar.
 };
 
 inline constexpr GenTreeFlags operator ~(GenTreeFlags a)
@@ -1489,6 +1487,7 @@ struct GenTree
     bool isContainableHWIntrinsic() const;
     bool isRMWHWIntrinsic(Compiler* comp);
     bool isEvexCompatibleHWIntrinsic();
+    bool isEmbBroadcastHWIntrinsic();
 #else
     bool isCommutativeHWIntrinsic() const
     {
@@ -1509,6 +1508,11 @@ struct GenTree
     {
         return false;
     }
+
+    bool isEmbBroadcastHWIntrinsic()
+    {
+        return false;
+    }
 #endif // FEATURE_HW_INTRINSICS
 
     static bool OperIsCommutative(genTreeOps gtOper)
@@ -2021,23 +2025,6 @@ struct GenTree
         ClearRegOptional();
     }
 
-    bool WithEmbeddedBroadcast()
-    {
-        return ((gtFlags & GTF_SIMD_ADD_EB) != 0);
-    }
-
-    void SetEmbeddedBroadcast()
-    {
-        gtFlags |= GTF_SIMD_ADD_EB;
-        assert(WithEmbeddedBroadcast());
-    }
-
-    void ClearEmbeddedBroadcast()
-    {
-        gtFlags &= ~GTF_SIMD_ADD_EB;
-        assert(!WithEmbeddedBroadcast());
-    }
-
     bool IsCreatedFromScalar()
     {
         return ((gtFlags & GTF_VECCON_FROMSCALAR) != 0);
diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index 26599ba5b1de25..14d4d5dcdbb218 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -200,6 +200,9 @@ enum HWIntrinsicFlag : unsigned int
 
     // The intrinsic is a PermuteVar2x intrinsic
     HW_Flag_PermuteVar2x = 0x4000000,
+
+    // The intrinsic is an embedded broadcast compatible intrinsic
+    HW_Flag_EmbBroadcastCompatible = 0x8000000,
 #endif // TARGET_XARCH
 };
@@ -769,6 +772,12 @@ struct HWIntrinsicInfo
         return (flags & HW_Flag_Commutative) != 0;
     }
 
+    static bool IsEmbBroadcastCompatible(NamedIntrinsic id)
+    {
+        HWIntrinsicFlag flags = lookupFlags(id);
+        return (flags & HW_Flag_EmbBroadcastCompatible) != 0;
+    }
+
     static bool IsMaybeCommutative(NamedIntrinsic id)
     {
         HWIntrinsicFlag flags = lookupFlags(id);
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index b804d4a0edd5d3..90e0862485d771 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -675,7 +675,7 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32,
 //  {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // AVX Intrinsics
-HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
+HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible)
 HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid,
INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -754,7 +754,7 @@ HARDWARE_INTRINSIC(AVX, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX2 Intrinsics HARDWARE_INTRINSIC(AVX2, Abs, 32, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 7281436e3dcf3c..c606c53fb87084 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -800,6 +800,24 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) switch (intrinsicId) { + case NI_AVX2_BroadcastScalarToVector256: + { + assert(hwintrinsic->isContained()); + GenTree* tmp = hwintrinsic->Op(1); + if(tmp->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) + { + op = tmp->AsHWIntrinsic()->Op(1); + assert(op->OperIs(GT_LCL_VAR)); + return genOperandDesc(op); + } + else + { + assert(hwintrinsic->OperIsMemoryLoad()); + assert(hwintrinsic->GetOperandCount() == 1); + addr = hwintrinsic->Op(1); + } + break; + } case NI_Vector128_CreateScalarUnsafe: case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: @@ -1099,14 +1117,47 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT #if defined(TARGET_XARCH) bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { - bool IsEmbBroadcast = false; - if ((op->IsCnsFltOrDbl() || op->IsCnsIntOrI() || (op->OperIs(GT_LCL_VAR) && op->TypeGet() == TYP_FLOAT)) && - op->isContained() && GetEmitter()->UseEvexEncoding()) + // need to check if the datatype is EB compatible, say 32-, 64-bit. + insFlags flags = instInfo[ins]; + bool IsEmbBroadcastCompatible = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; + if(!IsEmbBroadcastCompatible) { - insFlags flags = instInfo[ins]; - IsEmbBroadcast = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; + return false; + } + + // RUIHAN check 2 situations here + // 1. Add -> Broadcast -> CreateScalar -> LCL_VAR + // 2. 
CnsVec
+    bool IsEmbBroadcastEnabled = false;
+    switch (op->OperGet())
+    {
+        case GT_HWINTRINSIC:
+        {
+            if(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector256)
+            {
+                GenTree* tmp = op->AsHWIntrinsic()->Op(1);
+                if(tmp->OperIs(GT_HWINTRINSIC) && tmp->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe)
+                {
+                    IsEmbBroadcastEnabled = tmp->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)
+                                            && tmp->AsHWIntrinsic()->Op(1)->TypeIs(TYP_FLOAT);
+                }
+            }
+        }
+        break;
+
+        case GT_CNS_VEC:
+        {
+            if(op->IsCreatedFromScalar())
+            {
+                IsEmbBroadcastEnabled = true;
+            }
+        }
+
+        default:
+            break;
     }
-    return IsEmbBroadcast;
+
+    return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled;
 }
 #endif // TARGET_XARCH
 
@@ -1136,6 +1187,20 @@ void CodeGen::inst_RV_RV_TT(
     OperandDesc op2Desc = genOperandDesc(op2);
     bool IsEmbBroadcast = false;
 #if defined(TARGET_XARCH)
     IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2);
+    if(IsEmbBroadcast && op2->OperIs(GT_CNS_VEC) && op2->AsVecCon()->IsCreatedFromScalar())
+    {
+        switch (ins)
+        {
+            case INS_addps:
+            {
+                float scalar = static_cast(op2->AsVecCon()->gtSimd32Val.f32[0]);
+                op2Desc = OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast(&scalar), EA_4BYTE));
+            }
+            break;
+
+            default:
+                break;
+        }
+    }
 #endif // TARGET_XARCH
     switch (op2Desc.GetKind())
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index ffd8f4936f9649..4d58340292c14d 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1630,70 +1630,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_FMA_MultiplyAddScalar:
             LowerFusedMultiplyAdd(node);
             break;
-#if defined(TARGET_XARCH)
-        case NI_AVX2_Add:
-        case NI_AVX_Add:
-        {
-            if (!(comp->canUseEvexEncoding()) || !(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)))
-            {
-                // The changes below are EVEX-exclusive; if EVEX is not available, break early and lower as usual.
-                break;
-            }
-            GenTree*  op1          = node->Op(1);
-            GenTree*  op2          = node->Op(2);
-            var_types simdBaseType = node->GetSimdBaseType();
-            // Look for a pre-processed scalar operand first.
-            if (op1->OperIs(GT_LCL_VAR) && op1->TypeGet() == TYP_FLOAT)
-            {
-                node->Op(1) = op2;
-                node->Op(2) = op1;
-                node->SetEmbeddedBroadcast();
-                // RUIHAN: should we make op2 contained here, or in the containment check later?
-                MakeSrcContained(node, node->Op(2));
-            }
-            else if (op2->OperIs(GT_LCL_VAR) && op2->TypeGet() == TYP_FLOAT)
-            {
-                node->SetEmbeddedBroadcast();
-                MakeSrcContained(node, node->Op(2));
-            }
-            // RUIHAN: no need to consider the case where both op1 and op2 are CnsVec;
-            // in that situation, the two constant vectors will have been merged beforehand.
-            // It is also unclear whether op1 needs to be checked for CnsVec at all,
-            // since a CnsVec operand seems to be swapped into op2 already.
-            if (!node->WithEmbeddedBroadcast() && (op1->IsCnsVec() || op2->IsCnsVec()))
-            {
-                // If no Create(LCL_VAR) was pre-processed before lowering Add,
-                // look for EmbBroadcast opportunities when one of the operands is a CnsVec.
-                GenTree* VecCns = op2->IsCnsVec() ? op2 : op1;
-                if (VecCns == op1)
-                {
-                    node->Op(1) = op2;
-                    node->Op(2) = op1;
-                }
-                if (VecCns->IsCreatedFromScalar())
-                {
-                    switch (simdBaseType)
-                    {
-                        case TYP_FLOAT:
-                        {
-                            float    cns       = static_cast<float>(VecCns->AsVecCon()->gtSimd32Val.f32[0]);
-                            GenTree* ScalarCns = comp->gtNewDconNode(cns, simdBaseType);
-                            BlockRange().Remove(op2);
-                            BlockRange().InsertAfter(node->Op(1), ScalarCns);
-                            node->Op(2) = ScalarCns;
-                            node->SetEmbeddedBroadcast();
-                            MakeSrcContained(node, node->Op(2));
-                        }
-                        break;
-                        default:
-                            break;
-                    }
-                }
-            }
-            break;
-        }
-#endif
         default:
             break;
     }
@@ -2429,52 +2366,36 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
             //   var tmp1 = Vector128.CreateScalarUnsafe(op1);
             //   return Avx2.BroadcastScalarToVector256(tmp1);
 
-            // try to handle the case: Vector256.Add(vec, Vector256.Create(x))
-            if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
-            {
-                if (op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT))
-                {
-                    LIR::Use Use;
-                    bool     foundUse                  = BlockRange().TryGetUse(node, &Use);
-                    GenTree* NodeMayEnableEmbBroadcast = nullptr;
-                    if (foundUse)
-                    {
-                        NodeMayEnableEmbBroadcast = Use.User();
-                    }
-                    if (foundUse && NodeMayEnableEmbBroadcast->OperIs(GT_HWINTRINSIC))
-                    {
-                        NamedIntrinsic EBId = NodeMayEnableEmbBroadcast->AsHWIntrinsic()->GetHWIntrinsicId();
-                        if (EBId == NI_AVX_Add || EBId == NI_AVX2_Add)
-                        {
-                            GenTree* EBOp1 = NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(1);
-                            GenTree* EBOp2 = NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(2);
-                            if (EBOp1 == node)
-                            {
-                                BlockRange().Remove(EBOp1);
-                                const unsigned op1LclNum = op1->AsLclVar()->GetLclNum();
-                                // RUIHAN: what type of reason should we put here?
-                                comp->lvaSetVarDoNotEnregister(
-                                    op1LclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
-                                NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(1) = op1;
-                            }
-                            else if (EBOp2 == node)
-                            {
-                                BlockRange().Remove(EBOp2);
-                                const unsigned op1LclNum = op1->AsLclVar()->GetLclNum();
-                                comp->lvaSetVarDoNotEnregister(
-                                    op1LclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
-                                NodeMayEnableEmbBroadcast->AsHWIntrinsic()->Op(2) = op1;
-                            }
-                            return op1->gtNext;
-                        }
-                    }
-                }
-            }
+
             tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16);
             LowerNode(tmp1);
 
             node->ResetHWIntrinsicId(NI_AVX2_BroadcastScalarToVector256, tmp1);
-
+
+            // if AVX512 is supported, seek optimization opportunities using embedded broadcast.
+            // contain the broadcast intrinsics in the embedded broadcast compatible intrinsics;
+            // at codegen phase, directly emit the operand of the "Create" node instead of a broadcast sequence.
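+            // An illustration of what this folding buys (sketch, not emitted output):
+            //   before: vbroadcastss ymm2, dword ptr [mem]
+            //           vaddps       ymm0, ymm1, ymm2
+            //   after:  vaddps       ymm0, ymm1, dword ptr [mem]{1to8}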
+            if(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
+            {
+                LIR::Use use;
+                bool foundUse = BlockRange().TryGetUse(node, &use);
+                GenTree* CreateUser = nullptr;
+                if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) &&
+                   use.User()->AsHWIntrinsic()->isEmbBroadcastHWIntrinsic())
+                {
+                    CreateUser = use.User();
+                }
+                // RUIHAN: Should we contain these two lowered intrinsics, or contain the original "Create"?
+                if(CreateUser != nullptr && op1->OperIs(GT_LCL_VAR))
+                {
+                    const unsigned opLclNum = op1->AsLclVar()->GetLclNum();
+                    comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
+                    MakeSrcContained(tmp1, op1);
+                    MakeSrcContained(node, tmp1);
+                    MakeSrcContained(CreateUser, node);
+                }
+            }
+
             return LowerNode(node);
         }
 
@@ -7087,36 +7008,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
             break;
         }
 
-        case NI_AVX_Add:
-        case NI_AVX2_Add:
-        {
-            if (parentNode->WithEmbeddedBroadcast())
-            {
-                supportsGeneralLoads = true;
-            }
-            else
-            {
-                assert(!supportsSIMDScalarLoads);
-
-                if (!comp->canUseVexEncoding())
-                {
-                    assert(!supportsUnalignedSIMDLoads);
-                    supportsAlignedSIMDLoads = true;
-                }
-                else
-                {
-                    supportsAlignedSIMDLoads = !comp->opts.MinOpts();
-                    supportsUnalignedSIMDLoads = true;
-                }
-
-                const unsigned expectedSize = genTypeSize(parentNode->TypeGet());
-                const unsigned operandSize = genTypeSize(childNode->TypeGet());
-
-                supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
-            }
-            break;
-        }
-
         default:
         {
             assert(!supportsSIMDScalarLoads);
@@ -7637,6 +7528,11 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
                     return false;
                 }
 
+                case NI_AVX2_BroadcastScalarToVector256:
+                {
+                    return true;
+                }
+
                 default:
                 {
                     assert(!childNode->isContainableHWIntrinsic());

From 52cd44d73f94e7e4eabe1ebb09c85e1c62302998 Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Tue, 18 Apr 2023 11:04:24 -0700
Subject: [PATCH 04/44] Convert the broadcast check logic into a flag

---
 src/coreclr/jit/gentree.h      | 21 ++++++++++++++-
 src/coreclr/jit/instr.cpp      | 49 +++++++++++++++-------------------
 src/coreclr/jit/lowerxarch.cpp | 20 +++++++-------
 3 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index a39cf261f55259..6a71401c6a7c78 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -556,7 +556,9 @@ enum GenTreeFlags : unsigned int
 
     GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING.
 
-    GTF_VECCON_FROMSCALAR = 0x80000000 // GT_VECCON -- Indicate the vector constant is created from the same scalar.
+    GTF_VECCON_FROMSCALAR = 0x80000000, // GT_VECCON -- Indicate the vector constant is created from the same scalar.
+
+    GTF_BROADCAST_EMBEDDED = 0x80000000 // GT_HWINTRINSIC -- Indicates that this broadcast node is part of an embedded broadcast.
}; inline constexpr GenTreeFlags operator ~(GenTreeFlags a) @@ -2042,6 +2044,23 @@ struct GenTree assert(!IsCreatedFromScalar()); } + bool IsEmbBroadcast() + { + return ((gtFlags & GTF_BROADCAST_EMBEDDED) != 0); + } + + void SetEmbBroadcast() + { + gtFlags |= GTF_BROADCAST_EMBEDDED; + assert(IsEmbBroadcast()); + } + + void ClearEmbBroadcast() + { + gtFlags &= ~GTF_BROADCAST_EMBEDDED; + assert(!IsEmbBroadcast()); + } + bool CanCSE() const { return ((gtFlags & GTF_DONT_CSE) == 0); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index c606c53fb87084..daa8c12fc4c178 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -803,11 +803,10 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) case NI_AVX2_BroadcastScalarToVector256: { assert(hwintrinsic->isContained()); - GenTree* tmp = hwintrinsic->Op(1); - if(tmp->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) + if (hwintrinsic->IsEmbBroadcast()) { - op = tmp->AsHWIntrinsic()->Op(1); - assert(op->OperIs(GT_LCL_VAR)); + op = hwintrinsic->AsHWIntrinsic()->Op(1)->AsHWIntrinsic()->Op(1); + assert(op->OperIs(GT_LCL_VAR) && op->TypeIs(TYP_FLOAT)); return genOperandDesc(op); } else @@ -1118,9 +1117,9 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { // need to check if the datatype is EB compatible, say 32-, 64-bit. - insFlags flags = instInfo[ins]; - bool IsEmbBroadcastCompatible = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; - if(!IsEmbBroadcastCompatible) + insFlags flags = instInfo[ins]; + bool IsEmbBroadcastCompatible = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; + if (!IsEmbBroadcastCompatible) { return false; } @@ -1133,28 +1132,24 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { case GT_HWINTRINSIC: { - if(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector256) + if (op->IsEmbBroadcast()) { - GenTree* tmp = op->AsHWIntrinsic()->Op(1); - if(tmp->OperIs(GT_HWINTRINSIC) && tmp->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) - { - IsEmbBroadcastEnabled = tmp->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR) - && tmp->AsHWIntrinsic()->Op(1)->TypeIs(TYP_FLOAT); - } + IsEmbBroadcastEnabled = true; } + break; } - break; - + case GT_CNS_VEC: { - if(op->IsCreatedFromScalar()) + if (op->IsCreatedFromScalar()) { IsEmbBroadcastEnabled = true; } + break; } - - default: - break; + + default: + break; } return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled; @@ -1187,19 +1182,19 @@ void CodeGen::inst_RV_RV_TT( bool IsEmbBroadcast = false; #if defined(TARGET_XARCH) IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); - if(IsEmbBroadcast && op2->OperIs(GT_CNS_VEC) && op2->AsVecCon()->IsCreatedFromScalar()) + if (IsEmbBroadcast && op2->OperIs(GT_CNS_VEC) && op2->AsVecCon()->IsCreatedFromScalar()) { switch (ins) { - case INS_addps: + case INS_addps: { float scalar = static_cast(op2->AsVecCon()->gtSimd32Val.f32[0]); - op2Desc = OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast(&scalar), EA_4BYTE)); + op2Desc = OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast(&scalar), EA_4BYTE)); + break; } - break; - - default: - break; + + default: + break; } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4d58340292c14d..ed77f1ccaefe33 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ 
b/src/coreclr/jit/lowerxarch.cpp
@@ -2366,36 +2366,36 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
             //   var tmp1 = Vector128.CreateScalarUnsafe(op1);
             //   return Avx2.BroadcastScalarToVector256(tmp1);
 
-
             tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16);
             LowerNode(tmp1);
 
             node->ResetHWIntrinsicId(NI_AVX2_BroadcastScalarToVector256, tmp1);
-
+
             // if AVX512 is supported, seek optimization opportunities using embedded broadcast.
             // contain the broadcast intrinsics in the embedded broadcast compatible intrinsics;
             // at codegen phase, directly emit the operand of the "Create" node instead of a broadcast sequence.
-            if(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
+            if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
             {
                 LIR::Use use;
-                bool foundUse = BlockRange().TryGetUse(node, &use);
+                bool     foundUse   = BlockRange().TryGetUse(node, &use);
                 GenTree* CreateUser = nullptr;
-                if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) &&
-                   use.User()->AsHWIntrinsic()->isEmbBroadcastHWIntrinsic())
+                if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) &&
+                    use.User()->AsHWIntrinsic()->isEmbBroadcastHWIntrinsic())
                 {
                     CreateUser = use.User();
                 }
                 // RUIHAN: Should we contain these two lowered intrinsics, or contain the original "Create"?
-                if(CreateUser != nullptr && op1->OperIs(GT_LCL_VAR))
-                {
+                if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT))
+                {
                     const unsigned opLclNum = op1->AsLclVar()->GetLclNum();
                     comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
                     MakeSrcContained(tmp1, op1);
                     MakeSrcContained(node, tmp1);
                     MakeSrcContained(CreateUser, node);
+                    node->SetEmbBroadcast();
                 }
             }
-
+
             return LowerNode(node);
         }
 
@@ -7530,7 +7530,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
 
                 case NI_AVX2_BroadcastScalarToVector256:
                 {
-                    return true;
+                    return childNode->IsEmbBroadcast();
                 }
 
                 default:

From 10d75c0ad885c84290ac50067dbe8598cd014689 Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Wed, 19 Apr 2023 14:07:04 -0700
Subject: [PATCH 05/44] bug fixes:

1. fixed the containment logic at lowering, to accommodate the situation when
   both operands of an EB-compatible node are EB candidates.
2. fixed some unexpected EVEX.b bits set on some non-EVEX instructions on x86

---
 src/coreclr/jit/emitxarch.cpp  |  8 +++-----
 src/coreclr/jit/instr.cpp      |  8 ++++----
 src/coreclr/jit/lowerxarch.cpp | 14 +++++++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 0c15e30a71f78a..60e363ee9515ff 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1271,10 +1271,8 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at
 
 emitter::code_t emitter::AddEmbeddedBroadcast(const instrDesc* id, code_t code)
 {
-    if (id->idIsEmbBroadcast())
+    if (hasEvexPrefix(code) && id->idIsEmbBroadcast())
     {
-        // Should have already added the EVEX prefix
-        assert(hasEvexPrefix(code));
         code |= EMBEDDED_BROADCAST_BIT;
     }
     return code;
}
@@ -6815,7 +6813,7 @@ void emitter::emitIns_R_R_C(instruction ins,
     id->idReg2(reg2);
     id->idAddr()->iiaFieldHnd = fldHnd;
 #if defined(TARGET_XARCH)
-    if (isEmbBroadcast)
+    if (isEmbBroadcast && UseEvexEncoding())
     {
         id->idSetEmbBroadcast();
     }
@@ -6867,7 +6865,7 @@ void emitter::emitIns_R_R_S(
     id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs);
 
 #if defined(TARGET_XARCH)
-    if (isEmbBroadcast)
+    if (isEmbBroadcast && UseEvexEncoding())
     {
         id->idSetEmbBroadcast();
     }
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index daa8c12fc4c178..d5946cba553d49 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -1113,7 +1113,7 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT
     }
 }
 
-#if defined(TARGET_XARCH)
+#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
 bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op)
 {
     // need to check if the datatype is EB compatible, say 32-, 64-bit.
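(Isolated view of this patch's key fix, as a hedged sketch with simplified types: on
x86, an instruction that requested embedded broadcast can still end up VEX-encoded,
so EVEX.b must be a condition rather than an assert:)

    #include <cstdint>

    using code_t = uint64_t;
    constexpr code_t EMBEDDED_BROADCAST_BIT = 0x0000001000000000ULL;

    // Before this patch, the emitter asserted hasEvexPrefix(code), which
    // tripped on VEX-only encodings. After: quietly skip the bit instead.
    code_t AddEmbeddedBroadcastSketch(bool hasEvexPrefix, bool isEmbBroadcast, code_t code)
    {
        if (hasEvexPrefix && isEmbBroadcast) // both must hold before touching EVEX.b
        {
            code |= EMBEDDED_BROADCAST_BIT;
        }
        return code;
    }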
@@ -1154,7 +1154,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op)
 
     return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled;
 }
-#endif // TARGET_XARCH
+#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS
 
 //------------------------------------------------------------------------
 // inst_RV_RV_TT: Generates an instruction that takes 2 operands:
@@ -1180,7 +1180,7 @@ void CodeGen::inst_RV_RV_TT(
     OperandDesc op2Desc = genOperandDesc(op2);
     bool IsEmbBroadcast = false;
-#if defined(TARGET_XARCH)
+#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
     IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2);
     if (IsEmbBroadcast && op2->OperIs(GT_CNS_VEC) && op2->AsVecCon()->IsCreatedFromScalar())
     {
@@ -1197,7 +1197,7 @@ void CodeGen::inst_RV_RV_TT(
             break;
         }
     }
-#endif // TARGET_XARCH
+#endif // TARGET_XARCH && FEATURE_HW_INTRINSICS
     switch (op2Desc.GetKind())
     {
         case OperandKind::ClsVar:
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index ed77f1ccaefe33..ccac64384fe856 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -2387,11 +2387,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
             // RUIHAN: Should we contain these two lowered intrinsics, or contain the original "Create"?
             if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT))
             {
-                const unsigned opLclNum = op1->AsLclVar()->GetLclNum();
-                comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
-                MakeSrcContained(tmp1, op1);
-                MakeSrcContained(node, tmp1);
-                MakeSrcContained(CreateUser, node);
                 node->SetEmbBroadcast();
             }
         }
@@ -7855,6 +7850,15 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
              (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) &&
             IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional))
         {
+            if(op1->OperIs(GT_HWINTRINSIC) && op1->IsEmbBroadcast())
+            {
+                GenTree* CreateScalar = op1->AsHWIntrinsic()->Op(1);
+                GenTree* local = CreateScalar->AsHWIntrinsic()->Op(1);
+                const unsigned opLclNum = local->AsLclVar()->GetLclNum();
+                comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
+                MakeSrcContained(CreateScalar, local);
+                MakeSrcContained(op1, CreateScalar);
+            }
             MakeSrcContained(node, op1);
 
             // Swap the operands here to make the containment checks in codegen significantly simpler

From cdb61441aac3e137ef32abd44d67060fb3ab7468 Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Wed, 19 Apr 2023 14:13:06 -0700
Subject: [PATCH 06/44] apply format patch.

---
 src/coreclr/jit/lowerxarch.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index ccac64384fe856..fb600f01df9a92 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -7850,12 +7850,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional))
         {
-            if(op1->OperIs(GT_HWINTRINSIC) && op1->IsEmbBroadcast())
+            if (op1->OperIs(GT_HWINTRINSIC) && op1->IsEmbBroadcast())
             {
-                GenTree* CreateScalar = op1->AsHWIntrinsic()->Op(1);
-                GenTree* local = CreateScalar->AsHWIntrinsic()->Op(1);
-                const unsigned opLclNum = local->AsLclVar()->GetLclNum();
-                comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
+                GenTree*       CreateScalar = op1->AsHWIntrinsic()->Op(1);
+                GenTree*       local        = CreateScalar->AsHWIntrinsic()->Op(1);
+                const unsigned opLclNum     = local->AsLclVar()->GetLclNum();
+                comp->lvaSetVarDoNotEnregister(
+                    opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler));
                 MakeSrcContained(CreateScalar, local);
                 MakeSrcContained(op1, CreateScalar);
             }

From 55a6bb7257ed1edbc9d52ae535db91cdab4fb4dd Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Tue, 25 Apr 2023 09:40:01 -0700
Subject: [PATCH 07/44] Add "insOpts" data structure to xarch: insOpts may
 contain information on the EVEX.b bit, currently only embedded broadcast

---
 src/coreclr/jit/codegen.h                   |  8 ++-
 src/coreclr/jit/codegenxarch.cpp            |  4 +-
 src/coreclr/jit/emit.h                      | 42 ++++++++--------
 src/coreclr/jit/emitxarch.cpp               | 36 +++++++-------
 src/coreclr/jit/emitxarch.h                 | 12 ++---
 src/coreclr/jit/hwintrinsiccodegenxarch.cpp |  5 +-
 src/coreclr/jit/instr.cpp                   | 55 ++++++++++++---------
 src/coreclr/jit/instr.h                     |  7 +++
 8 files changed, 97 insertions(+), 72 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index aec68a7bff2a73..a4203129a5b541 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1539,7 +1539,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2);
     void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival);
     void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival);
-    void inst_RV_RV_TT(instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW);
+    void inst_RV_RV_TT(instruction ins,
+                       emitAttr    size,
+                       regNumber   targetReg,
+                       regNumber   op1Reg,
+                       GenTree*    op2,
+                       bool        isRMW,
+                       var_types   simdBaseType);
     void inst_RV_RV_TT_IV(
         instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW);
 #endif
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index fce83922e0ab27..3d22f47fbd430d 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -987,7 +987,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode)
         // all have RMW semantics if VEX support is not available
         bool isRMW = !compiler->canUseVexEncoding();
 
-        inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, op1reg, op2, isRMW);
+        inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, op1reg, op2, isRMW, TYP_UNKNOWN);
 
         genProduceReg(treeNode);
         return;
@@ -7769,7 +7769,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode)
 #endif
     }
 
-    GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0, /*isEB*/ false);
+    GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0, INS_OPTS_NONE);
 }
 
 //-----------------------------------------------------------------------------------------
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index d869e6604d70b4..8c1835b10e4697 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -782,8 +782,8 @@ class emitter
         unsigned _idCallAddr : 1;   // IL indirect calls: can make a direct call to iiaAddr
         unsigned _idNoGC : 1;       // Some helpers don't get recorded in GC tables
 #if defined(TARGET_XARCH)
-        unsigned _idEmbBroadcast : 1;
-#endif // TARGET_XARCH
+        unsigned _idEvexbContext : 1; // does EVEX.b need to be set.
+#endif                                // TARGET_XARCH
 
 #ifdef TARGET_ARM64
         opSize _idOpSize : 3; // operand size: 0=1 , 1=2 , 2=4 , 3=8, 4=16
@@ -1535,15 +1535,15 @@ class emitter
     }
 
 #ifdef TARGET_XARCH
-    bool idIsEmbBroadcast() const
+    bool idIsEvexbContext() const
     {
-        return _idEmbBroadcast != 0;
+        return _idEvexbContext != 0;
     }
-    void idSetEmbBroadcast()
+    void idSetEvexbContext()
     {
-        assert(_idEmbBroadcast == 0);
-        _idEmbBroadcast = 1;
-        assert(_idEmbBroadcast == 1);
+        assert(_idEvexbContext == 0);
+        _idEvexbContext = 1;
+        assert(_idEvexbContext == 1);
     }
 #endif
 
@@ -3673,6 +3673,18 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
 //
 emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
 {
+    if (id->idIsEvexbContext())
+    {
+        // For now, assume that EVEX.b always stands for the embedded broadcast context.
+        switch (id->idIns())
+        {
+            case INS_addps:
+                return EA_4BYTE;
+
+            default:
+                break;
+        }
+    }
     emitAttr    defaultSize = id->idOpSize();
     instruction ins         = id->idIns();
@@ -3892,18 +3906,6 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
         }
     }
 
-        case INS_addps:
-        {
-            if (!id->idIsEmbBroadcast())
-            {
-                return defaultSize;
-            }
-            else
-            {
-                return EA_4BYTE;
-            }
-        }
-
         default:
         {
             return defaultSize;
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 60e363ee9515ff..94c6f12d51210a 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -1231,7 +1231,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const
 #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL
 #define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL
 #define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL
-#define EMBEDDED_BROADCAST_BIT 0x0000001000000000ULL
+#define EVEX_B_BIT 0x0000001000000000ULL
 
 //------------------------------------------------------------------------
 // AddEvexPrefix: Add default EVEX prefix with only LL' bits set.
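(Design note, hedged: threading an insOpts enum instead of a bare bool scales to other
EVEX.b meanings -- the same bit also encodes embedded rounding control on register
forms. The enum sketch below shows the assumed shape; the real definition lands in
instr.h, whose diff is truncated from this excerpt, and the INS_OPTS_EVEX_er entry is
hypothetical, shown only to motivate the design:)

    // Sketch of the insOpts values used by this patch (assumed numeric values).
    enum insOpts : unsigned
    {
        INS_OPTS_NONE   = 0,
        INS_OPTS_EVEX_b = 1, // EVEX.b: the memory operand is a broadcast scalar
        // INS_OPTS_EVEX_er = 2, // (hypothetical) embedded rounding control
    };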
@@ -1269,11 +1269,11 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at return code; } -emitter::code_t emitter::AddEmbeddedBroadcast(const instrDesc* id, code_t code) +emitter::code_t emitter::AddEvexbBit(const instrDesc* id, code_t code) { - if (hasEvexPrefix(code) && id->idIsEmbBroadcast()) + if (hasEvexPrefix(code) && id->idIsEvexbContext()) { - code |= EMBEDDED_BROADCAST_BIT; + code |= EVEX_B_BIT; } return code; } @@ -6794,7 +6794,7 @@ void emitter::emitIns_R_R_C(instruction ins, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, - bool isEmbBroadcast) + insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -6813,9 +6813,9 @@ void emitter::emitIns_R_R_C(instruction ins, id->idReg2(reg2); id->idAddr()->iiaFieldHnd = fldHnd; #if defined(TARGET_XARCH) - if (isEmbBroadcast && UseEvexEncoding()) + if ((instOptions == INS_OPTS_EVEX_b) && UseEvexEncoding()) { - id->idSetEmbBroadcast(); + id->idSetEvexbContext(); } #endif // TARGET_XARCH @@ -6851,7 +6851,7 @@ void emitter::emitIns_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, } void emitter::emitIns_R_R_S( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, bool isEmbBroadcast) + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -6865,9 +6865,9 @@ void emitter::emitIns_R_R_S( id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); #if defined(TARGET_XARCH) - if (isEmbBroadcast && UseEvexEncoding()) + if ((instOptions == INS_OPTS_EVEX_b) && UseEvexEncoding()) { - id->idSetEmbBroadcast(); + id->idSetEvexbContext(); } #endif // TARGET_XARCH #ifdef DEBUG @@ -8185,11 +8185,11 @@ void emitter::emitIns_SIMD_R_R_C(instruction ins, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs, - bool isEmbBroadcast) + insOpts instOptions) { if (UseSimdEncoding()) { - emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs, isEmbBroadcast); + emitIns_R_R_C(ins, attr, targetReg, op1Reg, fldHnd, offs, instOptions); } else { @@ -8247,11 +8247,11 @@ void emitter::emitIns_SIMD_R_R_R( // offs -- The offset added to the memory address from varx // void emitter::emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, bool isEmbBroadcast) + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, insOpts instOptions) { if (UseSimdEncoding()) { - emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs, isEmbBroadcast); + emitIns_R_R_S(ins, attr, targetReg, op1Reg, varx, offs, instOptions); } else { @@ -8428,7 +8428,7 @@ void emitter::emitIns_SIMD_R_R_R_C(instruction ins, assert((op2Reg != targetReg) || (op1Reg == targetReg)); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs, /*isEB*/ false); + emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs, INS_OPTS_NONE); } //------------------------------------------------------------------------ @@ -8521,7 +8521,7 @@ void emitter::emitIns_SIMD_R_R_R_S( assert((op2Reg != targetReg) || (op1Reg == targetReg)); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs, false); + emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs, INS_OPTS_NONE); } 
//------------------------------------------------------------------------ @@ -16938,7 +16938,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEmbeddedBroadcast(id, code); + code = AddEvexbBit(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -17181,7 +17181,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEmbeddedBroadcast(id, code); + code = AddEvexbBit(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 158d9c0d23614d..dc54a5c3cd5508 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -294,7 +294,7 @@ bool hasEvexPrefix(code_t code) return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; } code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); -code_t AddEmbeddedBroadcast(const instrDesc* id, code_t code); +code_t AddEvexbBit(const instrDesc* id, code_t code); //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required. @@ -592,10 +592,10 @@ void emitIns_R_R_C(instruction ins, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, - bool isEmbBroadcast); + insOpts instOptions); void emitIns_R_R_S( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, bool isEmbBroadcast); + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions); void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); @@ -704,10 +704,10 @@ void emitIns_SIMD_R_R_C(instruction ins, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs, - bool isEmbBroadcast); + insOpts instOptions); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); void emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, bool isEmbBroadcast); + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, insOpts instOptions); void emitIns_SIMD_R_R_A_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); @@ -862,7 +862,7 @@ inline bool emitIsUncondJump(instrDesc* jmp) // inline bool HasEmbeddedBroadcast(const instrDesc* id) const { - return id->idIsEmbBroadcast(); + return id->idIsEvexbContext(); } inline bool HasHighSIMDReg(const instrDesc* id) const; diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 973d4176a00b63..94952200eb2100 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -728,8 +728,9 @@ void CodeGen::genHWIntrinsic_R_R_RM( assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); } - bool isRMW = node->isRMWHWIntrinsic(compiler); - inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW); + bool isRMW = node->isRMWHWIntrinsic(compiler); + var_types simdBaseType = node->GetSimdBaseType(); + inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW, simdBaseType); } 
//------------------------------------------------------------------------ diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d5946cba553d49..eff981b80ae99a 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1162,15 +1162,20 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) // the result is returned in register // // Arguments: -// ins -- The instruction being emitted -// size -- The emit size attribute -// targetReg -- The target register -// op1Reg -- The first operand register -// op2 -- The second operand, which may be a memory node or a node producing a register -// isRMW -- true if the instruction is RMW; otherwise, false -// -void CodeGen::inst_RV_RV_TT( - instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW) +// ins -- The instruction being emitted +// size -- The emit size attribute +// targetReg -- The target register +// op1Reg -- The first operand register +// op2 -- The second operand, which may be a memory node or a node producing a register +// isRMW -- true if the instruction is RMW; otherwise, false +// simdBaseType -- the base data type for this intrinsic. +void CodeGen::inst_RV_RV_TT(instruction ins, + emitAttr size, + regNumber targetReg, + regNumber op1Reg, + GenTree* op2, + bool isRMW, + var_types simdBaseType) { emitter* emit = GetEmitter(); noway_assert(emit->emitVerifyEncodable(ins, EA_SIZE(size), targetReg)); @@ -1178,23 +1183,27 @@ void CodeGen::inst_RV_RV_TT( // TODO-XArch-CQ: Commutative operations can have op1 be contained // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - OperandDesc op2Desc = genOperandDesc(op2); - bool IsEmbBroadcast = false; + OperandDesc op2Desc = genOperandDesc(op2); + insOpts instOptions = INS_OPTS_NONE; #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) - IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); - if (IsEmbBroadcast && op2->OperIs(GT_CNS_VEC) && op2->AsVecCon()->IsCreatedFromScalar()) + bool IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); + if (IsEmbBroadcast) { - switch (ins) + instOptions = INS_OPTS_EVEX_b; + if (op2->OperIs(GT_CNS_VEC)) { - case INS_addps: + switch (simdBaseType) { - float scalar = static_cast<float>(op2->AsVecCon()->gtSimd32Val.f32[0]); - op2Desc = OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast<float*>(&scalar), EA_4BYTE)); - break; - } + case TYP_FLOAT: + { + float scalar = static_cast<float>(op2->AsVecCon()->gtSimd32Val.f32[0]); + op2Desc = OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast<float*>(&scalar), EA_4BYTE)); + break; + } - default: - break; + default: + break; + } } } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS @@ -1202,12 +1211,12 @@ void CodeGen::inst_RV_RV_TT( { case OperandKind::ClsVar: { - emit->emitIns_SIMD_R_R_C(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0, IsEmbBroadcast); + emit->emitIns_SIMD_R_R_C(ins, size, targetReg, op1Reg, op2Desc.GetFieldHnd(), 0, instOptions); break; } case OperandKind::Local: emit->emitIns_SIMD_R_R_S(ins, size, targetReg, op1Reg, op2Desc.GetVarNum(), op2Desc.GetLclOffset(), - IsEmbBroadcast); + instOptions); break; case OperandKind::Indir: diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 906ac381449a7f..c3efb39829a021 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -187,6 +187,13 @@ enum insFlags : uint64_t INS_FLAGS_DONT_CARE = 0x00ULL, }; +enum insOpts: unsigned +{ + INS_OPTS_NONE, + + INS_OPTS_EVEX_b +}; + #elif 
defined(TARGET_ARM) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // TODO-Cleanup: Move 'insFlags' under TARGET_ARM enum insFlags: unsigned From 5569217f2ae9c2dc44e97a7f939b88a2774547c1 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 25 Apr 2023 10:44:34 -0700 Subject: [PATCH 08/44] Add "OperIsBroadcastScalar" check: This check ensures the intrinsic is actually a broadcast-scalar intrinsic. It is needed because gentree flags use overlapping definitions: GTF_BROADCAST_EMBEDDED shares its bit value with other flags, so we must make sure the flag we check was not set as one of the overlapping flags. --- src/coreclr/jit/gentree.cpp | 21 +++++++++++++++++++++ src/coreclr/jit/gentree.h | 1 + 2 files changed, 22 insertions(+) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ddce95bd54ee6f..ee2812a332cd48 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25092,6 +25092,27 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const } } +//------------------------------------------------------------------------ +// OperIsBroadcastScalar: Is this HWIntrinsic a broadcast node from scalar. +// +// Return Value: +// Whether "this" is a broadcast node from scalar. +// +bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const +{ +#if defined(TARGET_XARCH) + NamedIntrinsic intrinsicId = GetHWIntrinsicId(); + if (intrinsicId == NI_AVX2_BroadcastScalarToVector128 || intrinsicId == NI_AVX2_BroadcastScalarToVector256) + { + return true; + } + else + return false; +#else + return false; +#endif +} + //------------------------------------------------------------------------------ // OperRequiresAsgFlag : Check whether the operation requires GTF_ASG flag regardless // of the children's flags. diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 6a71401c6a7c78..b0db7671620a32 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6256,6 +6256,7 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryStore(GenTree** pAddr = nullptr) const; bool OperIsMemoryLoadOrStore() const; bool OperIsMemoryStoreOrBarrier() const; + bool OperIsBroadcastScalar() const; bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; From 4f92123b2616ad18702e2081d3b95e1afa6c42b9 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 25 Apr 2023 12:26:15 -0700 Subject: [PATCH 09/44] rebase the branch and resolve conflicts --- src/coreclr/jit/emit.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 8c1835b10e4697..c7f515354bc0dd 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -3673,8 +3673,6 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) // emitAttr emitter::emitGetMemOpSize(instrDesc* id) const { - emitAttr defaultSize = id->idOpSize(); - if (id->idIsEvexbContext()) { // should have the assumption that Evex.b now stands for the embedded broadcast context. From 328549f6d143a41ed8a9a647e9f0e8ccac1d6c85 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 26 Apr 2023 16:51:19 -0700 Subject: [PATCH 10/44] changes based on the reviews: 1. removed the gentree flag GTF_EMBEDDED_BROADCAST. 2. mark the embedded broadcast node by making it contained. 3. improved logic in GetMemOpSize() to return the correct pointer size when embedded broadcast is enabled. 4. improved logic in genOperandDesc() to emit the scalar when a constant vector operand is found to be created from a scalar.
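Note (illustrative only, not JIT code): items 2-4 above can be sketched as a tiny standalone model. In the C++ below, Operand, classify and memOpSizeBytes are hypothetical stand-ins, not JIT APIs; the point is the flow this change implements: a contained broadcast operand, or a constant vector created from a single scalar, selects the EVEX.b hint, and once EVEX.b means embedded broadcast, the reported memory operand size drops from the full vector width to one element.

#include <cstdio>

enum insOpts
{
    INS_OPTS_NONE,
    INS_OPTS_EVEX_b
};

struct Operand
{
    bool isContainedBroadcast; // stands in for a contained BroadcastScalarToVector* node
    bool isConstVecFromScalar; // stands in for a GT_CNS_VEC created from one scalar
};

// Models the decision made by IsEmbeddedBroadcastEnabled: here the operand's
// shape alone decides whether the EVEX.b (embedded broadcast) hint is used.
insOpts classify(const Operand& op)
{
    return (op.isContainedBroadcast || op.isConstVecFromScalar) ? INS_OPTS_EVEX_b : INS_OPTS_NONE;
}

// Models the GetMemOpSize() change: under EVEX.b the memory access is a single
// element (4 bytes for a float input), not the full vector width.
unsigned memOpSizeBytes(insOpts opts, unsigned vectorBytes, unsigned elementBytes)
{
    return (opts == INS_OPTS_EVEX_b) ? elementBytes : vectorBytes;
}

int main()
{
    Operand broadcastLocal{true, false};
    Operand plainVector{false, false};
    printf("broadcast local: %u-byte memory operand\n", memOpSizeBytes(classify(broadcastLocal), 32, 4)); // prints 4
    printf("plain vector:    %u-byte memory operand\n", memOpSizeBytes(classify(plainVector), 32, 4));    // prints 32
    return 0;
}

This is why a disassembly of, e.g., vaddps with an embedded broadcast operand should report a 4-byte access rather than a full 32-byte load.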
--- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/emit.h | 13 +++++---- src/coreclr/jit/emitxarch.cpp | 14 +++++---- src/coreclr/jit/emitxarch.h | 8 +++--- src/coreclr/jit/gentree.cpp | 33 ++++++++++----------- src/coreclr/jit/gentree.h | 26 +---------------- src/coreclr/jit/instr.cpp | 52 +++++++++++++++++----------------- src/coreclr/jit/lowerxarch.cpp | 45 +++++++++++++++++++---------- 8 files changed, 95 insertions(+), 98 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index a4203129a5b541..84d8b7d207a2a8 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1533,7 +1533,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX } }; - OperandDesc genOperandDesc(GenTree* op); + OperandDesc genOperandDesc(GenTree* op, insOpts instOptions = INS_OPTS_NONE, var_types simdBaseType = TYP_UNKNOWN); void inst_TT(instruction ins, emitAttr size, GenTree* op1); void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2); diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c7f515354bc0dd..68411edb7d1a73 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -3673,21 +3673,24 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) // emitAttr emitter::emitGetMemOpSize(instrDesc* id) const { + + + emitAttr defaultSize = id->idOpSize(); + instruction ins = id->idIns(); if (id->idIsEvexbContext()) { // should have the assumption that Evex.b now stands for the embedded broadcast context. - switch (id->idIns()) + // reference: Section 2.7.5 in Intel 64 and ia-32 architectures software developer's manual volume 2. + ssize_t inputSize = GetInputSizeInBytes(id); + switch (inputSize) { - case INS_addps: + case 4: return EA_4BYTE; default: break; } } - emitAttr defaultSize = id->idOpSize(); - instruction ins = id->idIns(); - switch (ins) { case INS_pextrb: diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 94c6f12d51210a..888f18e8bef8c1 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1269,7 +1269,7 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at return code; } -emitter::code_t emitter::AddEvexbBit(const instrDesc* id, code_t code) +emitter::code_t emitter::AddEvexbBitIfNeeded(const instrDesc* id, code_t code) { if (hasEvexPrefix(code) && id->idIsEvexbContext()) { @@ -6813,8 +6813,9 @@ void emitter::emitIns_R_R_C(instruction ins, id->idReg2(reg2); id->idAddr()->iiaFieldHnd = fldHnd; #if defined(TARGET_XARCH) - if ((instOptions == INS_OPTS_EVEX_b) && UseEvexEncoding()) + if ((instOptions == INS_OPTS_EVEX_b)) { + assert(UseEvexEncoding()); id->idSetEvexbContext(); } #endif // TARGET_XARCH @@ -6865,8 +6866,9 @@ void emitter::emitIns_R_R_S( id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); #if defined(TARGET_XARCH) - if ((instOptions == INS_OPTS_EVEX_b) && UseEvexEncoding()) + if ((instOptions == INS_OPTS_EVEX_b)) { + assert(UseEvexEncoding()); id->idSetEvexbContext(); } #endif // TARGET_XARCH @@ -15742,7 +15744,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) // Return Value: // size in bytes. 
// -ssize_t emitter::GetInputSizeInBytes(instrDesc* id) +ssize_t emitter::GetInputSizeInBytes(instrDesc* id) const { insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[id->idIns()] & Input_Mask)); @@ -16938,7 +16940,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEvexbBit(id, code); + code = AddEvexbBitIfNeeded(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -17181,7 +17183,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEvexbBit(id, code); + code = AddEvexbBitIfNeeded(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index dc54a5c3cd5508..8f57d80b54e161 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -294,7 +294,7 @@ bool hasEvexPrefix(code_t code) return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; } code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); -code_t AddEvexbBit(const instrDesc* id, code_t code); +code_t AddEvexbBitIfNeeded(const instrDesc* id, code_t code); //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required. @@ -386,7 +386,7 @@ bool codeEvexMigrationCheck(code_t code) return hasEvexPrefix(code); } -ssize_t GetInputSizeInBytes(instrDesc* id); +ssize_t GetInputSizeInBytes(instrDesc* id) const; bool containsAVXInstruction = false; bool ContainsAVX() @@ -704,10 +704,10 @@ void emitIns_SIMD_R_R_C(instruction ins, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs, - insOpts instOptions); + insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); void emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, insOpts instOptions); + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_A_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ee2812a332cd48..ecb0b891e87c6c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19277,18 +19277,6 @@ bool GenTree::isContainableHWIntrinsic() #endif } -//------------------------------------------------------------------------ -// isEmbBroadcastHWIntrinsic: Checks if the intrinsic is a embedded broadcast compatible inintrsic. -// -// Return Value: -// true if the intrisic node lowering instruction is embedded broadcast compatible.
-// -bool GenTree::isEmbBroadcastHWIntrinsic() -{ - assert(gtOper == GT_HWINTRINSIC); - return HWIntrinsicInfo::IsEmbBroadcastCompatible(AsHWIntrinsic()->GetHWIntrinsicId()); -} - GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID, CorInfoType simdBaseJitType, @@ -25102,17 +25090,30 @@ bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const { #if defined(TARGET_XARCH) NamedIntrinsic intrinsicId = GetHWIntrinsicId(); - if (intrinsicId == NI_AVX2_BroadcastScalarToVector128 || intrinsicId == NI_AVX2_BroadcastScalarToVector256) + switch(intrinsicId) { - return true; + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + return true; + default: + return false; } - else - return false; #else return false; #endif } +//------------------------------------------------------------------------ +// OperIsEmbBroadcastHWIntrinsic: Checks if the intrinsic is an embedded broadcast compatible intrinsic. +// +// Return Value: +// true if the intrinsic node's lowering instruction is embedded broadcast compatible. +// +bool GenTreeHWIntrinsic::OperIsEmbBroadcastHWIntrinsic() const +{ + return HWIntrinsicInfo::IsEmbBroadcastCompatible(GetHWIntrinsicId()); +} + //------------------------------------------------------------------------------ // OperRequiresAsgFlag : Check whether the operation requires GTF_ASG flag regardless // of the children's flags. diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index b0db7671620a32..38856f5583f75f 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -557,8 +557,6 @@ enum GenTreeFlags : unsigned int GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING. GTF_VECCON_FROMSCALAR = 0x80000000, // GT_VECCON -- Indicate the vector constant is created from the same scalar. - - GTF_BROADCAST_EMBEDDED = 0x80000000 // GT_HWINTRINSIC -- Indicate this broadcast node is part of embedded broadcast.
}; inline constexpr GenTreeFlags operator ~(GenTreeFlags a) @@ -1489,7 +1487,6 @@ struct GenTree bool isContainableHWIntrinsic() const; bool isRMWHWIntrinsic(Compiler* comp); bool isEvexCompatibleHWIntrinsic(); - bool isEmbBroadcastHWIntrinsic(); #else bool isCommutativeHWIntrinsic() const { @@ -1510,11 +1507,6 @@ { return false; } - - bool isEmbBroadcastHWIntrinsic() - { - return false; - } #endif // FEATURE_HW_INTRINSICS static bool OperIsCommutative(genTreeOps gtOper) @@ -2044,23 +2036,6 @@ assert(!IsCreatedFromScalar()); } - bool IsEmbBroadcast() - { - return ((gtFlags & GTF_BROADCAST_EMBEDDED) != 0); - } - - void SetEmbBroadcast() - { - gtFlags |= GTF_BROADCAST_EMBEDDED; - assert(IsEmbBroadcast()); - } - - void ClearEmbBroadcast() - { - gtFlags &= ~GTF_BROADCAST_EMBEDDED; - assert(!IsEmbBroadcast()); - } - bool CanCSE() const { return ((gtFlags & GTF_DONT_CSE) == 0); @@ -6257,6 +6232,7 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryLoadOrStore() const; bool OperIsMemoryStoreOrBarrier() const; bool OperIsBroadcastScalar() const; + bool OperIsEmbBroadcastHWIntrinsic() const; bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index eff981b80ae99a..6bbc12d4852c85 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -758,7 +758,7 @@ void CodeGen::inst_RV_SH( // This method is not idempotent - it can only be called once for a // given node. // -CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) +CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, var_types simdBaseType) { if (!op->isContained() && !op->isUsedFromSpillTemp()) { @@ -802,18 +802,16 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) { case NI_AVX2_BroadcastScalarToVector256: { - assert(hwintrinsic->isContained()); - if (hwintrinsic->IsEmbBroadcast()) + if (op->isContained()) { - op = hwintrinsic->AsHWIntrinsic()->Op(1)->AsHWIntrinsic()->Op(1); - assert(op->OperIs(GT_LCL_VAR) && op->TypeIs(TYP_FLOAT)); - return genOperandDesc(op); + op = hwintrinsic->AsHWIntrinsic()->Op(1); + assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); + assert(op->isContained()); + return genOperandDesc(op->AsHWIntrinsic()->Op(1)); } else { - assert(hwintrinsic->OperIsMemoryLoad()); - assert(hwintrinsic->GetOperandCount() == 1); - addr = hwintrinsic->Op(1); + unreached(); } break; } @@ -887,6 +885,23 @@ case GT_CNS_VEC: { +#if defined(TARGET_XARCH) + if(instOptions == INS_OPTS_EVEX_b) + { + switch (simdBaseType) + { + case TYP_FLOAT: + { + float scalar = static_cast<float>(op->AsVecCon()->gtSimd32Val.f32[0]); + return OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast<float*>(&scalar), EA_4BYTE)); + } + + default: + unreached(); + } + break; + } +#endif // TARGET_XARCH switch (op->TypeGet()) { #if defined(FEATURE_SIMD) @@ -1132,7 +1147,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { case GT_HWINTRINSIC: { - if (op->IsEmbBroadcast()) + if (op->isContained()) { IsEmbBroadcastEnabled = true; } @@ -1183,30 +1198,15 @@ void CodeGen::inst_RV_RV_TT( // TODO-XArch-CQ: Commutative operations can have op1 be contained // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - OperandDesc op2Desc = genOperandDesc(op2); insOpts instOptions = INS_OPTS_NONE; #if defined(TARGET_XARCH) && 
defined(FEATURE_HW_INTRINSICS) bool IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); if (IsEmbBroadcast) { instOptions = INS_OPTS_EVEX_b; - if (op2->OperIs(GT_CNS_VEC)) - { - switch (simdBaseType) - { - case TYP_FLOAT: - { - float scalar = static_cast<float>(op2->AsVecCon()->gtSimd32Val.f32[0]); - op2Desc = OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast<float*>(&scalar), EA_4BYTE)); - break; - } - - default: - break; - } - } } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS + OperandDesc op2Desc = genOperandDesc(op2, instOptions, simdBaseType); switch (op2Desc.GetKind()) { case OperandKind::ClsVar: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index fb600f01df9a92..ec79356cf6a184 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2370,7 +2370,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) LowerNode(tmp1); node->ResetHWIntrinsicId(NI_AVX2_BroadcastScalarToVector256, tmp1); - // if AVX512 is supported, seek for optimization opportunities using embedded broadcast. // contain the broadcast intrinsics in the embedded broadcast compatible intrinsics // at codegen phase, directly emit the operand on "Create" node instead of a series of broadcast. @@ -2380,17 +2379,20 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); GenTree* CreateUser = nullptr; if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->isEmbBroadcastHWIntrinsic()) + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) { CreateUser = use.User(); } // RUIHAN: Should we contain this 2 lowered intrinsics or contain the original "Create" if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) { - node->SetEmbBroadcast(); + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // issue. 
+ if(node == CreateUser->AsHWIntrinsic()->Op(1)) + { + std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + } } } - return LowerNode(node); } @@ -7525,7 +7527,30 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector256: { - return childNode->IsEmbBroadcast(); + if(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) + { + assert(!childNode->OperIsLeaf()); + GenTree* CreateScalar = childNode->AsHWIntrinsic()->Op(1); + assert(CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); + GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); + if(Scalar->OperIs(GT_LCL_VAR) && Scalar->TypeIs(TYP_FLOAT)) + { + const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister( + opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + MakeSrcContained(CreateScalar, Scalar); + MakeSrcContained(childNode, CreateScalar); + return true; + } + else + { + return false; + } + } + else + { + return false; + } } default: @@ -7850,16 +7875,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - if (op1->OperIs(GT_HWINTRINSIC) && op1->IsEmbBroadcast()) - { - GenTree* CreateScalar = op1->AsHWIntrinsic()->Op(1); - GenTree* local = CreateScalar->AsHWIntrinsic()->Op(1); - const unsigned opLclNum = local->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister( - opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(CreateScalar, local); - MakeSrcContained(op1, CreateScalar); - } MakeSrcContained(node, op1); // Swap the operands here to make the containment checks in codegen significantly simpler From f86a9932e6af60fcc041b18984e362124e28e485 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 27 Apr 2023 00:30:38 -0700 Subject: [PATCH 11/44] apply format patch --- src/coreclr/jit/emit.h | 1 - src/coreclr/jit/emitxarch.cpp | 2 +- src/coreclr/jit/emitxarch.h | 9 +++++++-- src/coreclr/jit/gentree.cpp | 2 +- src/coreclr/jit/instr.cpp | 8 ++++---- src/coreclr/jit/lowerxarch.cpp | 15 ++++++++------- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 68411edb7d1a73..42f63b5d1c9a11 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -3674,7 +3674,6 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id) emitAttr emitter::emitGetMemOpSize(instrDesc* id) const { - emitAttr defaultSize = id->idOpSize(); instruction ins = id->idIns(); if (id->idIsEvexbContext()) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 888f18e8bef8c1..0d1c807bdfd7c5 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -6813,7 +6813,7 @@ void emitter::emitIns_R_R_C(instruction ins, id->idReg2(reg2); id->idAddr()->iiaFieldHnd = fldHnd; #if defined(TARGET_XARCH) - if ((instOptions == INS_OPTS_EVEX_b)) + if (instOptions == INS_OPTS_EVEX_b) { assert(UseEvexEncoding()); id->idSetEvexbContext(); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 8f57d80b54e161..c3a2e183cfda51 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -706,8 +706,13 @@ void emitIns_SIMD_R_R_C(instruction ins, int offs, insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_R(instruction ins, emitAttr 
attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg); -void emitIns_SIMD_R_R_S( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int varx, int offs, insOpts instOptions = INS_OPTS_NONE); +void emitIns_SIMD_R_R_S(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + int varx, + int offs, + insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_A_I( instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, int ival); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ecb0b891e87c6c..ea0ae5ee9f8e02 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25090,7 +25090,7 @@ bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const { #if defined(TARGET_XARCH) NamedIntrinsic intrinsicId = GetHWIntrinsicId(); - switch(intrinsicId) + switch (intrinsicId) { case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 6bbc12d4852c85..66587582a4900c 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -886,7 +886,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v case GT_CNS_VEC: { #if defined(TARGET_XARCH) - if(instOptions == INS_OPTS_EVEX_b) + if (instOptions == INS_OPTS_EVEX_b) { switch (simdBaseType) { @@ -895,7 +895,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v float scalar = static_cast<float>(op->AsVecCon()->gtSimd32Val.f32[0]); return OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast<float*>(&scalar), EA_4BYTE)); } - + default: unreached(); } break; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp @@ -1198,7 +1198,7 @@ void CodeGen::inst_RV_RV_TT( // TODO-XArch-CQ: Commutative operations can have op1 be contained // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - insOpts instOptions = INS_OPTS_NONE; + insOpts instOptions = INS_OPTS_NONE; #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) bool IsEmbBroadcast = CodeGenInterface::IsEmbeddedBroadcastEnabled(ins, op2); if (IsEmbBroadcast) { instOptions = INS_OPTS_EVEX_b; } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc op2Desc = genOperandDesc(op2, instOptions, simdBaseType); + OperandDesc op2Desc = genOperandDesc(op2, instOptions, simdBaseType); switch (op2Desc.GetKind()) { case OperandKind::ClsVar: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ec79356cf6a184..8096112e712001 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2386,8 +2386,9 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment issue. - if(node == CreateUser->AsHWIntrinsic()->Op(1)) + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // issue. 
+ if (node == CreateUser->AsHWIntrinsic()->Op(1)) { std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); } @@ -7527,9 +7528,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector256: { - if(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && + parentNode->OperIsEmbBroadcastHWIntrinsic()) { assert(!childNode->OperIsLeaf()); GenTree* CreateScalar = childNode->AsHWIntrinsic()->Op(1); assert(CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); if (Scalar->OperIs(GT_LCL_VAR) && Scalar->TypeIs(TYP_FLOAT)) { - const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister( - opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); MakeSrcContained(CreateScalar, Scalar); MakeSrcContained(childNode, CreateScalar); return true; From d486ed4b08412be052b933c06e824fc8f0b50038 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 27 Apr 2023 00:54:42 -0700 Subject: [PATCH 12/44] bug fixes --- src/coreclr/jit/emitxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 0d1c807bdfd7c5..beaed836d43c91 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -6866,7 +6866,7 @@ void emitter::emitIns_R_R_S( id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); #if defined(TARGET_XARCH) - if ((instOptions == INS_OPTS_EVEX_b)) + if (instOptions == INS_OPTS_EVEX_b) { assert(UseEvexEncoding()); id->idSetEvexbContext(); From 2c60838243efba097f52d530ec7a0d06a6787bd8 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 27 Apr 2023 11:02:44 -0700 Subject: [PATCH 13/44] bug fixes --- src/coreclr/jit/instr.cpp | 14 +++++++++++--- src/coreclr/jit/lowerxarch.cpp | 25 +++++++++++++++---------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 66587582a4900c..2f1070e9e0ef65 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -804,6 +804,10 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v { if (op->isContained()) { + // if the broadcast node is contained, the tree should have the form + // broadcast -> CreateScalarUnsafe -> scalar, + // so directly emit the scalar. assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_HWINTRINSIC)); op = hwintrinsic->AsHWIntrinsic()->Op(1); assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); assert(op->isContained()); return genOperandDesc(op->AsHWIntrinsic()->Op(1)); } else { @@ -1139,8 +1143,8 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { return false; } - // RUIHAN check 2 situations here - // 1. Add -> Broadcast -> CreateScalar -> LCL_VAR + // Embedded broadcast can be applied when operands are in the following forms. + // 1. Broadcast -> CreateScalar -> LCL_VAR // 2. 
CnsVec bool IsEmbBroadcastEnabled = false; switch (op->OperGet()) { case GT_HWINTRINSIC: { @@ -1167,7 +1171,11 @@ break; } - return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled; + // to enable embedded broadcast, we need 3 things: + // 1. embedded broadcast compatible intrinsics + // 2. proper forms on the intrinsic operands. + // 3. EVEX enabled. + return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled && GetEmitter()->UseEvexEncoding(); } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 8096112e712001..ba8d4f285b0a42 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2383,7 +2383,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - // RUIHAN: Should we contain this 2 lowered intrinsics or contain the original "Create" if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) { // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment @@ -7530,17 +7530,23 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) { - assert(!childNode->OperIsLeaf()); + GenTree* CreateScalar = childNode->AsHWIntrinsic()->Op(1); - assert(CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); - GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); - if (Scalar->OperIs(GT_LCL_VAR) && Scalar->TypeIs(TYP_FLOAT)) + if(CreateScalar->OperIs(GT_HWINTRINSIC) && CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { - const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(CreateScalar, Scalar); - MakeSrcContained(childNode, CreateScalar); - return true; + GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); + if (Scalar->OperIs(GT_LCL_VAR) && Scalar->TypeIs(TYP_FLOAT)) + { + const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + MakeSrcContained(CreateScalar, Scalar); + MakeSrcContained(childNode, CreateScalar); + return true; + } + else + { + return false; + } } else { From 172861e095520e759fec373329b6dca4aae37eb0 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 28 Apr 2023 09:57:35 -0700 Subject: [PATCH 14/44] apply format patch --- src/coreclr/jit/lowerxarch.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ba8d4f285b0a42..a68d60c163da26 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7530,9 +7530,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) { - + GenTree* CreateScalar = childNode->AsHWIntrinsic()->Op(1); - if(CreateScalar->OperIs(GT_HWINTRINSIC) && CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) + if (CreateScalar->OperIs(GT_HWINTRINSIC) && + CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { GenTree* Scalar = 
CreateScalar->AsHWIntrinsic()->Op(1); if (Scalar->OperIs(GT_LCL_VAR) && Scalar->TypeIs(TYP_FLOAT)) From 02c61c7ed0da9f3f9dab4f447f692f1b8083962c Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 1 May 2023 11:02:56 -0700 Subject: [PATCH 15/44] Enable embedded broadcast for Vector128.Add --- src/coreclr/jit/gentree.cpp | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 4 ++-- src/coreclr/jit/instr.cpp | 1 + src/coreclr/jit/lowerxarch.cpp | 24 ++++++++++++++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ea0ae5ee9f8e02..2b6e40f162fa2c 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19138,6 +19138,7 @@ bool GenTree::isContainableHWIntrinsic() const } case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector128: { return true; } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 90e0862485d771..39625449ae7fb9 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -360,7 +360,7 @@ HARDWARE_INTRINSIC(X86Base_X64, DivRem, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE Intrinsics -HARDWARE_INTRINSIC(SSE, Add, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE, Add, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -459,7 +459,7 @@ HARDWARE_INTRINSIC(SSE_X64, ConvertScalarToVector128Single, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SSE2 Intrinsics -HARDWARE_INTRINSIC(SSE2, Add, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, 
INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(SSE2, AddSaturate, 16, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 2f1070e9e0ef65..1d56ad214dbc8f 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -801,6 +801,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v switch (intrinsicId) { case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector128: { if (op->isContained()) { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index a68d60c163da26..044fb14d05eb7a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2468,6 +2468,29 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) node->ChangeHWIntrinsicId(NI_AVX2_BroadcastScalarToVector128, tmp1); + // if AVX512 is supported, seek for optimization opportunities using embedded broadcast. + // contain the broadcast intrinsics in the embedded broadcast compatible intrinsics + // at codegen phase, directly emit the operand on "Create" node instead of a series of broadcast. + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + { + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + GenTree* CreateUser = nullptr; + if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + { + CreateUser = use.User(); + } + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) + { + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // issue. 
+ if (node == CreateUser->AsHWIntrinsic()->Op(1)) + { + std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + } + } + } return LowerNode(node); } @@ -7526,6 +7549,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector128: { if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) From 2a6f8a7f755e87e75a97e4b4febadcf8792a56bf Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 2 May 2023 10:27:28 -0700 Subject: [PATCH 16/44] Enable embedded broadcast for Vector512.Add --- src/coreclr/jit/gentree.cpp | 1 + src/coreclr/jit/instr.cpp | 1 + src/coreclr/jit/lowerxarch.cpp | 26 +++++++++++++++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2b6e40f162fa2c..6317bc506d9002 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19139,6 +19139,7 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX512F_BroadcastScalarToVector512: { return true; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 1d56ad214dbc8f..d28a33857a4d03 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -802,6 +802,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v { case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX512F_BroadcastScalarToVector512: { if (op->isContained()) { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 044fb14d05eb7a..30c1854dc2d7ee 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2333,13 +2333,36 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) case TYP_UINT: case TYP_LONG: case TYP_ULONG: - case TYP_FLOAT: case TYP_DOUBLE: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); break; } + + case TYP_FLOAT: + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + GenTree* CreateUser = nullptr; + if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + { + CreateUser = use.User(); + } + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) + { + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // issue. 
+ if (node == CreateUser->AsHWIntrinsic()->Op(1)) + { + std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + } + } + break; + } default: { unreached(); @@ -7573,6 +7573,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX512F_BroadcastScalarToVector512: { if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) From b036bcdb167b9f88810e64d884558c875a7fdffe Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 2 May 2023 14:23:15 -0700 Subject: [PATCH 17/44] support double as an embedded broadcast type --- src/coreclr/jit/emit.h | 2 ++ src/coreclr/jit/gentree.cpp | 1 + src/coreclr/jit/instr.cpp | 5 +++++ src/coreclr/jit/instrsxarch.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 34 ++++++++++++++++++++++++++++++---- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 42f63b5d1c9a11..7e12f961746fd0 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -3685,6 +3685,8 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id) const { case 4: return EA_4BYTE; + case 8: + return EA_8BYTE; default: break; diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 6317bc506d9002..3281a3cf861181 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19140,6 +19140,7 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX512F_BroadcastScalarToVector512: + case NI_SSE3_MoveAndDuplicate: { return true; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d28a33857a4d03..e31fd973d8aa36 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -803,7 +803,12 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX512F_BroadcastScalarToVector512: + case NI_SSE3_MoveAndDuplicate: { + if(intrinsicId == NI_SSE3_MoveAndDuplicate) + { + assert(simdBaseType == TYP_DOUBLE); + } if (op->isContained()) { // if broadcast node is contained, should mean that we have some forms like diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 6afef063004e5b..f9b1bbb736aac8 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -240,7 +240,7 @@ INST3(unpcklps, "unpcklps", IUM_WR, BAD_CODE, BAD_CODE, INST3(xorps, "xorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles // SSE2 -INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed doubles +INST3(addpd, "addpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x58), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed doubles INST3(addsd, "addsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x58), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar doubles INST3(andnpd, "andnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), INS_TT_FULL, 
Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles INST3(andpd, "andpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 30c1854dc2d7ee..4fbe716920e4d8 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2333,7 +2333,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) case TYP_UINT: case TYP_LONG: case TYP_ULONG: - case TYP_DOUBLE: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); @@ -2341,6 +2340,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) } case TYP_FLOAT: + case TYP_DOUBLE: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); @@ -2352,7 +2352,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) { // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment // issue. @@ -2406,7 +2406,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) { // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment // issue. @@ -2745,6 +2745,26 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Sse3.MoveAndDuplicate(tmp1); node->ChangeHWIntrinsicId(NI_SSE3_MoveAndDuplicate, tmp1); + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + { + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + GenTree* CreateUser = nullptr; + if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + { + CreateUser = use.User(); + } + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_DOUBLE)) + { + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // issue. + if (node == CreateUser->AsHWIntrinsic()->Op(1)) + { + std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + } + } + } break; } @@ -7574,7 +7594,13 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX512F_BroadcastScalarToVector512: + case NI_SSE3_MoveAndDuplicate: { + if (intrinsicId == NI_SSE3_MoveAndDuplicate) + { + // NI_SSE3_MoveAndDuplicate is for Vector128 only. 
+ assert(childNode->AsHWIntrinsic()->GetSimdBaseType() == TYP_DOUBLE); + } if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastHWIntrinsic()) { @@ -7584,7 +7610,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); - if (Scalar->OperIs(GT_LCL_VAR) && Scalar->TypeIs(TYP_FLOAT)) + if (Scalar->OperIs(GT_LCL_VAR) && (Scalar->TypeIs(TYP_FLOAT) || Scalar->TypeIs(TYP_DOUBLE))) { const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); From 4358ee04398b429ef7d46b4cfa239704a40cf572 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 2 May 2023 14:27:06 -0700 Subject: [PATCH 18/44] Add EB support to AVX_BroadcastScalarToVector* --- src/coreclr/jit/gentree.cpp | 6 ++++-- src/coreclr/jit/instr.cpp | 6 ++++-- src/coreclr/jit/lowerxarch.cpp | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3281a3cf861181..df7b8e00d4a25e 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19137,10 +19137,12 @@ bool GenTree::isContainableHWIntrinsic() const return true; } - case NI_AVX2_BroadcastScalarToVector256: + case NI_SSE3_MoveAndDuplicate: + case NI_AVX_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: - case NI_SSE3_MoveAndDuplicate: { return true; } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index e31fd973d8aa36..39923dd0410065 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -800,10 +800,12 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v switch (intrinsicId) { - case NI_AVX2_BroadcastScalarToVector256: + case NI_SSE3_MoveAndDuplicate: + case NI_AVX_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: - case NI_SSE3_MoveAndDuplicate: { if(intrinsicId == NI_SSE3_MoveAndDuplicate) { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 4fbe716920e4d8..a866ad7e6217af 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7591,10 +7591,12 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return false; } - case NI_AVX2_BroadcastScalarToVector256: + case NI_SSE3_MoveAndDuplicate: + case NI_AVX_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: - case NI_SSE3_MoveAndDuplicate: { if (intrinsicId == NI_SSE3_MoveAndDuplicate) { From 3a9093aa151f46dde324f8f61089be0f6270e54f Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 2 May 2023 15:05:17 -0700 Subject: [PATCH 19/44] apply format patch --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/instr.cpp | 2 +- src/coreclr/jit/lowerxarch.cpp | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 84d8b7d207a2a8..99719ee119f7d4 100644 --- 
a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1545,7 +1545,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX regNumber op1Reg, GenTree* op2, bool isRMW, - var_types simdBaseType); + var_types simdBaseType = TYP_UNKNOWN); void inst_RV_RV_TT_IV( instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW); #endif diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 39923dd0410065..c24a25cfa51935 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -807,7 +807,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: { - if(intrinsicId == NI_SSE3_MoveAndDuplicate) + if (intrinsicId == NI_SSE3_MoveAndDuplicate) { assert(simdBaseType == TYP_DOUBLE); } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index a866ad7e6217af..7cffdf1929e9eb 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2352,7 +2352,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && + (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) { // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment // issue. @@ -2406,7 +2407,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && + (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) { // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment // issue. @@ -2751,13 +2753,14 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); GenTree* CreateUser = nullptr; if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) { CreateUser = use.User(); } if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_DOUBLE)) { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the + // containment // issue. 
if (node == CreateUser->AsHWIntrinsic()->Op(1)) { From d018d9904e86b1b4667ba752fb4e400c554d97bd Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 3 May 2023 11:13:48 -0700 Subject: [PATCH 20/44] Enable embedded broadcast for double const vector --- src/coreclr/jit/gentree.cpp | 4 ++++ src/coreclr/jit/instr.cpp | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index df7b8e00d4a25e..2a149f5da5849d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21666,6 +21666,10 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, { double cnsVal = static_cast<double>(op1->AsDblCon()->DconValue()); +#if defined(TARGET_XARCH) + vecCon->SetCreatedFromScalar(); +#endif // TARGET_XARCH + for (unsigned i = 0; i < (simdSize / 8); i++) { vecCon->gtSimdVal.f64[i] = cnsVal; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index c24a25cfa51935..b8efbb52cb76a3 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -905,10 +905,16 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v { case TYP_FLOAT: { - float scalar = static_cast<float>(op->AsVecCon()->gtSimd32Val.f32[0]); + float scalar = static_cast<float>(op->AsVecCon()->gtSimdVal.f32[0]); return OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast<float*>(&scalar), EA_4BYTE)); } + case TYP_DOUBLE: + { + double scalar = static_cast<double>(op->AsVecCon()->gtSimdVal.f64[0]); + return OperandDesc(emit->emitFltOrDblConst(scalar, EA_8BYTE)); + } + default: unreached(); } From 7557db7f302a0a6826a3592f3905ccd85c6b1e09 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 3 May 2023 17:09:34 -0700 Subject: [PATCH 21/44] Enable embedded broadcast for integer Add. --- src/coreclr/jit/gentree.cpp | 7 +++ src/coreclr/jit/instr.cpp | 60 ++++++++++++++++++-- src/coreclr/jit/instrsxarch.h | 4 +- src/coreclr/jit/lowerxarch.cpp | 101 ++++++++++++++++++++++--------- 4 files changed, 138 insertions(+), 34 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2a149f5da5849d..45ce00826a95a9 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21628,6 +21628,10 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, { uint32_t cnsVal = static_cast<uint32_t>(op1->AsIntConCommon()->IntegralValue()); +#if defined(TARGET_XARCH) + vecCon->SetCreatedFromScalar(); +#endif // TARGET_XARCH + for (unsigned i = 0; i < (simdSize / 4); i++) { vecCon->gtSimdVal.u32[i] = cnsVal; @@ -21639,6 +21643,9 @@ case TYP_ULONG: { uint64_t cnsVal = static_cast<uint64_t>(op1->AsIntConCommon()->IntegralValue()); +#if defined(TARGET_XARCH) + vecCon->SetCreatedFromScalar(); +#endif // TARGET_XARCH for (unsigned i = 0; i < (simdSize / 8); i++) { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index b8efbb52cb76a3..f03ef2cbfc4660 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -816,11 +816,35 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v // if broadcast node is contained, should mean that we have some forms like // broadcast -> CreateScalarUnsafe -> scalar. // if so, directly emit scalar. 
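// Illustration only (not in the JIT): the contained shape matched here is
// Broadcast -> CreateScalarUnsafe -> scalar; peeling two links yields the
// scalar that can then be referenced directly as the instruction's memory
// operand.
//
//   struct SketchNode
//   {
//       SketchNode* op1; // mirrors GenTreeHWIntrinsic::Op(1)
//   };
//
//   SketchNode* PeelBroadcastChain(SketchNode* broadcast)
//   {
//       SketchNode* createScalar = broadcast->op1; // CreateScalarUnsafe
//       return createScalar->op1;                  // the scalar, e.g. a LCL_VAR
//   }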
- assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_HWINTRINSIC)); - op = hwintrinsic->AsHWIntrinsic()->Op(1); - assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); - assert(op->isContained()); - return genOperandDesc(op->AsHWIntrinsic()->Op(1)); + switch (simdBaseType) + { + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + { + // a special case is when the operand of CreateScalarUnsafe is in integer type, + // CreateScalarUnsafe node will be folded, so we directly match a pattern of + // broadcast -> LCL_VAR(TYP_(U)INT) + assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)); + op = hwintrinsic->Op(1); + assert(op->isContained()); + return genOperandDesc(op); + } + + case TYP_FLOAT: + case TYP_DOUBLE: + { + assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_HWINTRINSIC)); + op = hwintrinsic->Op(1); + assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); + assert(op->isContained()); + return genOperandDesc(op->AsHWIntrinsic()->Op(1)); + } + + default: + unreached(); + } } else { @@ -901,6 +925,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v #if defined(TARGET_XARCH) if (instOptions == INS_OPTS_EVEX_b) { + assert(op->isContained()); switch (simdBaseType) { case TYP_FLOAT: @@ -915,6 +940,31 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v return OperandDesc(emit->emitFltOrDblConst(scalar, EA_8BYTE)); } + case TYP_INT: + { + uint32_t scalar = static_cast<uint32_t>(op->AsVecCon()->gtSimdVal.i32[0]); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 4, 4, TYP_INT); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + case TYP_UINT: + { + uint32_t scalar = static_cast<uint32_t>(op->AsVecCon()->gtSimdVal.u32[0]); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 4, 4, TYP_UINT); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + case TYP_LONG: + { + uint64_t scalar = static_cast<uint64_t>(op->AsVecCon()->gtSimdVal.i64[0]); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 8, 8, TYP_LONG); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + case TYP_ULONG: + { + uint64_t scalar = static_cast<uint64_t>(op->AsVecCon()->gtSimdVal.u64[0]); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 8, 8, TYP_ULONG); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + default: unreached(); } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index f9b1bbb736aac8..4cb9cc2955364e 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -290,8 +290,8 @@ INST3(packssdw, "packssdw", IUM_WR, BAD_CODE, BAD_CODE, INST3(packsswb, "packsswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation INST3(packuswb, "packuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation INST3(paddb, "paddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers -INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers -INST3(paddq, "paddq", IUM_WR, BAD_CODE, 
BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers +INST3(paddd, "paddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed double-word (32-bit) integers +INST3(paddq, "paddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Add packed quad-word (64-bit) integers INST3(paddsb, "paddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results INST3(paddsw, "paddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), INS_TT_FULL_MEM, Input_16Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results INST3(paddusb, "paddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), INS_TT_FULL_MEM, Input_8Bit | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 7cffdf1929e9eb..70ce918a4f5cd9 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2329,18 +2329,13 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) node->ResetHWIntrinsicId(NI_AVX512BW_BroadcastScalarToVector512, tmp1); break; } + case TYP_INT: case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); - node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); - break; - } - case TYP_FLOAT: case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); @@ -2352,8 +2347,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && - (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR)) { // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment // issue. @@ -2407,14 +2401,29 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && - (op1->TypeIs(TYP_FLOAT) || op1->TypeIs(TYP_DOUBLE))) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR)) { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment - // issue. - if (node == CreateUser->AsHWIntrinsic()->Op(1)) + switch (op1->TypeGet()) { - std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + case TYP_FLOAT: + case TYP_DOUBLE: + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + { + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the + // containment + // issue. 
+ if (node == CreateUser->AsHWIntrinsic()->Op(1)) + { + std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + } + break; + } + + default: + break; } } } @@ -2506,13 +2515,28 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { CreateUser = use.User(); } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_FLOAT)) + if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR)) { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment - // issue. - if (node == CreateUser->AsHWIntrinsic()->Op(1)) + switch (op1->TypeGet()) { - std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + { + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the + // containment + // issue. + if (node == CreateUser->AsHWIntrinsic()->Op(1)) + { + std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); + } + break; + } + + default: + break; } } } @@ -7615,19 +7639,42 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); - if (Scalar->OperIs(GT_LCL_VAR) && (Scalar->TypeIs(TYP_FLOAT) || Scalar->TypeIs(TYP_DOUBLE))) + if (Scalar->OperIs(GT_LCL_VAR)) { - const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(CreateScalar, Scalar); - MakeSrcContained(childNode, CreateScalar); - return true; + switch (Scalar->TypeGet()) + { + case TYP_FLOAT: + case TYP_DOUBLE: + { + const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister( + opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + MakeSrcContained(CreateScalar, Scalar); + MakeSrcContained(childNode, CreateScalar); + return true; + } + + default: + return false; + } } else { return false; } } + else if (CreateScalar->OperIs(GT_LCL_VAR)) + { + // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be + // folded, so we need to handle this case specially. + assert(CreateScalar->TypeIs(TYP_INT) || CreateScalar->TypeIs(TYP_UINT) || + CreateScalar->TypeIs(TYP_LONG) || CreateScalar->TypeIs(TYP_ULONG)); + const unsigned opLclNum = CreateScalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + MakeSrcContained(childNode, CreateScalar); + return true; + } else { return false; From 867eaf0d91c8d9463aab48ef49d737865481bb34 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 4 May 2023 14:14:04 -0700 Subject: [PATCH 22/44] Changes based on the review: 1. Change GenTreeHWIntrinsic::OperIsEmbBroadcastHWIntrinsic to OperIsEmbBroadcastCompatible 2. removed OperIsBroadcastScalar 3. formatting 4. correct errors in the comments. 
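For reference, the machine-level effect these patches are steering toward: with the EVEX.b bit set, a 64-bit-input instruction such as vaddpd zmm0, zmm1, qword ptr [rax]{1to8} reads a single scalar from memory and replicates it across every lane, replacing a full 64-byte constant load. A standalone C++ model of that lane replication (illustration only; the names here are invented, not the JIT's):

#include <array>
#include <cstddef>

// Models dst = left + broadcast(*mem), the semantics of a {1toN}
// embedded-broadcast memory operand: one scalar feeds all N lanes.
template <typename T, std::size_t N>
std::array<T, N> AddEmbeddedBroadcast(const std::array<T, N>& left, const T* mem)
{
    std::array<T, N> result{};
    for (std::size_t i = 0; i < N; i++)
    {
        result[i] = left[i] + *mem;
    }
    return result;
}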
--- src/coreclr/jit/emit.h | 2 +- src/coreclr/jit/emitxarch.h | 2 -- src/coreclr/jit/gentree.cpp | 27 ++------------- src/coreclr/jit/gentree.h | 4 +-- src/coreclr/jit/instr.cpp | 63 +++++++++++++++------------------- src/coreclr/jit/lowerxarch.cpp | 10 +++--- 6 files changed, 37 insertions(+), 71 deletions(-) diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 7e12f961746fd0..61f261088fb1bb 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -817,7 +817,7 @@ class emitter //////////////////////////////////////////////////////////////////////// // Space taken up to here: - // x86: 46 bits + // x86: 47 bits // amd64: 46 bits // arm: 48 bits // arm64: 50 bits diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index c3a2e183cfda51..05091df08b60d5 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -696,8 +696,6 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); -void emitIns_SIMD_R_R_AR( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset); void emitIns_SIMD_R_R_C(instruction ins, emitAttr attr, regNumber targetReg, diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 45ce00826a95a9..2b3624bdc8edf1 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25097,35 +25097,12 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const } //------------------------------------------------------------------------ -// OperIsBroadcastScalar: Is this HWIntrinsic a broadcast node from scalar. -// -// Return Value: -// Whether "this" is a broadcast node from scalar. -// -bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const -{ -#if defined(TARGET_XARCH) - NamedIntrinsic intrinsicId = GetHWIntrinsicId(); - switch (intrinsicId) - { - case NI_AVX2_BroadcastScalarToVector128: - case NI_AVX2_BroadcastScalarToVector256: - return true; - default: - return false; - } -#else - return false; -#endif -} - -//------------------------------------------------------------------------ -// OperIsEmbBroadcastHWIntrinsic: Checks if the intrinsic is a embedded broadcast compatible inintrsic. +// OperIsEmbBroadcastCompatible: Checks if the intrinsic is an embedded broadcast compatible intrinsic. // // Return Value: // true if the intrisic node lowering instruction is embedded broadcast compatible. 
// -bool GenTreeHWIntrinsic::OperIsEmbBroadcastHWIntrinsic() const +bool GenTreeHWIntrinsic::OperIsEmbBroadcastCompatible() const { return HWIntrinsicInfo::IsEmbBroadcastCompatible(GetHWIntrinsicId()); } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 38856f5583f75f..99d33fcb4d640c 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6231,8 +6231,7 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryStore(GenTree** pAddr = nullptr) const; bool OperIsMemoryLoadOrStore() const; bool OperIsMemoryStoreOrBarrier() const; - bool OperIsBroadcastScalar() const; - bool OperIsEmbBroadcastHWIntrinsic() const; + bool OperIsEmbBroadcastCompatible() const; bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; @@ -6338,7 +6337,6 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic // struct GenTreeVecCon : public GenTree { - union { simd8_t gtSimd8Val; simd12_t gtSimd12Val; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index f03ef2cbfc4660..c3b05219fedb96 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -811,44 +811,37 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v { assert(simdBaseType == TYP_DOUBLE); } - if (op->isContained()) + // if broadcast node is contained, should mean that we have some forms like + // broadcast -> CreateScalarUnsafe -> scalar. + // if so, directly emit scalar. + switch (simdBaseType) { - // if broadcast node is contained, should mean that we have some forms like - // broadcast -> CreateScalarUnsafe -> scalar. - // if so, directly emit scalar. - switch (simdBaseType) + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: { - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - // a special case is when the operand of CreateScalarUnsafe is in integer type, - // CreateScalarUnsafe node will be folded, so we directly match a pattern of - // broadcast -> LCL_VAR(TYP_(U)INT) - assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)); - op = hwintrinsic->Op(1); - assert(op->isContained()); - return genOperandDesc(op); - } - - case TYP_FLOAT: - case TYP_DOUBLE: - { - assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_HWINTRINSIC)); - op = hwintrinsic->Op(1); - assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); - assert(op->isContained()); - return genOperandDesc(op->AsHWIntrinsic()->Op(1)); - } - - default: - unreached(); + // a special case is when the operand of CreateScalarUnsafe is in integer type, + // CreateScalarUnsafe node will be folded, so we directly match a pattern of + // broadcast -> LCL_VAR(TYP_(U)INT) + assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)); + op = hwintrinsic->Op(1); + assert(op->isContained()); + return genOperandDesc(op); } - } - else - { - unreached(); + + case TYP_FLOAT: + case TYP_DOUBLE: + { + assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_HWINTRINSIC)); + op = hwintrinsic->Op(1); + assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); + assert(op->isContained()); + return genOperandDesc(op->AsHWIntrinsic()->Op(1)); + } + + default: + unreached(); } break; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 70ce918a4f5cd9..de2f4c9a2e39f2 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2343,7 +2343,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); GenTree* 
CreateUser = nullptr; if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { CreateUser = use.User(); } @@ -2397,7 +2397,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); GenTree* CreateUser = nullptr; if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { CreateUser = use.User(); } @@ -2511,7 +2511,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); GenTree* CreateUser = nullptr; if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { CreateUser = use.User(); } @@ -2777,7 +2777,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) bool foundUse = BlockRange().TryGetUse(node, &use); GenTree* CreateUser = nullptr; if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastHWIntrinsic()) + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { CreateUser = use.User(); } @@ -7631,7 +7631,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre assert(childNode->AsHWIntrinsic()->GetSimdBaseType() == TYP_DOUBLE); } if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && - parentNode->OperIsEmbBroadcastHWIntrinsic()) + parentNode->OperIsEmbBroadcastCompatible()) { GenTree* CreateScalar = childNode->AsHWIntrinsic()->Op(1); From 3f4d95b65c67de804ec12c0ce40cfcfaa0442d76 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 5 May 2023 10:59:16 -0700 Subject: [PATCH 23/44] removed the gentree flag: GTF_VECCON_FROMSCALAR --- src/coreclr/jit/gentree.cpp | 15 -------------- src/coreclr/jit/gentree.h | 18 ------------------ src/coreclr/jit/instr.cpp | 40 ++++++++++++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2b3624bdc8edf1..d15dc2b73366f7 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -21628,10 +21628,6 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, { uint32_t cnsVal = static_cast<uint32_t>(op1->AsIntConCommon()->IntegralValue()); -#if defined(TARGET_XARCH) - vecCon->SetCreatedFromScalar(); -#endif // TARGET_XARCH - for (unsigned i = 0; i < (simdSize / 4); i++) { vecCon->gtSimdVal.u32[i] = cnsVal; @@ -21639,9 +21639,6 @@ case TYP_ULONG: { uint64_t cnsVal = static_cast<uint64_t>(op1->AsIntConCommon()->IntegralValue()); -#if defined(TARGET_XARCH) - vecCon->SetCreatedFromScalar(); -#endif // TARGET_XARCH for (unsigned i = 0; i < (simdSize / 8); i++) { @@ -21658,10 +21651,6 @@ { float cnsVal = static_cast<float>(op1->AsDblCon()->DconValue()); -#if defined(TARGET_XARCH) - vecCon->SetCreatedFromScalar(); 
-#endif // TARGET_XARCH - for (unsigned i = 0; i < (simdSize / 4); i++) { vecCon->gtSimdVal.f32[i] = cnsVal; @@ -21673,10 +21662,6 @@ { double cnsVal = static_cast<double>(op1->AsDblCon()->DconValue()); -#if defined(TARGET_XARCH) - vecCon->SetCreatedFromScalar(); -#endif // TARGET_XARCH - for (unsigned i = 0; i < (simdSize / 8); i++) { vecCon->gtSimdVal.f64[i] = cnsVal; diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 99d33fcb4d640c..cb85c21c70ef51 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -556,7 +556,6 @@ enum GenTreeFlags : unsigned int GTF_MDARRLOWERBOUND_NONFAULTING = 0x20000000, // GT_MDARR_LOWER_BOUND -- An MD array lower bound operation that cannot fault. Same as GT_IND_NONFAULTING. - GTF_VECCON_FROMSCALAR = 0x80000000, // GT_VECCON -- Indicate the vector constant is created from the same scalar. }; inline constexpr GenTreeFlags operator ~(GenTreeFlags a) @@ -2019,23 +2018,6 @@ struct GenTree ClearRegOptional(); } - bool IsCreatedFromScalar() - { - return ((gtFlags & GTF_VECCON_FROMSCALAR) != 0); - } - - void SetCreatedFromScalar() - { - gtFlags |= GTF_VECCON_FROMSCALAR; - assert(IsCreatedFromScalar()); - } - - void ClearCreatedFromScalar() - { - gtFlags &= ~GTF_VECCON_FROMSCALAR; - assert(!IsCreatedFromScalar()); - } - bool CanCSE() const { return ((gtFlags & GTF_DONT_CSE) == 0); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index c3b05219fedb96..6381a8a3863311 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1201,6 +1201,8 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) return false; } + insFlags inputSize = static_cast<insFlags>((CodeGenInterface::instInfo[ins] & Input_Mask)); + // Embedded broadcast can be applied when operands are in the following forms. // 1. Broadcast -> CreateScalar -> LCL_VAR // 2. CnsVec @@ -1218,7 +1220,43 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) case GT_CNS_VEC: { - if (op->IsCreatedFromScalar()) + var_types simdType = op->TypeGet(); + bool IsIdentical = true; + switch(inputSize) + { + case Input_32Bit: + { + uint32_t FirstElement = static_cast<uint32_t>(op->AsVecCon()->gtSimdVal.u32[0]); + for(unsigned i = 1; i < genTypeSize(simdType) / 4; i++) + { + uint32_t ElementToCheck = static_cast<uint32_t>(op->AsVecCon()->gtSimdVal.u32[i]); + if(FirstElement != ElementToCheck) + { + IsIdentical = false; + break; + } + } + break; + } + case Input_64Bit: + { + uint64_t FirstElement = static_cast<uint64_t>(op->AsVecCon()->gtSimdVal.u64[0]); + for(unsigned i = 1; i < genTypeSize(simdType) / 8; i++) + { + uint64_t ElementToCheck = static_cast<uint64_t>(op->AsVecCon()->gtSimdVal.u64[i]); + if(FirstElement != ElementToCheck) + { + IsIdentical = false; + break; + } + } + break; + } + default: + unreached(); + } + + if(IsIdentical) { IsEmbBroadcastEnabled = true; } From 32fd87ad34312c5c1129e11af00722fbeca6ef9e Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 5 May 2023 11:53:19 -0700 Subject: [PATCH 24/44] Bug fixes on embedded broadcast with AVX_Broadcast --- src/coreclr/jit/instr.cpp | 8 ++++---- src/coreclr/jit/lowerxarch.cpp | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 6381a8a3863311..6aa6425c5a6a51 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -801,12 +801,12 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v switch (intrinsicId) { case NI_SSE3_MoveAndDuplicate: - case NI_AVX_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector128: - case NI_AVX_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: + // NI_AVX_BroadcastScalarToVector* will use the 
default path and emit LCL_ADDR directly. { + assert(op->isContained()); if (intrinsicId == NI_SSE3_MoveAndDuplicate) { assert(simdBaseType == TYP_DOUBLE); } @@ -824,7 +824,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v // a special case is when the operand of CreateScalarUnsafe is in integer type, // CreateScalarUnsafe node will be folded, so we directly match a pattern of // broadcast -> LCL_VAR(TYP_(U)INT) - assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)); + assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR)); op = hwintrinsic->Op(1); assert(op->isContained()); return genOperandDesc(op); @@ -833,7 +833,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v case TYP_FLOAT: case TYP_DOUBLE: { - assert(op->AsHWIntrinsic()->Op(1)->OperIs(GT_HWINTRINSIC)); + assert(hwintrinsic->Op(1)->OperIs(GT_HWINTRINSIC)); op = hwintrinsic->Op(1); assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); assert(op->isContained()); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index de2f4c9a2e39f2..604beaed84558d 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1631,6 +1631,23 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerFusedMultiplyAdd(node); break; + case NI_AVX_BroadcastScalarToVector256: + case NI_AVX_BroadcastScalarToVector128: + { + GenTree* op = node->Op(1); + assert(op->OperIs(GT_LCL_ADDR)); + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) + { + if(node == use.User()->AsHWIntrinsic()->Op(1)) + { + std::swap(use.User()->AsHWIntrinsic()->Op(1), use.User()->AsHWIntrinsic()->Op(2)); + } + } + break; + } + default: break; } @@ -7619,9 +7636,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } case NI_SSE3_MoveAndDuplicate: - case NI_AVX_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector128: - case NI_AVX_BroadcastScalarToVector256: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: { @@ -7686,6 +7701,20 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } } + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + { + assert(childNode->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_ADDR)); + if(parentNode->OperIsEmbBroadcastCompatible()) + { + return true; + } + else + { + return false; + } + } + default: { assert(!childNode->isContainableHWIntrinsic()); From 4f97298efab0e20e4e4da066ccb53330ba4fa092 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 8 May 2023 11:30:28 -0700 Subject: [PATCH 25/44] enable embedded broadcast in R_R_A path --- src/coreclr/jit/emitxarch.cpp | 14 +++++++++++--- src/coreclr/jit/emitxarch.h | 8 ++++---- src/coreclr/jit/instr.cpp | 24 ++++++++++++++++++++++-- src/coreclr/jit/lowerxarch.cpp | 10 +++++++--- 4 files changed, 44 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index beaed836d43c91..938495af87d8a8 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -6677,7 +6677,7 @@ void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int emitCurIGsize += sz; } -void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir) +void emitter::emitIns_R_R_A(instruction ins, 
emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); @@ -6688,6 +6688,13 @@ void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regN id->idIns(ins); id->idReg1(reg1); id->idReg2(reg2); +#if defined(TARGET_XARCH) + if (instOptions == INS_OPTS_EVEX_b) + { + assert(UseEvexEncoding()); + id->idSetEvexbContext(); + } +#endif // TARGET_XARCH emitHandleMemOp(indir, id, (ins == INS_mulx) ? IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); @@ -8156,11 +8163,11 @@ void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targe // indir -- The GenTreeIndir used for the memory address // void emitter::emitIns_SIMD_R_R_A( - instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir) + instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, insOpts instOptions) { if (UseSimdEncoding()) { - emitIns_R_R_A(ins, attr, targetReg, op1Reg, indir); + emitIns_R_R_A(ins, attr, targetReg, op1Reg, indir, instOptions); } else { @@ -16697,6 +16704,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) else { code = AddSimdPrefixIfNeeded(id, code, size); + code = AddEvexbBitIfNeeded(id, code); regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 05091df08b60d5..ecb26cf07c122b 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -573,7 +573,7 @@ void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival); -void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir); +void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); @@ -592,10 +592,10 @@ void emitIns_R_R_C(instruction ins, regNumber reg2, CORINFO_FIELD_HANDLE fldHnd, int offs, - insOpts instOptions); + insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_S( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions); + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); @@ -695,7 +695,7 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); -void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir); +void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir, insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_C(instruction ins, emitAttr attr, regNumber targetReg, diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 6aa6425c5a6a51..d09e7f0804f7aa 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -800,11 +800,31 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts 
instOptions, v switch (intrinsicId) { + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + { + //we have the assumption that AVX_BroadcastScalarToVector* + //only take the memory address as the operand. + assert(hwintrinsic->isContained()); + assert(hwintrinsic->OperIsMemoryLoad()); + assert(hwintrinsic->GetOperandCount() == 1); + GenTree* BroadcastScalar = hwintrinsic->Op(1); + if(BroadcastScalar->OperIs(GT_LCL_ADDR)) + { + addr = hwintrinsic->Op(1); + break; + } + else + { + assert(BroadcastScalar->OperIs(GT_LCL_VAR)); + return OperandDesc(simdBaseType, BroadcastScalar); + } + } + case NI_SSE3_MoveAndDuplicate: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: - // NI_AVX_BroadcastScalarToVector* will use the default path and emit LCL_ADDR directly. { assert(op->isContained()); if (intrinsicId == NI_SSE3_MoveAndDuplicate) { assert(simdBaseType == TYP_DOUBLE); } @@ -1329,7 +1349,7 @@ void CodeGen::inst_RV_RV_TT(instruction ins, // temporary GT_IND to generate code with. GenTreeIndir indirForm; GenTreeIndir* indir = op2Desc.GetIndirForm(&indirForm); - emit->emitIns_SIMD_R_R_A(ins, size, targetReg, op1Reg, indir); + emit->emitIns_SIMD_R_R_A(ins, size, targetReg, op1Reg, indir, instOptions); } break; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 604beaed84558d..c212a1246daa60 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1634,12 +1634,15 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX_BroadcastScalarToVector256: case NI_AVX_BroadcastScalarToVector128: { - GenTree* op = node->Op(1); - assert(op->OperIs(GT_LCL_ADDR)); + // there can be 2 cases hitting AVX_BroadcastScalarToVector* + // 1. pass the address as LCL_ADDR to AVX.BroadcastScalarToVector256() API + // 2. 
pass the address as LCL_VAR to AVX.BroadcastScalarToVector256() API LIR::Use use; bool foundUse = BlockRange().TryGetUse(node, &use); if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { + GenTree* op = node->Op(1); + assert(op->OperIs(GT_LCL_ADDR) || op->OperIs(GT_LCL_VAR)); if(node == use.User()->AsHWIntrinsic()->Op(1)) { std::swap(use.User()->AsHWIntrinsic()->Op(1), use.User()->AsHWIntrinsic()->Op(2)); @@ -7704,7 +7707,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: { - assert(childNode->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_ADDR)); + GenTree* childNodeOp = hwintrinsic->Op(1); + assert(childNodeOp->OperIs(GT_LCL_ADDR, GT_LCL_VAR)); if(parentNode->OperIsEmbBroadcastCompatible()) { return true; From a5c441443243efd24638485fcc7e0faa6f942fbb Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 8 May 2023 11:34:37 -0700 Subject: [PATCH 26/44] apply format patch --- src/coreclr/jit/emitxarch.cpp | 3 ++- src/coreclr/jit/emitxarch.h | 23 +++++++++++++++++++---- src/coreclr/jit/instr.cpp | 22 +++++++++++----------- src/coreclr/jit/lowerxarch.cpp | 11 ++++++----- 4 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 938495af87d8a8..9a3a917c7104ad 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -6677,7 +6677,8 @@ void emitter::emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int emitCurIGsize += sz; } -void emitter::emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insOpts instOptions) +void emitter::emitIns_R_R_A( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insOpts instOptions) { assert(IsAvx512OrPriorInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index ecb26cf07c122b..e6e31013d6cb61 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -573,7 +573,12 @@ void emitIns_R_C_I(instruction ins, emitAttr attr, regNumber reg1, CORINFO_FIELD void emitIns_R_S_I(instruction ins, emitAttr attr, regNumber reg1, int varx, int offs, int ival); -void emitIns_R_R_A(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, GenTreeIndir* indir, insOpts instOptions = INS_OPTS_NONE); +void emitIns_R_R_A(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + GenTreeIndir* indir, + insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_AR(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber base, int offs); @@ -594,8 +599,13 @@ void emitIns_R_R_C(instruction ins, int offs, insOpts instOptions = INS_OPTS_NONE); -void emitIns_R_R_S( - instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, insOpts instOptions = INS_OPTS_NONE); +void emitIns_R_R_S(instruction ins, + emitAttr attr, + regNumber reg1, + regNumber reg2, + int varx, + int offs, + insOpts instOptions = INS_OPTS_NONE); void emitIns_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3); @@ -695,7 +705,12 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival); -void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber 
targetReg, regNumber op1Reg, GenTreeIndir* indir, insOpts instOptions = INS_OPTS_NONE); +void emitIns_SIMD_R_R_A(instruction ins, + emitAttr attr, + regNumber targetReg, + regNumber op1Reg, + GenTreeIndir* indir, + insOpts instOptions = INS_OPTS_NONE); void emitIns_SIMD_R_R_C(instruction ins, emitAttr attr, regNumber targetReg, diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d09e7f0804f7aa..11853fc554f62f 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -803,13 +803,13 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: { - //we have the assumption that AVX_BroadcastScalarToVector* - //only take the memory address as the operand. + // we have the assumption that AVX_BroadcastScalarToVector* + // only take the memory address as the operand. assert(hwintrinsic->isContained()); assert(hwintrinsic->OperIsMemoryLoad()); assert(hwintrinsic->GetOperandCount() == 1); GenTree* BroadcastScalar = hwintrinsic->Op(1); if (BroadcastScalar->OperIs(GT_LCL_ADDR)) { addr = hwintrinsic->Op(1); break; @@ -1240,17 +1240,17 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) case GT_CNS_VEC: { - var_types simdType = op->TypeGet(); - bool IsIdentical = true; - switch (inputSize) + var_types simdType = op->TypeGet(); + bool IsIdentical = true; + switch (inputSize) { case Input_32Bit: { uint32_t FirstElement = static_cast<uint32_t>(op->AsVecCon()->gtSimdVal.u32[0]); - for(unsigned i = 1; i < genTypeSize(simdType) / 4; i++) + for (unsigned i = 1; i < genTypeSize(simdType) / 4; i++) { uint32_t ElementToCheck = static_cast<uint32_t>(op->AsVecCon()->gtSimdVal.u32[i]); - if(FirstElement != ElementToCheck) + if (FirstElement != ElementToCheck) { IsIdentical = false; break; @@ -1261,10 +1261,10 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) case Input_64Bit: { uint64_t FirstElement = static_cast<uint64_t>(op->AsVecCon()->gtSimdVal.u64[0]); - for(unsigned i = 1; i < genTypeSize(simdType) / 8; i++) + for (unsigned i = 1; i < genTypeSize(simdType) / 8; i++) { uint64_t ElementToCheck = static_cast<uint64_t>(op->AsVecCon()->gtSimdVal.u64[i]); - if(FirstElement != ElementToCheck) + if (FirstElement != ElementToCheck) { IsIdentical = false; break; @@ -1276,7 +1276,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) unreached(); } - if(IsIdentical) + if (IsIdentical) { IsEmbBroadcastEnabled = true; } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c212a1246daa60..2d1d9ada9f7753 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1638,12 +1638,13 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) // 1. pass the address as LCL_ADDR to AVX.BroadcastScalarToVector256() API // 2. 
pass the address as LCL_VAR to AVX.BroadcastScalarToVector256() API LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) + bool foundUse = BlockRange().TryGetUse(node, &use); + if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && + use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { GenTree* op = node->Op(1); assert(op->OperIs(GT_LCL_ADDR) || op->OperIs(GT_LCL_VAR)); - if(node == use.User()->AsHWIntrinsic()->Op(1)) + if (node == use.User()->AsHWIntrinsic()->Op(1)) { std::swap(use.User()->AsHWIntrinsic()->Op(1), use.User()->AsHWIntrinsic()->Op(2)); } @@ -7709,7 +7710,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTree* childNodeOp = hwintrinsic->Op(1); assert(childNodeOp->OperIs(GT_LCL_ADDR, GT_LCL_VAR)); - if(parentNode->OperIsEmbBroadcastCompatible()) + if (parentNode->OperIsEmbBroadcastCompatible()) { return true; } @@ -7717,7 +7718,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { return false; } - } + } default: { From 12363a93dd6e77a79a5e3893821250a761e2d93c Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 8 May 2023 16:16:44 -0700 Subject: [PATCH 27/44] bug fixes: re-introduce "OperIsBroadcastScalar". There are cases where a non-broadcast node (e.g. a load or read) is contained by an embedded-broadcast-compatible parent and embedded broadcast gets enabled unexpectedly; this method filters out those cases. --- src/coreclr/jit/gentree.cpp | 27 +++++++++++++++++++++++++++ src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/instr.cpp | 2 +- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index d15dc2b73366f7..a5a4c200bd84ab 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -25092,6 +25092,33 @@ bool GenTreeHWIntrinsic::OperIsEmbBroadcastCompatible() const return HWIntrinsicInfo::IsEmbBroadcastCompatible(GetHWIntrinsicId()); } +//------------------------------------------------------------------------ +// OperIsBroadcastScalar: Is this HWIntrinsic a broadcast node from scalar. +// +// Return Value: +// Whether "this" is a broadcast node from scalar. +// +bool GenTreeHWIntrinsic::OperIsBroadcastScalar() const +{ +#if defined(TARGET_XARCH) + NamedIntrinsic intrinsicId = GetHWIntrinsicId(); + switch (intrinsicId) + { + case NI_AVX2_BroadcastScalarToVector128: + case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX_BroadcastScalarToVector128: + case NI_AVX_BroadcastScalarToVector256: + case NI_SSE3_MoveAndDuplicate: + case NI_AVX512F_BroadcastScalarToVector512: + return true; + default: + return false; + } +#else + return false; +#endif +} + //------------------------------------------------------------------------------ // OperRequiresAsgFlag : Check whether the operation requires GTF_ASG flag regardless // of the children's flags. 
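The guard re-introduced above matters because containment alone does not imply broadcast: a contained plain load or read would otherwise turn on EVEX.b and be silently reinterpreted as a single scalar to replicate. In miniature (a sketch with invented names, not the JIT's types):

// Only known broadcast-from-scalar intrinsics may enable embedded broadcast;
// any other contained memory operand must keep the full-width encoding.
enum class SketchOperandKind
{
    BroadcastScalar, // e.g. NI_AVX2_BroadcastScalarToVector256
    MemoryLoad,      // plain load/read
};

bool CanEnableEmbeddedBroadcast(bool isContained, SketchOperandKind kind)
{
    return isContained && (kind == SketchOperandKind::BroadcastScalar);
}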
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index cb85c21c70ef51..2f6def6803a5b6 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6214,6 +6214,7 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic bool OperIsMemoryLoadOrStore() const; bool OperIsMemoryStoreOrBarrier() const; bool OperIsEmbBroadcastCompatible() const; + bool OperIsBroadcastScalar() const; bool OperRequiresAsgFlag() const; bool OperRequiresCallFlag() const; diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 11853fc554f62f..e6423e8a04b416 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1231,7 +1231,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { case GT_HWINTRINSIC: { - if (op->isContained()) + if (op->isContained() && op->AsHWIntrinsic()->OperIsBroadcastScalar()) { IsEmbBroadcastEnabled = true; } From b561885f8fbb97faaf3928ed954a0e72ea38a968 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Wed, 10 May 2023 17:07:49 -0700 Subject: [PATCH 28/44] Changes based on reviews: 1. code style improvement 2. fixes typos and errors in the comments. 3. extract the operand swap logic when lowering Create node into a function: TryCanonizeEmbBroadcastCandicate() --- src/coreclr/jit/codegenxarch.cpp | 4 +- src/coreclr/jit/emit.h | 4 +- src/coreclr/jit/emitxarch.cpp | 23 ++-- src/coreclr/jit/emitxarch.h | 1 - src/coreclr/jit/gentree.cpp | 4 +- src/coreclr/jit/instr.cpp | 17 ++- src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerxarch.cpp | 208 +++++++++++-------------------- 8 files changed, 106 insertions(+), 156 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 3d22f47fbd430d..151fa1403abdeb 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -987,7 +987,7 @@ void CodeGen::genCodeForBinary(GenTreeOp* treeNode) // all have RMW semantics if VEX support is not available bool isRMW = !compiler->canUseVexEncoding(); - inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, op1reg, op2, isRMW, TYP_UNKNOWN); + inst_RV_RV_TT(ins, emitTypeSize(treeNode), targetReg, op1reg, op2, isRMW); genProduceReg(treeNode); return; @@ -7769,7 +7769,7 @@ void CodeGen::genSSE2BitwiseOp(GenTree* treeNode) #endif } - GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0, INS_OPTS_NONE); + GetEmitter()->emitIns_SIMD_R_R_C(ins, EA_16BYTE, targetReg, operandReg, *maskFld, 0); } //----------------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 61f261088fb1bb..ee22b55e2f2993 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -818,7 +818,7 @@ class emitter //////////////////////////////////////////////////////////////////////// // Space taken up to here: // x86: 47 bits - // amd64: 46 bits + // amd64: 47 bits // arm: 48 bits // arm64: 50 bits // loongarch64: 46 bits @@ -3689,7 +3689,7 @@ emitAttr emitter::emitGetMemOpSize(instrDesc* id) const return EA_8BYTE; default: - break; + unreached(); } } switch (ins) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 9a3a917c7104ad..3148a98f5a09b0 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1234,7 +1234,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const #define EVEX_B_BIT 0x0000001000000000ULL //------------------------------------------------------------------------ -// AddEvexPrefix: Add 
default EVEX perfix with only LL' bits set. +// AddEvexPrefix: Add default EVEX prefix with only LL' bits set. // // Arguments: // ins -- processor instruction to check. @@ -1269,10 +1269,21 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at return code; } +//------------------------------------------------------------------------ +// AddEvexbBitIfNeeded: set the EVEX.b bit if EvexbContext is set in the instruction descriptor. +// +// Arguments: +// id -- instruction descriptor +// code -- opcode bits. +// +// Return Value: +// encoded code with Evex.b set if needed. +// emitter::code_t emitter::AddEvexbBitIfNeeded(const instrDesc* id, code_t code) { - if (hasEvexPrefix(code) && id->idIsEvexbContext()) + if (id->idIsEvexbContext()) { + assert(hasEvexPrefix(code)); code |= EVEX_B_BIT; } return code; @@ -6689,13 +6700,11 @@ void emitter::emitIns_R_R_A( id->idIns(ins); id->idReg1(reg1); id->idReg2(reg2); -#if defined(TARGET_XARCH) if (instOptions == INS_OPTS_EVEX_b) { assert(UseEvexEncoding()); id->idSetEvexbContext(); } -#endif // TARGET_XARCH emitHandleMemOp(indir, id, (ins == INS_mulx) ? IF_RWR_RWR_ARD : emitInsModeFormat(ins, IF_RRD_RRD_ARD), ins); @@ -6820,13 +6829,11 @@ void emitter::emitIns_R_R_C(instruction ins, id->idReg1(reg1); id->idReg2(reg2); id->idAddr()->iiaFieldHnd = fldHnd; -#if defined(TARGET_XARCH) if (instOptions == INS_OPTS_EVEX_b) { assert(UseEvexEncoding()); id->idSetEvexbContext(); } -#endif // TARGET_XARCH UNATIVE_OFFSET sz = emitInsSizeCV(id, insCodeRM(ins)); id->idCodeSize(sz); @@ -6873,13 +6880,11 @@ void emitter::emitIns_R_R_S( id->idReg2(reg2); id->idAddr()->iiaLclVar.initLclVarAddr(varx, offs); -#if defined(TARGET_XARCH) if (instOptions == INS_OPTS_EVEX_b) { assert(UseEvexEncoding()); id->idSetEvexbContext(); } -#endif // TARGET_XARCH #ifdef DEBUG id->idDebugOnlyInfo()->idVarRefOffs = emitVarRefOffs; #endif @@ -8438,7 +8443,7 @@ void emitter::emitIns_SIMD_R_R_R_C(instruction ins, assert((op2Reg != targetReg) || (op1Reg == targetReg)); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); - emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs, INS_OPTS_NONE); + emitIns_R_R_C(ins, attr, targetReg, op2Reg, fldHnd, offs); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index e6e31013d6cb61..3081e5df051c95 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -870,7 +870,6 @@ inline bool emitIsUncondJump(instrDesc* jmp) //------------------------------------------------------------------------ // HasEmbeddedBroadcast: Do we consider embedded broadcast while encoding. -// TODO-XArch-AVX512: Add eventual check on the instrDesc // // Arguments: // id - Instruction descriptor. diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index a5a4c200bd84ab..30e09327805641 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19144,6 +19144,8 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: { + // These intrinsic operations are contained as part of the operand of embedded broadcast compatible + // instruction return true; } @@ -25085,7 +25087,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryStoreOrBarrier() const // OperIsEmbBroadcastCompatible: Checks if the intrinsic is an embedded broadcast compatible intrinsic. 
// // Return Value: -// true if the intrisic node lowering instruction is embedded broadcast compatible. +// true if the intrinsic node lowering instruction is embedded broadcast compatible. // bool GenTreeHWIntrinsic::OperIsEmbBroadcastCompatible() const { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index e6423e8a04b416..1da3a8bbea091b 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -750,6 +750,8 @@ void CodeGen::inst_RV_SH( // // Arguments: // op - The operand node for which to obtain the descriptor +// instOptions - The optional parameter to track if embedded broadcast is enabled +// simdBaseType - The base data type of the emitting instruction. // // Return Value: // The operand descriptor for "op". @@ -808,16 +810,16 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v assert(hwintrinsic->isContained()); assert(hwintrinsic->OperIsMemoryLoad()); assert(hwintrinsic->GetOperandCount() == 1); - GenTree* BroadcastScalar = hwintrinsic->Op(1); - if (BroadcastScalar->OperIs(GT_LCL_ADDR)) + GenTree* broadcastScalar = hwintrinsic->Op(1); + if (broadcastScalar->OperIs(GT_LCL_ADDR)) { addr = hwintrinsic->Op(1); break; } else { - assert(BroadcastScalar->OperIs(GT_LCL_VAR)); - return OperandDesc(simdBaseType, BroadcastScalar); + assert(broadcastScalar->OperIs(GT_LCL_VAR)); + return OperandDesc(simdBaseType, broadcastScalar); } } @@ -1210,6 +1212,13 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT } } +//------------------------------------------------------------------------ +// IsEmbeddedBroadcastEnabled: determine if embedded broadcast can be enabled +// +// Arguments: +// ins -- The instruction being emitted +// op -- The second operand of the instruction. 
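// Illustrative sketch only (names taken from elsewhere in this series, not
// the committed code): the predicate reduces to two checks --
//
//     bool canUseEmbeddedBroadcast(instruction ins, GenTree* op)
//     {
//         // the instruction must advertise embedded broadcast support ...
//         bool insOk = (instInfo[ins] & INS_Flags_EmbeddedBroadcastSupported) != 0;
//         // ... and op must be a contained broadcast-of-scalar intrinsic node
//         bool opOk = op->OperIs(GT_HWINTRINSIC) && op->isContained() &&
//                     op->AsHWIntrinsic()->OperIsBroadcastScalar();
//         return insOk && opOk;
//     }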
+// #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 264405298e9681..042e0f046632b8 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -364,6 +364,7 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); + void TryCanonizeEmbBroadcastCandicate(GenTreeHWIntrinsic* node); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 2d1d9ada9f7753..57c21c60f5937c 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1643,7 +1643,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) { GenTree* op = node->Op(1); - assert(op->OperIs(GT_LCL_ADDR) || op->OperIs(GT_LCL_VAR)); + assert(op->OperIs(GT_LCL_ADDR, GT_LCL_VAR)); if (node == use.User()->AsHWIntrinsic()->Op(1)) { std::swap(use.User()->AsHWIntrinsic()->Op(1), use.User()->AsHWIntrinsic()->Op(2)); @@ -2360,23 +2360,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* CreateUser = nullptr; - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - CreateUser = use.User(); - } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR)) - { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment - // issue. - if (node == CreateUser->AsHWIntrinsic()->Op(1)) - { - std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); - } - } + // Seek for optimization opportunities using embedded broadcast. + TryCanonizeEmbBroadcastCandicate(node); break; } default: @@ -2409,44 +2394,11 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) LowerNode(tmp1); node->ResetHWIntrinsicId(NI_AVX2_BroadcastScalarToVector256, tmp1); - // if AVX512 is supported, seek for optimization opportunities using embedded broadcast. - // contain the broadcast intrinsics in the embeddebd broadcast compatible intrinsics - // at codegen phase, directly emit the operend on "Create" node instead of a series of broadcast. + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* CreateUser = nullptr; - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - CreateUser = use.User(); - } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR)) - { - switch (op1->TypeGet()) - { - case TYP_FLOAT: - case TYP_DOUBLE: - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the - // containment - // issue. 
- if (node == CreateUser->AsHWIntrinsic()->Op(1)) - { - std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); - } - break; - } - - default: - break; - } - } + // If AVX512 is supported, seek for optimization opportunities using embedded broadcast. + TryCanonizeEmbBroadcastCandicate(node); } return LowerNode(node); } @@ -2528,38 +2480,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // at codegen phase, directly emit the operend on "Create" node instead of a series of broadcast. if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* CreateUser = nullptr; - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - CreateUser = use.User(); - } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR)) - { - switch (op1->TypeGet()) - { - case TYP_FLOAT: - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the - // containment - // issue. - if (node == CreateUser->AsHWIntrinsic()->Op(1)) - { - std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); - } - break; - } - - default: - break; - } - } + // If AVX512 is supported, seek for optimization opportunities using embedded broadcast. + TryCanonizeEmbBroadcastCandicate(node); } return LowerNode(node); } @@ -2794,24 +2716,8 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) node->ChangeHWIntrinsicId(NI_SSE3_MoveAndDuplicate, tmp1); if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) { - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* CreateUser = nullptr; - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - CreateUser = use.User(); - } - if (CreateUser != nullptr && op1->OperIs(GT_LCL_VAR) && op1->TypeIs(TYP_DOUBLE)) - { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the - // containment - // issue. - if (node == CreateUser->AsHWIntrinsic()->Op(1)) - { - std::swap(CreateUser->AsHWIntrinsic()->Op(1), CreateUser->AsHWIntrinsic()->Op(2)); - } - } + // If AVX512 is supported, seek for optimization opportunities using embedded broadcast. + TryCanonizeEmbBroadcastCandicate(node); } break; } @@ -5269,6 +5175,54 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) return blsmskNode; } +//---------------------------------------------------------------------------------------------- +// Lowering::TryCanonizeEmbBroadcastCandicate: +// Tries to canonize the operands of an embedded broadcast op. +// +// Arguments: +// node - `Create` node being lowered +// +// Notes: +// This function will canonize the input operands of a commutive and embedded broadcast op. +// The operand to be broadcasted will be put to the 2nd operand. 
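// For example (operand shapes assumed for illustration): given a commutative
// user of the broadcast,
//
//     t1 = BroadcastScalarToVector256(CreateScalarUnsafe(lclVar))
//     t2 = Add(t1, otherVec)
//
// the candidate is swapped into the second operand slot,
//
//     t2 = Add(otherVec, t1)
//
// so containment and the EVEX.b encoding only ever have to inspect Op(2).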
+// +void Lowering::TryCanonizeEmbBroadcastCandicate(GenTreeHWIntrinsic* node) +{ + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + GenTree* createUser = nullptr; + // Here we assume we have the form of Broadcast -> CreateScalarUnsafe -> Scalar + GenTree* scalar = node->Op(1)->AsHWIntrinsic()->Op(1); + if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) + { + createUser = use.User(); + } + if (createUser != nullptr && scalar->OperIs(GT_LCL_VAR) && createUser->AsHWIntrinsic()->isCommutativeHWIntrinsic()) + { + // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment + // issue. + switch (scalar->TypeGet()) + { + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + case TYP_FLOAT: + case TYP_DOUBLE: + { + if (node == createUser->AsHWIntrinsic()->Op(1)) + { + std::swap(createUser->AsHWIntrinsic()->Op(1), createUser->AsHWIntrinsic()->Op(2)); + } + break; + } + + default: + break; + } + } +} + //---------------------------------------------------------------------------------------------- // Lowering::LowerBswapOp: Tries to contain GT_BSWAP node when possible // @@ -7652,12 +7606,11 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastCompatible()) { - - GenTree* CreateScalar = childNode->AsHWIntrinsic()->Op(1); - if (CreateScalar->OperIs(GT_HWINTRINSIC) && - CreateScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) + GenTree* createScalar = childNode->AsHWIntrinsic()->Op(1); + if (createScalar->OperIs(GT_HWINTRINSIC) && + createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { - GenTree* Scalar = CreateScalar->AsHWIntrinsic()->Op(1); + GenTree* Scalar = createScalar->AsHWIntrinsic()->Op(1); if (Scalar->OperIs(GT_LCL_VAR)) { switch (Scalar->TypeGet()) @@ -7668,41 +7621,29 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); comp->lvaSetVarDoNotEnregister( opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(CreateScalar, Scalar); - MakeSrcContained(childNode, CreateScalar); + MakeSrcContained(createScalar, Scalar); + MakeSrcContained(childNode, createScalar); return true; } default: return false; - ; } } - else - { - return false; - } } - else if (CreateScalar->OperIs(GT_LCL_VAR)) + else if (createScalar->OperIs(GT_LCL_VAR)) { // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be // fold, we need to specially handle this case. 
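// Hedged illustration of the two operand shapes this branch distinguishes:
//
//     float/double: Broadcast(CreateScalarUnsafe(LCL_VAR TYP_FLOAT))
//     integer:      Broadcast(LCL_VAR TYP_INT)  // CreateScalarUnsafe folded away
//
// which is why the integer path below contains the local variable directly.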
- assert(CreateScalar->TypeIs(TYP_INT) || CreateScalar->TypeIs(TYP_UINT) || - CreateScalar->TypeIs(TYP_LONG) || CreateScalar->TypeIs(TYP_ULONG)); - const unsigned opLclNum = CreateScalar->AsLclVar()->GetLclNum(); + assert(createScalar->TypeIs(TYP_INT) || createScalar->TypeIs(TYP_UINT) || + createScalar->TypeIs(TYP_LONG) || createScalar->TypeIs(TYP_ULONG)); + const unsigned opLclNum = createScalar->AsLclVar()->GetLclNum(); comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(childNode, CreateScalar); + MakeSrcContained(childNode, createScalar); return true; } - else - { - return false; - } - } - else - { - return false; } + return false; } case NI_AVX_BroadcastScalarToVector128: @@ -7710,14 +7651,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTree* childNodeOp = hwintrinsic->Op(1); assert(childNodeOp->OperIs(GT_LCL_ADDR, GT_LCL_VAR)); - if (parentNode->OperIsEmbBroadcastCompatible()) - { - return true; - } - else - { - return false; - } + return parentNode->OperIsEmbBroadcastCompatible(); } default: From 90e27c4f63efb181669011f434e5476da60a27c9 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 4 May 2023 16:49:37 -0700 Subject: [PATCH 29/44] unfold VecCon node when lowering if this node is eligible for embedded broadcast. --- src/coreclr/jit/codegen.h | 5 +- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 3 +- src/coreclr/jit/instr.cpp | 125 +++-------------- src/coreclr/jit/lower.cpp | 13 ++ src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerxarch.cpp | 140 +++++++++++++++++++- 6 files changed, 172 insertions(+), 115 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 99719ee119f7d4..507af2df29319f 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1533,7 +1533,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX } }; - OperandDesc genOperandDesc(GenTree* op, insOpts instOptions = INS_OPTS_NONE, var_types simdBaseType = TYP_UNKNOWN); + OperandDesc genOperandDesc(GenTree* op); void inst_TT(instruction ins, emitAttr size, GenTree* op1); void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2); @@ -1544,8 +1544,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX regNumber targetReg, regNumber op1Reg, GenTree* op2, - bool isRMW, - var_types simdBaseType = TYP_UNKNOWN); + bool isRMW); void inst_RV_RV_TT_IV( instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW); #endif diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 94952200eb2100..a43857d85e205b 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -729,8 +729,7 @@ void CodeGen::genHWIntrinsic_R_R_RM( } bool isRMW = node->isRMWHWIntrinsic(compiler); - var_types simdBaseType = node->GetSimdBaseType(); - inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW, simdBaseType); + inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 1da3a8bbea091b..a714c2ff384092 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -760,7 +760,7 @@ void CodeGen::inst_RV_SH( // This method is not idempotent - it can only be called once for a // given node. 
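// Sketch of the constant path this patch adds to genOperandDesc (condensed
// from the hunk that follows): a broadcast of a GT_CNS_INT writes the scalar
// to the data section at the base type's size and returns a memory operand:
//
//     ssize_t        val  = scalar->AsIntCon()->IconValue();
//     UNATIVE_OFFSET cns  = emit->emitDataConst(&val, genTypeSize(simdBaseType),
//                                               genTypeSize(simdBaseType), simdBaseType);
//     return OperandDesc(compiler->eeFindJitDataOffs(cns));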
// -CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, var_types simdBaseType) +CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) { if (!op->isContained() && !op->isUsedFromSpillTemp()) { @@ -799,7 +799,7 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v #if defined(FEATURE_HW_INTRINSICS) GenTreeHWIntrinsic* hwintrinsic = op->AsHWIntrinsic(); NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); - + var_types simdBaseType = hwintrinsic->GetSimdBaseType(); switch (intrinsicId) { case NI_AVX_BroadcastScalarToVector128: @@ -846,10 +846,19 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v // a special case is when the operand of CreateScalarUnsafe is in integer type, // CreateScalarUnsafe node will be fold, so we directly match a pattern of // broadcast -> LCL_VAR(TYP_(U)INT) - assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR)); - op = hwintrinsic->Op(1); - assert(op->isContained()); - return genOperandDesc(op); + assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR, GT_CNS_INT)); + GenTree* scalar = hwintrinsic->Op(1); + if(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR)) + { + assert(scalar->isContained()); + return genOperandDesc(scalar); + } + else + { + ssize_t scalarValue = scalar->AsIntCon()->IconValue(); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), genTypeSize(simdBaseType), simdBaseType); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } } case TYP_FLOAT: @@ -932,60 +941,13 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op, insOpts instOptions, v return OperandDesc(emit->emitFltOrDblConst(op->AsDblCon()->DconValue(), emitTypeSize(op))); case GT_CNS_INT: + { assert(op->isContainedIntOrIImmed()); return OperandDesc(op->AsIntCon()->IconValue(), op->AsIntCon()->ImmedValNeedsReloc(compiler)); + } case GT_CNS_VEC: { -#if defined(TARGET_XARCH) - if (instOptions == INS_OPTS_EVEX_b) - { - assert(op->isContained()); - switch (simdBaseType) - { - case TYP_FLOAT: - { - float scalar = static_cast(op->AsVecCon()->gtSimdVal.f32[0]); - return OperandDesc(emit->emitFltOrDblConst(*reinterpret_cast(&scalar), EA_4BYTE)); - } - - case TYP_DOUBLE: - { - double scalar = static_cast(op->AsVecCon()->gtSimdVal.f64[0]); - return OperandDesc(emit->emitFltOrDblConst(scalar, EA_8BYTE)); - } - - case TYP_INT: - { - uint32_t scalar = static_cast(op->AsVecCon()->gtSimdVal.i32[0]); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 4, 4, TYP_INT); - return OperandDesc(compiler->eeFindJitDataOffs(cnum)); - } - case TYP_UINT: - { - uint32_t scalar = static_cast(op->AsVecCon()->gtSimdVal.u32[0]); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 4, 4, TYP_UINT); - return OperandDesc(compiler->eeFindJitDataOffs(cnum)); - } - case TYP_LONG: - { - uint64_t scalar = static_cast(op->AsVecCon()->gtSimdVal.i64[0]); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 8, 8, TYP_LONG); - return OperandDesc(compiler->eeFindJitDataOffs(cnum)); - } - case TYP_ULONG: - { - uint64_t scalar = static_cast(op->AsVecCon()->gtSimdVal.u64[0]); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalar, 8, 8, TYP_ULONG); - return OperandDesc(compiler->eeFindJitDataOffs(cnum)); - } - - default: - unreached(); - } - break; - } -#endif // TARGET_XARCH switch (op->TypeGet()) { #if defined(FEATURE_SIMD) @@ -1233,8 +1195,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) insFlags inputSize = static_cast((CodeGenInterface::instInfo[ins] & Input_Mask)); // Embedded 
broadcast can be applied when operands are in the following forms. - // 1. Broadcast -> CreateScalar -> LCL_VAR - // 2. CnsVec + // 1. Broadcast -> CreateScalar -> LCL_VAR/CNS bool IsEmbBroadcastEnabled = false; switch (op->OperGet()) { @@ -1247,51 +1208,6 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) break; } - case GT_CNS_VEC: - { - var_types simdType = op->TypeGet(); - bool IsIdentical = true; - switch (inputSize) - { - case Input_32Bit: - { - uint32_t FirstElement = static_cast(op->AsVecCon()->gtSimdVal.u32[0]); - for (unsigned i = 1; i < genTypeSize(simdType) / 4; i++) - { - uint32_t ElementToCheck = static_cast(op->AsVecCon()->gtSimdVal.u32[i]); - if (FirstElement != ElementToCheck) - { - IsIdentical = false; - break; - } - } - break; - } - case Input_64Bit: - { - uint64_t FirstElement = static_cast(op->AsVecCon()->gtSimdVal.u64[0]); - for (unsigned i = 1; i < genTypeSize(simdType) / 8; i++) - { - uint64_t ElementToCheck = static_cast(op->AsVecCon()->gtSimdVal.u64[i]); - if (FirstElement != ElementToCheck) - { - IsIdentical = false; - break; - } - } - break; - } - default: - unreached(); - } - - if (IsIdentical) - { - IsEmbBroadcastEnabled = true; - } - break; - } - default: break; } @@ -1322,8 +1238,7 @@ void CodeGen::inst_RV_RV_TT(instruction ins, regNumber targetReg, regNumber op1Reg, GenTree* op2, - bool isRMW, - var_types simdBaseType) + bool isRMW) { emitter* emit = GetEmitter(); noway_assert(emit->emitVerifyEncodable(ins, EA_SIZE(size), targetReg)); @@ -1339,7 +1254,7 @@ void CodeGen::inst_RV_RV_TT(instruction ins, instOptions = INS_OPTS_EVEX_b; } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS - OperandDesc op2Desc = genOperandDesc(op2, instOptions, simdBaseType); + OperandDesc op2Desc = genOperandDesc(op2); switch (op2Desc.GetKind()) { case OperandKind::ClsVar: diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index d14dbc4be0eb5d..a4674d0595fe99 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -587,6 +587,19 @@ GenTree* Lowering::LowerNode(GenTree* node) #ifdef FEATURE_HW_INTRINSICS case GT_HWINTRINSIC: return LowerHWIntrinsic(node->AsHWIntrinsic()); +#ifdef TARGET_XARCH + case GT_CNS_VEC: + { + if(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + return TryLowerConstVec(node->AsVecCon()); + } + else + break; + } + +#endif // TARGET_XARCH + #endif // FEATURE_HW_INTRINSICS case GT_LCL_FLD: diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 042e0f046632b8..d7eb9cf1ec1681 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -364,6 +364,7 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); + GenTree* TryLowerConstVec(GenTreeVecCon* node); void TryCanonizeEmbBroadcastCandicate(GenTreeHWIntrinsic* node); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 57c21c60f5937c..b16ee6c54dc0fa 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1003,6 +1003,125 @@ void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node) } } +GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) +{ + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + GenTreeHWIntrinsic* embBroadcastNode = nullptr; + if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) && 
use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) + { + embBroadcastNode = use.User()->AsHWIntrinsic(); + var_types simdType = embBroadcastNode->TypeGet(); + var_types simdBaseType = embBroadcastNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = embBroadcastNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; + + int ElementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); + switch (simdBaseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + uint32_t FirstElement = static_cast(node->gtSimdVal.u32[0]); + for(int i = 1; i < ElementCount; i++) + { + uint32_t ElementToCheck = static_cast(node->gtSimdVal.u32[i]); + if(FirstElement != ElementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + { + uint64_t FirstElement = static_cast(node->gtSimdVal.u64[0]); + for(int i = 1; i < ElementCount; i++) + { + uint64_t ElementToCheck = static_cast(node->gtSimdVal.u64[i]); + if(FirstElement != ElementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + default: + isCreatedFromScalar = false; + break; + } + if(isCreatedFromScalar) + { + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + if(simdType == TYP_SIMD32) + { + broadcastName = NI_AVX2_BroadcastScalarToVector256; + } + else if(simdType == TYP_SIMD64) + { + broadcastName = NI_AVX512F_BroadcastScalarToVector512; + } + GenTree* constScalar = nullptr; + switch(simdBaseType) + { + case TYP_FLOAT: + { + float scalar = static_cast(node->gtSimdVal.f32[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_DOUBLE: + { + double scalar = static_cast(node->gtSimdVal.f64[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_INT: + { + int32_t scalar = static_cast(node->gtSimdVal.i32[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_UINT: + { + uint32_t scalar = static_cast(node->gtSimdVal.u32[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_INT); + break; + } + case TYP_LONG: + { + int64_t scalar = static_cast(node->gtSimdVal.i64[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_ULONG: + { + uint64_t scalar = static_cast(node->gtSimdVal.u64[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_LONG); + break; + } + } + GenTreeHWIntrinsic* createScalar = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, 16); + GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); + BlockRange().InsertBefore(node, broadcastNode); + BlockRange().InsertBefore(broadcastNode, createScalar); + BlockRange().InsertBefore(createScalar, constScalar); + use.ReplaceWith(broadcastNode); + BlockRange().Remove(node); + LowerNode(createScalar); + return LowerNode(broadcastNode); + } + } + return node->gtNext; +} + //---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node. 
// @@ -7610,18 +7729,18 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre if (createScalar->OperIs(GT_HWINTRINSIC) && createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { - GenTree* Scalar = createScalar->AsHWIntrinsic()->Op(1); - if (Scalar->OperIs(GT_LCL_VAR)) + GenTree* scalar = createScalar->AsHWIntrinsic()->Op(1); + if (scalar->OperIs(GT_LCL_VAR)) { - switch (Scalar->TypeGet()) + switch (scalar->TypeGet()) { case TYP_FLOAT: case TYP_DOUBLE: { - const unsigned opLclNum = Scalar->AsLclVar()->GetLclNum(); + const unsigned opLclNum = scalar->AsLclVar()->GetLclNum(); comp->lvaSetVarDoNotEnregister( opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(createScalar, Scalar); + MakeSrcContained(createScalar, scalar); MakeSrcContained(childNode, createScalar); return true; } @@ -7630,6 +7749,12 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre return false; } } + else if (scalar->OperIs(GT_CNS_DBL)) + { + MakeSrcContained(createScalar, scalar); + MakeSrcContained(childNode, createScalar); + return true; + } } else if (createScalar->OperIs(GT_LCL_VAR)) { @@ -7642,6 +7767,11 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre MakeSrcContained(childNode, createScalar); return true; } + else if (createScalar->OperIs(GT_CNS_INT)) + { + MakeSrcContained(childNode, createScalar); + return true; + } } return false; } From 9bfa325b86f217fe11b31d602e142a85c3309317 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 11 May 2023 13:37:19 -0700 Subject: [PATCH 30/44] apply format patch --- src/coreclr/jit/codegen.h | 7 +-- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 2 +- src/coreclr/jit/instr.cpp | 19 +++--- src/coreclr/jit/lower.cpp | 4 +- src/coreclr/jit/lowerxarch.cpp | 70 ++++++++++++--------- 5 files changed, 52 insertions(+), 50 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 507af2df29319f..aec68a7bff2a73 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -1539,12 +1539,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void inst_RV_TT(instruction ins, emitAttr size, regNumber op1Reg, GenTree* op2); void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival); void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival); - void inst_RV_RV_TT(instruction ins, - emitAttr size, - regNumber targetReg, - regNumber op1Reg, - GenTree* op2, - bool isRMW); + void inst_RV_RV_TT(instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW); void inst_RV_RV_TT_IV( instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW); #endif diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index a43857d85e205b..973d4176a00b63 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -728,7 +728,7 @@ void CodeGen::genHWIntrinsic_R_R_RM( assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op2); } - bool isRMW = node->isRMWHWIntrinsic(compiler); + bool isRMW = node->isRMWHWIntrinsic(compiler); inst_RV_RV_TT(ins, attr, targetReg, op1Reg, op2, isRMW); } diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index a714c2ff384092..44f387e3ced85f 100644 --- a/src/coreclr/jit/instr.cpp +++ 
b/src/coreclr/jit/instr.cpp @@ -797,8 +797,8 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(op->OperIsHWIntrinsic()); #if defined(FEATURE_HW_INTRINSICS) - GenTreeHWIntrinsic* hwintrinsic = op->AsHWIntrinsic(); - NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); + GenTreeHWIntrinsic* hwintrinsic = op->AsHWIntrinsic(); + NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId(); var_types simdBaseType = hwintrinsic->GetSimdBaseType(); switch (intrinsicId) { @@ -848,15 +848,16 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) // broadcast -> LCL_VAR(TYP_(U)INT) assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR, GT_CNS_INT)); GenTree* scalar = hwintrinsic->Op(1); - if(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR)) + if (hwintrinsic->Op(1)->OperIs(GT_LCL_VAR)) { assert(scalar->isContained()); return genOperandDesc(scalar); } else { - ssize_t scalarValue = scalar->AsIntCon()->IconValue(); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), genTypeSize(simdBaseType), simdBaseType); + ssize_t scalarValue = scalar->AsIntCon()->IconValue(); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), + genTypeSize(simdBaseType), simdBaseType); return OperandDesc(compiler->eeFindJitDataOffs(cnum)); } } @@ -1233,12 +1234,8 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) // op2 -- The second operand, which may be a memory node or a node producing a register // isRMW -- true if the instruction is RMW; otherwise, false // simdBaseType -- the base data type for this intrinsic. -void CodeGen::inst_RV_RV_TT(instruction ins, - emitAttr size, - regNumber targetReg, - regNumber op1Reg, - GenTree* op2, - bool isRMW) +void CodeGen::inst_RV_RV_TT( + instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW) { emitter* emit = GetEmitter(); noway_assert(emit->emitVerifyEncodable(ins, EA_SIZE(size), targetReg)); diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index a4674d0595fe99..3377723a6db10c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -590,14 +590,14 @@ GenTree* Lowering::LowerNode(GenTree* node) #ifdef TARGET_XARCH case GT_CNS_VEC: { - if(comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) { return TryLowerConstVec(node->AsVecCon()); } else break; } - + #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index b16ee6c54dc0fa..216f506736ae82 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1003,19 +1003,26 @@ void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node) } } +//---------------------------------------------------------------------------------------------- +// TryLowerConstVec: Unfold the Vector to Broadcast->CreateScalarUnsafe->ConstScalar form +// when the constant vector has all the same elements and is an operand +// of an embedded broadcast compatible instruction. 
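// For instance (illustrative values only): a uniform TYP_SIMD32 constant such as
//
//     CNS_VEC<simd32> { 3.0, 3.0, 3.0, 3.0 }           // double lanes
//
// becomes
//
//     BroadcastScalarToVector256(CreateScalarUnsafe(3.0))
//
// letting codegen emit a single 8-byte data-section constant with EVEX.b set,
// roughly: vaddpd ymm0, ymm1, qword ptr [reloc]{1to4}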
+// +// Arguments: +// node - The constant vector node GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) { - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); GenTreeHWIntrinsic* embBroadcastNode = nullptr; - if(foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - embBroadcastNode = use.User()->AsHWIntrinsic(); - var_types simdType = embBroadcastNode->TypeGet(); - var_types simdBaseType = embBroadcastNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = embBroadcastNode->GetSimdBaseJitType(); - bool isCreatedFromScalar = true; - + if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) + { + embBroadcastNode = use.User()->AsHWIntrinsic(); + var_types simdType = embBroadcastNode->TypeGet(); + var_types simdBaseType = embBroadcastNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = embBroadcastNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; + int ElementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); switch (simdBaseType) { @@ -1024,10 +1031,10 @@ GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) case TYP_UINT: { uint32_t FirstElement = static_cast(node->gtSimdVal.u32[0]); - for(int i = 1; i < ElementCount; i++) + for (int i = 1; i < ElementCount; i++) { uint32_t ElementToCheck = static_cast(node->gtSimdVal.u32[i]); - if(FirstElement != ElementToCheck) + if (FirstElement != ElementToCheck) { isCreatedFromScalar = false; break; @@ -1035,16 +1042,16 @@ GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) } break; } - + case TYP_DOUBLE: case TYP_LONG: case TYP_ULONG: { uint64_t FirstElement = static_cast(node->gtSimdVal.u64[0]); - for(int i = 1; i < ElementCount; i++) + for (int i = 1; i < ElementCount; i++) { uint64_t ElementToCheck = static_cast(node->gtSimdVal.u64[i]); - if(FirstElement != ElementToCheck) + if (FirstElement != ElementToCheck) { isCreatedFromScalar = false; break; @@ -1057,59 +1064,62 @@ GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) isCreatedFromScalar = false; break; } - if(isCreatedFromScalar) + if (isCreatedFromScalar) { NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - if(simdType == TYP_SIMD32) + if (simdType == TYP_SIMD32) { broadcastName = NI_AVX2_BroadcastScalarToVector256; } - else if(simdType == TYP_SIMD64) + else if (simdType == TYP_SIMD64) { broadcastName = NI_AVX512F_BroadcastScalarToVector512; } GenTree* constScalar = nullptr; - switch(simdBaseType) + switch (simdBaseType) { case TYP_FLOAT: { float scalar = static_cast(node->gtSimdVal.f32[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); break; } case TYP_DOUBLE: { double scalar = static_cast(node->gtSimdVal.f64[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); break; - } + } case TYP_INT: { int32_t scalar = static_cast(node->gtSimdVal.i32[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); break; } case TYP_UINT: { - uint32_t scalar = static_cast(node->gtSimdVal.u32[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_INT); + uint32_t scalar = static_cast(node->gtSimdVal.u32[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_INT); break; } case TYP_LONG: { int64_t scalar = 
static_cast<int64_t>(node->gtSimdVal.i64[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); break; } case TYP_ULONG: - { + { uint64_t scalar = static_cast<uint64_t>(node->gtSimdVal.u64[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_LONG); + constScalar = comp->gtNewIconNode(scalar, TYP_LONG); break; } } - GenTreeHWIntrinsic* createScalar = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, 16); - GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); + GenTreeHWIntrinsic* createScalar = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, + simdBaseJitType, 16); + GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, + simdBaseJitType, genTypeSize(simdType)); BlockRange().InsertBefore(node, broadcastNode); BlockRange().InsertBefore(broadcastNode, createScalar); BlockRange().InsertBefore(createScalar, constScalar); From 8072d29669b7c60b8e7378549396e309024a383b Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 11 May 2023 14:22:04 -0700 Subject: [PATCH 31/44] bug fixes: 1. added the missing default branch 2. filter out embedded broadcast cases where an existing optimization already does better --- src/coreclr/jit/instr.cpp | 1 - src/coreclr/jit/lowerxarch.cpp | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 44f387e3ced85f..bfbdf30093285b 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1233,7 +1233,6 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) // op1Reg -- The first operand register // op2 -- The second operand, which may be a memory node or a node producing a register // isRMW -- true if the instruction is RMW; otherwise, false -// simdBaseType -- the base data type for this intrinsic. void CodeGen::inst_RV_RV_TT( instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW) { diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 216f506736ae82..390db815500e18 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1040,12 +1040,21 @@ GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) break; } } + // There is a special case when all elements in the vector are all-ones or all-zeros; + // there are existing optimizations for those cases, so filter them out + // of the embedded broadcast. + if (isCreatedFromScalar && (FirstElement == UINT32_MAX || FirstElement == UINT32_MIN)) + { + isCreatedFromScalar = false; + } break; } case TYP_DOUBLE: +#if defined(TARGET_AMD64) case TYP_LONG: case TYP_ULONG: +#endif // TARGET_AMD64 { uint64_t FirstElement = static_cast<uint64_t>(node->gtSimdVal.u64[0]); for (int i = 1; i < ElementCount; i++) { uint64_t ElementToCheck = static_cast<uint64_t>(node->gtSimdVal.u64[i]); if (FirstElement != ElementToCheck) { isCreatedFromScalar = false; break; } } + // There is a special case when all elements in the vector are all-ones or all-zeros; + // there are existing optimizations for those cases, so filter them out + // of the embedded broadcast.
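// (For illustration, the kind of cheaper codegen those filtered vectors
// already get, which an embedded broadcast load would regress:
//
//     vxorps   ymm0, ymm0, ymm0      ; all-zeros vector
//     vpcmpeqd ymm0, ymm0, ymm0      ; all-ones vector
// )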
+ if (isCreatedFromScalar && (FirstElement == UINT64_MAX || FirstElement == UINT64_MIN)) + { + isCreatedFromScalar = false; + } break; } - default: isCreatedFromScalar = false; break; } @@ -1102,6 +1117,7 @@ GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) constScalar = comp->gtNewIconNode(scalar, TYP_INT); break; } +#if defined(TARGET_AMD64) case TYP_LONG: { int64_t scalar = static_cast<int64_t>(node->gtSimdVal.i64[0]); @@ -1114,6 +1130,9 @@ GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) constScalar = comp->gtNewIconNode(scalar, TYP_LONG); break; } +#endif // TARGET_AMD64 + default: + unreached(); } GenTreeHWIntrinsic* createScalar = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, From 7db1c5e3e68cc5cd58decd75ba2bb55e55752f57 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 18 May 2023 13:38:41 -0700 Subject: [PATCH 32/44] resolve the mishandling of the previous merge conflict. --- src/coreclr/jit/hwintrinsic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index 14d4d5dcdbb218..f30f6229be86d6 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -200,10 +200,9 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic is a PermuteVar2x intrinsic HW_Flag_PermuteVar2x = 0x4000000, - +#endif // TARGET_XARCH // The intrinsic is an embedded broadcast compatible intrinsic HW_Flag_EmbBroadcastCompatible = 0x8000000, -#endif // TARGET_XARCH }; #if defined(TARGET_XARCH) From c916008e6ac7495ce4f22ac0d091d23b0af101b6 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 18 May 2023 17:23:38 -0700 Subject: [PATCH 33/44] move the unfolding logic to ContainChecks --- src/coreclr/jit/lower.cpp | 13 -- src/coreclr/jit/lower.h | 2 - src/coreclr/jit/lowerxarch.cpp | 336 ++++++++++++--------------------- 3 files changed, 116 insertions(+), 235 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 3377723a6db10c..d14dbc4be0eb5d 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -587,19 +587,6 @@ GenTree* Lowering::LowerNode(GenTree* node) #ifdef FEATURE_HW_INTRINSICS case GT_HWINTRINSIC: return LowerHWIntrinsic(node->AsHWIntrinsic()); -#ifdef TARGET_XARCH - case GT_CNS_VEC: - { - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) - { - return TryLowerConstVec(node->AsVecCon()); - } - else - break; - } - -#endif // TARGET_XARCH - #endif // FEATURE_HW_INTRINSICS case GT_LCL_FLD: diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index d7eb9cf1ec1681..264405298e9681 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -364,8 +364,6 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); GenTree* TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode); - GenTree* TryLowerConstVec(GenTreeVecCon* node); - void TryCanonizeEmbBroadcastCandicate(GenTreeHWIntrinsic* node); void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp index 390db815500e18..1191ae40168422 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1003,154 +1003,6 @@ void Lowering::LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node) } }
-// TryLowerConstVec: Unfold the Vector to Broadcast->CreateScalarUnsafe->ConstScalar form -// when the constant vector has all the same elements and is an operand -// of an embedded broadcast compatible instruction. -// -// Arguments: -// node - The constant vector node -GenTree* Lowering::TryLowerConstVec(GenTreeVecCon* node) -{ - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTreeHWIntrinsic* embBroadcastNode = nullptr; - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - embBroadcastNode = use.User()->AsHWIntrinsic(); - var_types simdType = embBroadcastNode->TypeGet(); - var_types simdBaseType = embBroadcastNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = embBroadcastNode->GetSimdBaseJitType(); - bool isCreatedFromScalar = true; - - int ElementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); - switch (simdBaseType) - { - case TYP_FLOAT: - case TYP_INT: - case TYP_UINT: - { - uint32_t FirstElement = static_cast(node->gtSimdVal.u32[0]); - for (int i = 1; i < ElementCount; i++) - { - uint32_t ElementToCheck = static_cast(node->gtSimdVal.u32[i]); - if (FirstElement != ElementToCheck) - { - isCreatedFromScalar = false; - break; - } - } - // There are special case when all elements in the vector are 1/0, - // there are exsiting optimization for those case, filter them out - // of the embedded broadcast. - if (isCreatedFromScalar && (FirstElement == UINT32_MAX || FirstElement == UINT32_MIN)) - { - isCreatedFromScalar = false; - } - break; - } - - case TYP_DOUBLE: -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: -#endif // TARGET_AMD64 - { - uint64_t FirstElement = static_cast(node->gtSimdVal.u64[0]); - for (int i = 1; i < ElementCount; i++) - { - uint64_t ElementToCheck = static_cast(node->gtSimdVal.u64[i]); - if (FirstElement != ElementToCheck) - { - isCreatedFromScalar = false; - break; - } - } - // There are special case when all elements in the vector are 1/0, - // there are exsiting optimization for those case, filter them out - // of the embedded broadcast. 
- if (isCreatedFromScalar && (FirstElement == UINT64_MAX || FirstElement == UINT64_MIN)) - { - isCreatedFromScalar = false; - } - break; - } - default: - isCreatedFromScalar = false; - break; - } - if (isCreatedFromScalar) - { - NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - if (simdType == TYP_SIMD32) - { - broadcastName = NI_AVX2_BroadcastScalarToVector256; - } - else if (simdType == TYP_SIMD64) - { - broadcastName = NI_AVX512F_BroadcastScalarToVector512; - } - GenTree* constScalar = nullptr; - switch (simdBaseType) - { - case TYP_FLOAT: - { - float scalar = static_cast(node->gtSimdVal.f32[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); - break; - } - case TYP_DOUBLE: - { - double scalar = static_cast(node->gtSimdVal.f64[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); - break; - } - case TYP_INT: - { - int32_t scalar = static_cast(node->gtSimdVal.i32[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); - break; - } - case TYP_UINT: - { - uint32_t scalar = static_cast(node->gtSimdVal.u32[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_INT); - break; - } -#if defined(TARGET_AMD64) - case TYP_LONG: - { - int64_t scalar = static_cast(node->gtSimdVal.i64[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); - break; - } - case TYP_ULONG: - { - uint64_t scalar = static_cast(node->gtSimdVal.u64[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_LONG); - break; - } -#endif // TARGET_AMD64 - default: - unreached(); - } - GenTreeHWIntrinsic* createScalar = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, - simdBaseJitType, 16); - GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, - simdBaseJitType, genTypeSize(simdType)); - BlockRange().InsertBefore(node, broadcastNode); - BlockRange().InsertBefore(broadcastNode, createScalar); - BlockRange().InsertBefore(createScalar, constScalar); - use.ReplaceWith(broadcastNode); - BlockRange().Remove(node); - LowerNode(createScalar); - return LowerNode(broadcastNode); - } - } - return node->gtNext; -} - //---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsic: Perform containment analysis for a hardware intrinsic node. // @@ -2508,8 +2360,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); - // Seek for optimization opportunities using embedded broadcast. - TryCanonizeEmbBroadcastCandicate(node); break; } default: @@ -2542,12 +2392,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) LowerNode(tmp1); node->ResetHWIntrinsicId(NI_AVX2_BroadcastScalarToVector256, tmp1); - - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) - { - // If AVX512 is supported, seek for optimization opportunities using embedded broadcast. - TryCanonizeEmbBroadcastCandicate(node); - } return LowerNode(node); } @@ -2622,15 +2466,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Avx2.BroadcastScalarToVector128(tmp1); node->ChangeHWIntrinsicId(NI_AVX2_BroadcastScalarToVector128, tmp1); - - // if AVX512 is supported, seek for optimization opportunities using embedded broadcast. 
- // contain the broadcast intrinsics in the embeddebd broadcast compatible intrinsics - // at codegen phase, directly emit the operend on "Create" node instead of a series of broadcast. - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) - { - // If AVX512 is supported, seek for optimization opportunities using embedded broadcast. - TryCanonizeEmbBroadcastCandicate(node); - } return LowerNode(node); } @@ -2862,11 +2697,6 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) // return Sse3.MoveAndDuplicate(tmp1); node->ChangeHWIntrinsicId(NI_SSE3_MoveAndDuplicate, tmp1); - if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) - { - // If AVX512 is supported, seek for optimization opportunities using embedded broadcast. - TryCanonizeEmbBroadcastCandicate(node); - } break; } @@ -5323,54 +5153,6 @@ GenTree* Lowering::TryLowerXorOpToGetMaskUpToLowestSetBit(GenTreeOp* xorNode) return blsmskNode; } -//---------------------------------------------------------------------------------------------- -// Lowering::TryCanonizeEmbBroadcastCandicate: -// Tries to canonize the operands of an embedded broadcast op. -// -// Arguments: -// node - `Create` node being lowered -// -// Notes: -// This function will canonize the input operands of a commutive and embedded broadcast op. -// The operand to be broadcasted will be put to the 2nd operand. -// -void Lowering::TryCanonizeEmbBroadcastCandicate(GenTreeHWIntrinsic* node) -{ - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* createUser = nullptr; - // Here we assume we have the form of Broadcast -> CreateScalarUnsafe -> Scalar - GenTree* scalar = node->Op(1)->AsHWIntrinsic()->Op(1); - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - createUser = use.User(); - } - if (createUser != nullptr && scalar->OperIs(GT_LCL_VAR) && createUser->AsHWIntrinsic()->isCommutativeHWIntrinsic()) - { - // swap the embedded broadcast candidate to 2nd operand, convenient to handle the containment - // issue. - switch (scalar->TypeGet()) - { - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - case TYP_FLOAT: - case TYP_DOUBLE: - { - if (node == createUser->AsHWIntrinsic()->Op(1)) - { - std::swap(createUser->AsHWIntrinsic()->Op(1), createUser->AsHWIntrinsic()->Op(2)); - } - break; - } - - default: - break; - } - } -} - //---------------------------------------------------------------------------------------------- // Lowering::LowerBswapOp: Tries to contain GT_BSWAP node when possible // @@ -7618,9 +7400,123 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTreeVecCon* vecCon = childNode->AsVecCon(); canBeContained = !vecCon->IsAllBitsSet() && !vecCon->IsZero(); + //seek for opportunities to unfold the constant vector as the embedded broadcast candidate. 
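// (Condensed view of the rewrite attempted below, illustrative names only:
//
//     before:  ADD(v, CNS_VEC { c, c, c, c })
//     after:   ADD(v, BroadcastScalarToVector256(CreateScalarUnsafe(c)))
//
// doing this during containment analysis means the unfold only fires once
// the parent is already known to be embedded broadcast compatible.)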
+ if(canBeContained && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && vecCon->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && parentNode->OperIsEmbBroadcastCompatible()) + { + var_types simdType = parentNode->TypeGet(); + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; + int elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); + switch (simdBaseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + uint32_t firstElement = static_cast(vecCon->gtSimdVal.u32[0]); + for(int i = 1; i < elementCount; i++) + { + uint32_t elementToCheck = static_cast(vecCon->gtSimdVal.u32[i]); + if(firstElement != elementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + { + uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); + for(int i = 1; i < elementCount; i++) + { + uint64_t elementToCheck = static_cast(vecCon->gtSimdVal.u64[i]); + if(firstElement != elementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + default: + isCreatedFromScalar = false; + break; + } + if(isCreatedFromScalar) + { + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + if(simdType == TYP_SIMD32) + { + broadcastName = NI_AVX2_BroadcastScalarToVector256; + } + else if(simdType == TYP_SIMD64) + { + broadcastName = NI_AVX512F_BroadcastScalarToVector512; + } + GenTree* constScalar = nullptr; + switch(simdBaseType) + { + case TYP_FLOAT: + { + float scalar = static_cast(vecCon->gtSimdVal.f32[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_DOUBLE: + { + double scalar = static_cast(vecCon->gtSimdVal.f64[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_INT: + { + int32_t scalar = static_cast(vecCon->gtSimdVal.i32[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_UINT: + { + uint32_t scalar = static_cast(vecCon->gtSimdVal.u32[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_INT); + break; + } + case TYP_LONG: + { + int64_t scalar = static_cast(vecCon->gtSimdVal.i64[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_ULONG: + { + uint64_t scalar = static_cast(vecCon->gtSimdVal.u64[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_LONG); + break; + } + default: + unreached(); + } + GenTreeHWIntrinsic* createScalar = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, 16); + GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); + BlockRange().InsertBefore(vecCon, broadcastNode); + BlockRange().InsertBefore(broadcastNode, createScalar); + BlockRange().InsertBefore(createScalar, constScalar); + LIR::Use use; + BlockRange().TryGetUse(childNode, &use); + use.ReplaceWith(broadcastNode); + BlockRange().Remove(vecCon); + LowerNode(createScalar); + LowerNode(broadcastNode); + return IsContainableHWIntrinsicOp(parentNode, broadcastNode, supportsRegOptional); + } + } } } - return canBeContained; } @@ -8129,7 +8025,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - MakeSrcContained(node, op2); + MakeSrcContained(node, node->Op(2)); } else if ((isCommutative || (intrinsicId == 
NI_BMI2_MultiplyNoFlags) || (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && From 4ee1f97848545d07b211663d8bc62cd78a1817ae Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 18 May 2023 14:43:53 -0700 Subject: [PATCH 34/44] Code changes based on the review --- src/coreclr/jit/emitxarch.cpp | 16 ++++++---------- src/coreclr/jit/emitxarch.h | 6 +++++- src/coreclr/jit/instr.cpp | 6 ++++++ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 3148a98f5a09b0..e2d887a3ea6b90 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1273,19 +1273,15 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at // AddEvexbBitIfNeeded: set the Evex.b bit if EvexbContext is set in the instruction descriptor. // // Arguments: -// id -- instruction descriptor // code -- opcode bits. // // Return Value: // encoded code with Evex.b set if needed. // -emitter::code_t emitter::AddEvexbBitIfNeeded(const instrDesc* id, code_t code) +emitter::code_t emitter::AddEvexbBit(code_t code) { - if (id->idIsEvexbContext()) - { - assert(hasEvexPrefix(code)); - code |= EVEX_B_BIT; - } + assert(hasEvexPrefix(code)); + code |= EVEX_B_BIT; return code; } @@ -8177,6 +8173,7 @@ void emitter::emitIns_SIMD_R_R_A( } else { + assert(instOptions == INS_OPTS_NONE); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); emitIns_R_A(ins, attr, targetReg, indir); } @@ -8208,6 +8205,7 @@ void emitter::emitIns_SIMD_R_R_C(instruction ins, } else { + assert(instOptions == INS_OPTS_NONE); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); emitIns_R_C(ins, attr, targetReg, fldHnd, offs); } @@ -8270,6 +8268,7 @@ void emitter::emitIns_SIMD_R_R_S( } else { + assert(instOptions == INS_OPTS_NONE); emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true); emitIns_R_S(ins, attr, targetReg, varx, offs); } @@ -16710,7 +16709,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) else { code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEvexbBitIfNeeded(id, code); regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } @@ -16954,7 +16952,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEvexbBitIfNeeded(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form @@ -17197,7 +17194,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeRM(ins); code = AddSimdPrefixIfNeeded(id, code, size); - code = AddEvexbBitIfNeeded(id, code); code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 3081e5df051c95..3fdda27eb1008a 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -294,7 +294,7 @@ bool hasEvexPrefix(code_t code) return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; } code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); -code_t AddEvexbBitIfNeeded(const instrDesc* id, code_t code); +code_t AddEvexbBit(code_t code); //------------------------------------------------------------------------ // AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required.
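// After this patch the prefix selection reads roughly as follows (a sketch
// mirroring the hunk below, not a verbatim copy):
//
//     if (TakesEvexPrefix(id))
//     {
//         code = AddEvexPrefix(ins, code, size);
//         if (id->idIsEvexbContext())
//         {
//             code = AddEvexbBit(code); // asserts the EVEX prefix is present
//         }
//     }
//     else if (TakesVexPrefix(ins))
//     {
//         ...
//     }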
@@ -315,6 +315,10 @@ code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) if (TakesEvexPrefix(id)) { code = AddEvexPrefix(ins, code, size); + if (id->idIsEvexbContext()) + { + code = AddEvexbBit(code); + } } else if (TakesVexPrefix(ins)) { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index bfbdf30093285b..823a1842e6d5fe 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -810,7 +810,9 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(hwintrinsic->isContained()); assert(hwintrinsic->OperIsMemoryLoad()); assert(hwintrinsic->GetOperandCount() == 1); + assert(varTypeIsFloating(simdBaseType)); GenTree* broadcastScalar = hwintrinsic->Op(1); + assert(broadcastScalar->isContained()); if (broadcastScalar->OperIs(GT_LCL_ADDR)) { addr = hwintrinsic->Op(1); @@ -1185,6 +1187,10 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { + if(!GetEmitter()->UseEvexEncoding()) + { + return false; + } // need to check if the datatype is EB compatible, say 32-, 64-bit. insFlags flags = instInfo[ins]; bool IsEmbBroadcastCompatible = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; From 97cb23afa2c3e695fba49594cab86f68bdde8512 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Thu, 18 May 2023 17:54:53 -0700 Subject: [PATCH 35/44] apply format patch --- src/coreclr/jit/instr.cpp | 4 +-- src/coreclr/jit/lowerxarch.cpp | 59 ++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 823a1842e6d5fe..9259682f4c27c0 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -1187,8 +1187,8 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { - if(!GetEmitter()->UseEvexEncoding()) - { + if (!GetEmitter()->UseEvexEncoding()) + { return false; } // need to check if the datatype is EB compatible, say 32-, 64-bit. diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 1191ae40168422..fbcd7616d6c823 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7400,14 +7400,15 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTreeVecCon* vecCon = childNode->AsVecCon(); canBeContained = !vecCon->IsAllBitsSet() && !vecCon->IsZero(); - //seek for opportunities to unfold the constant vector as the embedded broadcast candidate. - if(canBeContained && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && vecCon->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && parentNode->OperIsEmbBroadcastCompatible()) + // seek for opportunities to unfold the constant vector as the embedded broadcast candidate. 
+ if (canBeContained && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + vecCon->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && parentNode->OperIsEmbBroadcastCompatible()) { - var_types simdType = parentNode->TypeGet(); - var_types simdBaseType = parentNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); - bool isCreatedFromScalar = true; - int elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); + var_types simdType = parentNode->TypeGet(); + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; + int elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); switch (simdBaseType) { case TYP_FLOAT: @@ -7415,10 +7416,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case TYP_UINT: { uint32_t firstElement = static_cast(vecCon->gtSimdVal.u32[0]); - for(int i = 1; i < elementCount; i++) + for (int i = 1; i < elementCount; i++) { uint32_t elementToCheck = static_cast(vecCon->gtSimdVal.u32[i]); - if(firstElement != elementToCheck) + if (firstElement != elementToCheck) { isCreatedFromScalar = false; break; @@ -7432,10 +7433,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case TYP_ULONG: { uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); - for(int i = 1; i < elementCount; i++) + for (int i = 1; i < elementCount; i++) { uint64_t elementToCheck = static_cast(vecCon->gtSimdVal.u64[i]); - if(firstElement != elementToCheck) + if (firstElement != elementToCheck) { isCreatedFromScalar = false; break; @@ -7443,66 +7444,70 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } break; } - + default: isCreatedFromScalar = false; break; } - if(isCreatedFromScalar) + if (isCreatedFromScalar) { NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - if(simdType == TYP_SIMD32) + if (simdType == TYP_SIMD32) { broadcastName = NI_AVX2_BroadcastScalarToVector256; } - else if(simdType == TYP_SIMD64) + else if (simdType == TYP_SIMD64) { broadcastName = NI_AVX512F_BroadcastScalarToVector512; } GenTree* constScalar = nullptr; - switch(simdBaseType) + switch (simdBaseType) { case TYP_FLOAT: { float scalar = static_cast(vecCon->gtSimdVal.f32[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); break; } case TYP_DOUBLE: { double scalar = static_cast(vecCon->gtSimdVal.f64[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); break; - } + } case TYP_INT: { int32_t scalar = static_cast(vecCon->gtSimdVal.i32[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); break; } case TYP_UINT: { - uint32_t scalar = static_cast(vecCon->gtSimdVal.u32[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_INT); + uint32_t scalar = static_cast(vecCon->gtSimdVal.u32[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_INT); break; } case TYP_LONG: { int64_t scalar = static_cast(vecCon->gtSimdVal.i64[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); break; } case TYP_ULONG: - { + { uint64_t scalar = static_cast(vecCon->gtSimdVal.u64[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_LONG); + constScalar = 
comp->gtNewIconNode(scalar, TYP_LONG); break; } default: unreached(); } - GenTreeHWIntrinsic* createScalar = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, 16); - GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); + GenTreeHWIntrinsic* createScalar = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, + simdBaseJitType, 16); + GenTreeHWIntrinsic* broadcastNode = + comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, simdBaseJitType, + genTypeSize(simdType)); BlockRange().InsertBefore(vecCon, broadcastNode); BlockRange().InsertBefore(broadcastNode, createScalar); BlockRange().InsertBefore(createScalar, constScalar); From 37b57be821e66d7190aabccf8a0872483c2eefff Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 19 May 2023 13:16:21 -0700 Subject: [PATCH 36/44] support embedded broadcast for GT_IND as the operand of a broadcast node. --- src/coreclr/jit/instr.cpp | 30 +++++++--- src/coreclr/jit/lowerxarch.cpp | 100 ++++++++++++++++----------------- 2 files changed, 71 insertions(+), 59 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 9259682f4c27c0..fd32a230a46744 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -848,15 +848,20 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) // a special case is when the operand of CreateScalarUnsafe is in integer type, // CreateScalarUnsafe node will be fold, so we directly match a pattern of // broadcast -> LCL_VAR(TYP_(U)INT) - assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR, GT_CNS_INT)); + assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR, GT_CNS_INT, GT_IND)); GenTree* scalar = hwintrinsic->Op(1); - if (hwintrinsic->Op(1)->OperIs(GT_LCL_VAR)) + assert(scalar->isContained()); + if (scalar->OperIs(GT_LCL_VAR, GT_IND)) { + // This handles the case: + // BroadcastScalarToVector* -> LCL_VAR/IND. assert(scalar->isContained()); return genOperandDesc(scalar); } else { + // This handles the case: + // BroadcastScalarToVector* -> CNS_INT. 
ssize_t scalarValue = scalar->AsIntCon()->IconValue(); UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), genTypeSize(simdBaseType), simdBaseType); @@ -867,11 +872,22 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) case TYP_FLOAT: case TYP_DOUBLE: { - assert(hwintrinsic->Op(1)->OperIs(GT_HWINTRINSIC)); - op = hwintrinsic->Op(1); - assert(op->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); - assert(op->isContained()); - return genOperandDesc(op->AsHWIntrinsic()->Op(1)); + assert(hwintrinsic->isContained()); + assert(hwintrinsic->Op(1)->OperIs(GT_HWINTRINSIC, GT_IND)); + GenTree* scalar = hwintrinsic->Op(1); + assert(scalar->isContained()); + if(scalar->OperIs(GT_HWINTRINSIC)) + { + // This handles the case: + // BroadcastScalarToVector* -> CreateScalarUnsafe -> LCL_VAR/CNS_DBL + return genOperandDesc(scalar->AsHWIntrinsic()->Op(1)); + } + else + { + // This handles the case: + // BroadcastScalarToVector* -> IND + return genOperandDesc(scalar); + } } default: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index fbcd7616d6c823..5ff0a1a4b195f6 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1631,27 +1631,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) LowerFusedMultiplyAdd(node); break; - case NI_AVX_BroadcastScalarToVector256: - case NI_AVX_BroadcastScalarToVector128: - { - // there can be 2 cases hitting AVX_BroadcastScalarToVector* - // 1. pass the address as LCL_ADDR to AVX.BroadcastScalarToVector256() API - // 2. pass the address as LCL_VAR to AVX.BroadcastScalarToVector256() API - LIR::Use use; - bool foundUse = BlockRange().TryGetUse(node, &use); - if (foundUse && use.User()->OperIs(GT_HWINTRINSIC) && - use.User()->AsHWIntrinsic()->OperIsEmbBroadcastCompatible()) - { - GenTree* op = node->Op(1); - assert(op->OperIs(GT_LCL_ADDR, GT_LCL_VAR)); - if (node == use.User()->AsHWIntrinsic()->Op(1)) - { - std::swap(use.User()->AsHWIntrinsic()->Op(1), use.User()->AsHWIntrinsic()->Op(2)); - } - } - break; - } - default: break; } @@ -7647,6 +7626,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: { + // make the broadcast node containable when embedded broadcast can be enabled. if (intrinsicId == NI_SSE3_MoveAndDuplicate) { // NI_SSE3_MoveAndDuplicate is for Vector128 only. @@ -7656,51 +7636,67 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre parentNode->OperIsEmbBroadcastCompatible()) { GenTree* createScalar = childNode->AsHWIntrinsic()->Op(1); - if (createScalar->OperIs(GT_HWINTRINSIC) && - createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) + switch (createScalar->OperGet()) { - GenTree* scalar = createScalar->AsHWIntrinsic()->Op(1); - if (scalar->OperIs(GT_LCL_VAR)) + case GT_HWINTRINSIC: { - switch (scalar->TypeGet()) + if(createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { - case TYP_FLOAT: - case TYP_DOUBLE: + // Handle the case for: + // BroadcastScalarTovector -> CreateScalarUnsafe -> LCL_VAR/CNS_DBL. 
+ GenTree* scalar = createScalar->AsHWIntrinsic()->Op(1); + if (scalar->OperIs(GT_LCL_VAR)) + { + switch (scalar->TypeGet()) + { + case TYP_FLOAT: + case TYP_DOUBLE: + { + const unsigned opLclNum = scalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister( + opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + MakeSrcContained(createScalar, scalar); + MakeSrcContained(childNode, createScalar); + return true; + } + + default: + return false; + } + } + else if (scalar->OperIs(GT_CNS_DBL)) { - const unsigned opLclNum = scalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister( - opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); MakeSrcContained(createScalar, scalar); MakeSrcContained(childNode, createScalar); return true; } - - default: - return false; } + break; } - else if (scalar->OperIs(GT_CNS_DBL)) + case GT_LCL_VAR: { - MakeSrcContained(createScalar, scalar); + // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be + // fold, we need to specially handle this case. + assert(createScalar->TypeIs(TYP_INT) || createScalar->TypeIs(TYP_UINT) || + createScalar->TypeIs(TYP_LONG) || createScalar->TypeIs(TYP_ULONG)); + const unsigned opLclNum = createScalar->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); MakeSrcContained(childNode, createScalar); return true; } - } - else if (createScalar->OperIs(GT_LCL_VAR)) - { - // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be - // fold, we need to specially handle this case. - assert(createScalar->TypeIs(TYP_INT) || createScalar->TypeIs(TYP_UINT) || - createScalar->TypeIs(TYP_LONG) || createScalar->TypeIs(TYP_ULONG)); - const unsigned opLclNum = createScalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(childNode, createScalar); - return true; - } - else if (createScalar->OperIs(GT_CNS_INT)) - { - MakeSrcContained(childNode, createScalar); - return true; + case GT_CNS_INT: + case GT_IND: + { + // For CNS_INT, similar to the GT_LVL_VAR case. + // If the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be + // fold, we need to specially handle this case. + + // For IND, handle the case for Avx2.BroadcastScalarToVector*(T*) + MakeSrcContained(childNode, createScalar); + return true; + } + default: + break; } } return false; From 14a370a666b9f8bf7b83200e5947ad8cc7a80b3d Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 19 May 2023 13:18:36 -0700 Subject: [PATCH 37/44] bug fixes: Long type should only be on 64-bit system. 
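A 64-bit scalar is only representable as a single icon constant when the target has
64-bit registers (gtNewIconNode carries its value in a target-pointer-sized field), so
the TYP_LONG/TYP_ULONG cases below are guarded with TARGET_AMD64. A standalone C++
sketch of the failure mode on a 32-bit target (illustrative only, not JIT code):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        int64_t scalar = 0x1122334455667788;
        // On a 32-bit target the constant payload is 32 bits wide, so the
        // upper half of the broadcast scalar would silently be lost:
        int32_t asIcon = static_cast<int32_t>(scalar);
        printf("0x%llx -> 0x%x\n", (unsigned long long)scalar, (unsigned)asIcon);
        return 0;
    }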
--- src/coreclr/jit/lowerxarch.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 5ff0a1a4b195f6..6cbc9574acf370 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7408,8 +7408,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } case TYP_DOUBLE: +#if defined(TARGET_AMD64) case TYP_LONG: case TYP_ULONG: +#endif // TARGET_AMD64 { uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); for (int i = 1; i < elementCount; i++) @@ -7466,6 +7468,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre constScalar = comp->gtNewIconNode(scalar, TYP_INT); break; } +#if defined(TARGET_AMD64) case TYP_LONG: { int64_t scalar = static_cast(vecCon->gtSimdVal.i64[0]); @@ -7478,6 +7481,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre constScalar = comp->gtNewIconNode(scalar, TYP_LONG); break; } +#endif // TARGET_AMD64 default: unreached(); } From 64fec11168da29fb6430bb5776fb935712ac5f8a Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 19 May 2023 13:39:41 -0700 Subject: [PATCH 38/44] apply format patch --- src/coreclr/jit/instr.cpp | 12 ++++++------ src/coreclr/jit/lowerxarch.cpp | 14 ++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index fd32a230a46744..09dc40fd7c5bef 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -853,14 +853,14 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(scalar->isContained()); if (scalar->OperIs(GT_LCL_VAR, GT_IND)) { - // This handles the case: + // This handles the case: // BroadcastScalarToVector* -> LCL_VAR/IND. assert(scalar->isContained()); return genOperandDesc(scalar); } else { - // This handles the case: + // This handles the case: // BroadcastScalarToVector* -> CNS_INT. 
ssize_t scalarValue = scalar->AsIntCon()->IconValue(); UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), @@ -876,15 +876,15 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(hwintrinsic->Op(1)->OperIs(GT_HWINTRINSIC, GT_IND)); GenTree* scalar = hwintrinsic->Op(1); assert(scalar->isContained()); - if(scalar->OperIs(GT_HWINTRINSIC)) + if (scalar->OperIs(GT_HWINTRINSIC)) { - // This handles the case: + // This handles the case: // BroadcastScalarToVector* -> CreateScalarUnsafe -> LCL_VAR/CNS_DBL - return genOperandDesc(scalar->AsHWIntrinsic()->Op(1)); + return genOperandDesc(scalar->AsHWIntrinsic()->Op(1)); } else { - // This handles the case: + // This handles the case: // BroadcastScalarToVector* -> IND return genOperandDesc(scalar); } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 6cbc9574acf370..ea47d9d197252b 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7411,7 +7411,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre #if defined(TARGET_AMD64) case TYP_LONG: case TYP_ULONG: -#endif // TARGET_AMD64 +#endif // TARGET_AMD64 { uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); for (int i = 1; i < elementCount; i++) @@ -7481,7 +7481,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre constScalar = comp->gtNewIconNode(scalar, TYP_LONG); break; } -#endif // TARGET_AMD64 +#endif // TARGET_AMD64 default: unreached(); } @@ -7644,7 +7644,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { case GT_HWINTRINSIC: { - if(createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) + if (createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) { // Handle the case for: // BroadcastScalarTovector -> CreateScalarUnsafe -> LCL_VAR/CNS_DBL. @@ -7679,10 +7679,11 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } case GT_LCL_VAR: { - // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be + // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node + // will be // fold, we need to specially handle this case. assert(createScalar->TypeIs(TYP_INT) || createScalar->TypeIs(TYP_UINT) || - createScalar->TypeIs(TYP_LONG) || createScalar->TypeIs(TYP_ULONG)); + createScalar->TypeIs(TYP_LONG) || createScalar->TypeIs(TYP_ULONG)); const unsigned opLclNum = createScalar->AsLclVar()->GetLclNum(); comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); MakeSrcContained(childNode, createScalar); @@ -7692,7 +7693,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case GT_IND: { // For CNS_INT, similar to the GT_LVL_VAR case. - // If the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node will be + // If the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node + // will be // fold, we need to specially handle this case. // For IND, handle the case for Avx2.BroadcastScalarToVector*(T*) From 45b7807d711f9dfd9294ac57f08eaccccc27fade Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Fri, 19 May 2023 16:06:53 -0700 Subject: [PATCH 39/44] Introduce MakeHWIntrinsicSrcContained(): This function will handle the case that constant vector is the operand of embedded broadcast ops. 
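The eligibility test is an all-elements-equal scan over the vector constant. A
standalone sketch of that check (illustrative only; element width fixed at 32 bits,
where the JIT switches on the SIMD base type):

    #include <cstdint>
    #include <cstdio>

    // True when every element equals the first one, i.e. the constant could
    // have been produced by broadcasting a single scalar.
    static bool IsBroadcastOfScalar(const uint32_t* elems, int elementCount)
    {
        for (int i = 1; i < elementCount; i++)
        {
            if (elems[i] != elems[0])
            {
                return false;
            }
        }
        return true;
    }

    int main()
    {
        uint32_t v[8] = {5, 5, 5, 5, 5, 5, 5, 5}; // e.g. Vector256.Create(5u)
        printf("%d\n", IsBroadcastOfScalar(v, 8)); // prints 1
        return 0;
    }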
If the constant vector is eligible for embedded broadcast, will unfold the constatn vector to the corresponding broadcast intrinsic form. --- src/coreclr/jit/lower.h | 3 + src/coreclr/jit/lowerxarch.cpp | 294 +++++++++++++++++++-------------- 2 files changed, 171 insertions(+), 126 deletions(-) diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 264405298e9681..4579763e210dd3 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -109,6 +109,9 @@ class Lowering final : public Phase #ifdef FEATURE_HW_INTRINSICS void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); +#ifdef TARGET_XARCH + void MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTree* childNode); +#endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS #ifdef DEBUG diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index ea47d9d197252b..df07dd77e8b516 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7379,130 +7379,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTreeVecCon* vecCon = childNode->AsVecCon(); canBeContained = !vecCon->IsAllBitsSet() && !vecCon->IsZero(); - // seek for opportunities to unfold the constant vector as the embedded broadcast candidate. - if (canBeContained && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - vecCon->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && parentNode->OperIsEmbBroadcastCompatible()) - { - var_types simdType = parentNode->TypeGet(); - var_types simdBaseType = parentNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); - bool isCreatedFromScalar = true; - int elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); - switch (simdBaseType) - { - case TYP_FLOAT: - case TYP_INT: - case TYP_UINT: - { - uint32_t firstElement = static_cast(vecCon->gtSimdVal.u32[0]); - for (int i = 1; i < elementCount; i++) - { - uint32_t elementToCheck = static_cast(vecCon->gtSimdVal.u32[i]); - if (firstElement != elementToCheck) - { - isCreatedFromScalar = false; - break; - } - } - break; - } - - case TYP_DOUBLE: -#if defined(TARGET_AMD64) - case TYP_LONG: - case TYP_ULONG: -#endif // TARGET_AMD64 - { - uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); - for (int i = 1; i < elementCount; i++) - { - uint64_t elementToCheck = static_cast(vecCon->gtSimdVal.u64[i]); - if (firstElement != elementToCheck) - { - isCreatedFromScalar = false; - break; - } - } - break; - } - - default: - isCreatedFromScalar = false; - break; - } - if (isCreatedFromScalar) - { - NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - if (simdType == TYP_SIMD32) - { - broadcastName = NI_AVX2_BroadcastScalarToVector256; - } - else if (simdType == TYP_SIMD64) - { - broadcastName = NI_AVX512F_BroadcastScalarToVector512; - } - GenTree* constScalar = nullptr; - switch (simdBaseType) - { - case TYP_FLOAT: - { - float scalar = static_cast(vecCon->gtSimdVal.f32[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); - break; - } - case TYP_DOUBLE: - { - double scalar = static_cast(vecCon->gtSimdVal.f64[0]); - constScalar = comp->gtNewDconNode(scalar, simdBaseType); - break; - } - case TYP_INT: - { - int32_t scalar = static_cast(vecCon->gtSimdVal.i32[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); - break; - } - case TYP_UINT: - { - uint32_t scalar = static_cast(vecCon->gtSimdVal.u32[0]); - 
constScalar = comp->gtNewIconNode(scalar, TYP_INT); - break; - } -#if defined(TARGET_AMD64) - case TYP_LONG: - { - int64_t scalar = static_cast(vecCon->gtSimdVal.i64[0]); - constScalar = comp->gtNewIconNode(scalar, simdBaseType); - break; - } - case TYP_ULONG: - { - uint64_t scalar = static_cast(vecCon->gtSimdVal.u64[0]); - constScalar = comp->gtNewIconNode(scalar, TYP_LONG); - break; - } -#endif // TARGET_AMD64 - default: - unreached(); - } - GenTreeHWIntrinsic* createScalar = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, - simdBaseJitType, 16); - GenTreeHWIntrinsic* broadcastNode = - comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, simdBaseJitType, - genTypeSize(simdType)); - BlockRange().InsertBefore(vecCon, broadcastNode); - BlockRange().InsertBefore(broadcastNode, createScalar); - BlockRange().InsertBefore(createScalar, constScalar); - LIR::Use use; - BlockRange().TryGetUse(childNode, &use); - use.ReplaceWith(broadcastNode); - BlockRange().Remove(vecCon); - LowerNode(createScalar); - LowerNode(broadcastNode); - return IsContainableHWIntrinsicOp(parentNode, broadcastNode, supportsRegOptional); - } - } } } return canBeContained; @@ -7724,6 +7600,154 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } } +//---------------------------------------------------------------------------------------------- +// MakeHWIntrInsicSrcContained: Unfold the eligible constant vector when embedded broadcast is +// available. +// +// Arguments: +// parentNode - The hardware intrinsic node +// childNode - The operand node to try contain +// +void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTree* childNode) +{ + assert(childNode->OperIs(GT_CNS_VEC)); + GenTreeVecCon* vecCon = childNode->AsVecCon(); + if (vecCon->IsAllBitsSet() || vecCon->IsZero()) + { + // do not enable embedded broadcast for all 1/0 vectors. 
+ MakeSrcContained(parentNode, childNode); + return; + } + var_types simdType = parentNode->TypeGet(); + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; + int elementCount = GenTreeVecCon::ElementCount(genTypeSize(simdType), simdBaseType); + switch (simdBaseType) + { + case TYP_FLOAT: + case TYP_INT: + case TYP_UINT: + { + uint32_t firstElement = static_cast(vecCon->gtSimdVal.u32[0]); + for (int i = 1; i < elementCount; i++) + { + uint32_t elementToCheck = static_cast(vecCon->gtSimdVal.u32[i]); + if (firstElement != elementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + case TYP_DOUBLE: +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: +#endif // TARGET_AMD64 + { + uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); + for (int i = 1; i < elementCount; i++) + { + uint64_t elementToCheck = static_cast(vecCon->gtSimdVal.u64[i]); + if (firstElement != elementToCheck) + { + isCreatedFromScalar = false; + break; + } + } + break; + } + + default: + isCreatedFromScalar = false; + break; + } + if (isCreatedFromScalar) + { + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + if (simdType == TYP_SIMD32) + { + broadcastName = NI_AVX2_BroadcastScalarToVector256; + } + else if (simdType == TYP_SIMD64) + { + broadcastName = NI_AVX512F_BroadcastScalarToVector512; + } + GenTree* constScalar = nullptr; + switch (simdBaseType) + { + case TYP_FLOAT: + { + float scalar = static_cast(vecCon->gtSimdVal.f32[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_DOUBLE: + { + double scalar = static_cast(vecCon->gtSimdVal.f64[0]); + constScalar = comp->gtNewDconNode(scalar, simdBaseType); + break; + } + case TYP_INT: + { + int32_t scalar = static_cast(vecCon->gtSimdVal.i32[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_UINT: + { + uint32_t scalar = static_cast(vecCon->gtSimdVal.u32[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_INT); + break; + } +#if defined(TARGET_AMD64) + case TYP_LONG: + { + int64_t scalar = static_cast(vecCon->gtSimdVal.i64[0]); + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_ULONG: + { + uint64_t scalar = static_cast(vecCon->gtSimdVal.u64[0]); + constScalar = comp->gtNewIconNode(scalar, TYP_LONG); + break; + } +#endif // TARGET_AMD64 + default: + unreached(); + } + GenTreeHWIntrinsic* createScalar = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, + 16); + GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, + simdBaseJitType, genTypeSize(simdType)); + BlockRange().InsertBefore(childNode, broadcastNode); + BlockRange().InsertBefore(broadcastNode, createScalar); + BlockRange().InsertBefore(createScalar, constScalar); + LIR::Use use; + BlockRange().TryGetUse(childNode, &use); + use.ReplaceWith(broadcastNode); + BlockRange().Remove(childNode); + LowerNode(createScalar); + LowerNode(broadcastNode); + if (varTypeIsFloating(simdBaseType)) + { + MakeSrcContained(broadcastNode, createScalar); + } + else if (constScalar->TypeIs(TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG)) + { + MakeSrcContained(broadcastNode, constScalar); + } + MakeSrcContained(parentNode, broadcastNode); + return; + } + MakeSrcContained(parentNode, childNode); +} + 
//---------------------------------------------------------------------------------------------- // ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware // intrinsic node. @@ -8032,13 +8056,31 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - MakeSrcContained(node, node->Op(2)); + if (op2->OperIs(GT_CNS_VEC) && op2->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && + comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + node->OperIsEmbBroadcastCompatible()) + { + MakeHWIntrinsicSrcContained(node, op2); + } + else + { + MakeSrcContained(node, op2); + } } else if ((isCommutative || (intrinsicId == NI_BMI2_MultiplyNoFlags) || (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - MakeSrcContained(node, op1); + if (op1->OperIs(GT_CNS_VEC) && op1->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && + comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + node->OperIsEmbBroadcastCompatible()) + { + MakeHWIntrinsicSrcContained(node, op1); + } + else + { + MakeSrcContained(node, op1); + } // Swap the operands here to make the containment checks in codegen significantly simpler std::swap(node->Op(1), node->Op(2)); From cb8feb431aaf3caebc20c7a79278c6c8b6ecc1ed Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Mon, 22 May 2023 17:09:38 -0700 Subject: [PATCH 40/44] Code changes based on reviews: 1. a helper function to detect embedded broadcast compatible flag 2. contain logic improvement. 3. typo fixes. --- src/coreclr/jit/codegeninterface.h | 4 +- src/coreclr/jit/gentree.cpp | 4 +- src/coreclr/jit/hwintrinsiclistxarch.h | 2 +- src/coreclr/jit/instr.cpp | 31 +++++++-- src/coreclr/jit/lowerxarch.cpp | 94 +++++++------------------- 5 files changed, 53 insertions(+), 82 deletions(-) diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index c408eae8b5a0f7..c2bcedb8ea9b7e 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -127,7 +127,9 @@ class CodeGenInterface #define INST_FP 0x01 // is it a FP instruction? 
public: static bool instIsFP(instruction ins); - +#if defined(TARGET_XARCH) + static bool instIsEmbeddedBroadcastCompatible(instruction ins); +#endif // TARGET_XARCH //------------------------------------------------------------------------- // Liveness-related fields & methods public: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 30e09327805641..e8ef27402111e8 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19144,8 +19144,8 @@ bool GenTree::isContainableHWIntrinsic() const case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: { - // These intrinsic operations are contained as part of the operand of embedded broadcast compatiable - // instriction + // These intrinsic operations are contained as part of the operand of embedded broadcast compatible + // instruction return true; } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 39625449ae7fb9..246799bdfc1d00 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -826,7 +826,7 @@ HARDWARE_INTRINSIC(AVX2, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512F Intrinsics HARDWARE_INTRINSIC(AVX512F, Abs, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pabsd, INS_invalid, INS_vpabsq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX512F, Add, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_EmbBroadcastCompatible) HARDWARE_INTRINSIC(AVX512F, AlignRight32, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignd, INS_valignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, AlignRight64, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_valignq, INS_valignq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, And, 64, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_vpandq, INS_vpandq, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 09dc40fd7c5bef..85380796aad2b5 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -335,6 +335,22 @@ bool CodeGenInterface::instIsFP(instruction ins) #endif } +#if defined(TARGET_XARCH) +/***************************************************************************** + * + * Returns non-zero if the given CPU instruction is an embedded broadcast + * compatible instruction. 
+ */ + +// static inline +bool CodeGenInterface::instIsEmbeddedBroadcastCompatible(instruction ins) +{ + assert((unsigned)ins < ArrLen(instInfo)); + + return (instInfo[ins] & INS_Flags_EmbeddedBroadcastSupported) != 0; +} +#endif // TARGET_XARCH + /***************************************************************************** * * Generate a set instruction. @@ -1207,18 +1223,19 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { return false; } - // need to check if the datatype is EB compatible, say 32-, 64-bit. - insFlags flags = instInfo[ins]; - bool IsEmbBroadcastCompatible = (flags & INS_Flags_EmbeddedBroadcastSupported) != 0; + + bool IsEmbBroadcastCompatible = instIsEmbeddedBroadcastCompatible(ins); if (!IsEmbBroadcastCompatible) { return false; } - insFlags inputSize = static_cast((CodeGenInterface::instInfo[ins] & Input_Mask)); - // Embedded broadcast can be applied when operands are in the following forms. - // 1. Broadcast -> CreateScalar -> LCL_VAR/CNS + // 1. (contained)Broadcast -> ContainedNode + if (!op->isContained() || !op->OperIsHWIntrinsic()) + { + return false; + } bool IsEmbBroadcastEnabled = false; switch (op->OperGet()) { @@ -1239,7 +1256,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) // 1. embedded broadcast compatible intrinsics // 2. proper forms on the intrinsic operands. // 3. EVEX enabled. - return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled && GetEmitter()->UseEvexEncoding(); + return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled; } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index df07dd77e8b516..2555b953eecddf 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7515,70 +7515,23 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL) && parentNode->OperIsEmbBroadcastCompatible()) { - GenTree* createScalar = childNode->AsHWIntrinsic()->Op(1); - switch (createScalar->OperGet()) + GenTree* broadcastOperand = childNode->AsHWIntrinsic()->Op(1); + bool childSupportsRegOptional; + if (broadcastOperand->OperIs(GT_LCL_VAR)) { - case GT_HWINTRINSIC: - { - if (createScalar->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe) - { - // Handle the case for: - // BroadcastScalarTovector -> CreateScalarUnsafe -> LCL_VAR/CNS_DBL. - GenTree* scalar = createScalar->AsHWIntrinsic()->Op(1); - if (scalar->OperIs(GT_LCL_VAR)) - { - switch (scalar->TypeGet()) - { - case TYP_FLOAT: - case TYP_DOUBLE: - { - const unsigned opLclNum = scalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister( - opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(createScalar, scalar); - MakeSrcContained(childNode, createScalar); - return true; - } - - default: - return false; - } - } - else if (scalar->OperIs(GT_CNS_DBL)) - { - MakeSrcContained(createScalar, scalar); - MakeSrcContained(childNode, createScalar); - return true; - } - } - break; - } - case GT_LCL_VAR: - { - // if the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node - // will be - // fold, we need to specially handle this case. 
- assert(createScalar->TypeIs(TYP_INT) || createScalar->TypeIs(TYP_UINT) || - createScalar->TypeIs(TYP_LONG) || createScalar->TypeIs(TYP_ULONG)); - const unsigned opLclNum = createScalar->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - MakeSrcContained(childNode, createScalar); - return true; - } - case GT_CNS_INT: - case GT_IND: - { - // For CNS_INT, similar to the GT_LVL_VAR case. - // If the operand of the CreateScalarUnsafe node is in Integer type, CreateScalarUnsafe node - // will be - // fold, we need to specially handle this case. - - // For IND, handle the case for Avx2.BroadcastScalarToVector*(T*) - MakeSrcContained(childNode, createScalar); - return true; - } - default: - break; + const unsigned opLclNum = broadcastOperand->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + } + else if (broadcastOperand->OperIs(GT_HWINTRINSIC) && + broadcastOperand->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)) + { + assert(broadcastOperand->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); + const unsigned opLclNum = broadcastOperand->AsHWIntrinsic()->Op(1)->AsLclVar()->GetLclNum(); + comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); + } + if (IsContainableHWIntrinsicOp(childNode->AsHWIntrinsic(), broadcastOperand, &childSupportsRegOptional)) + { + return true; } } return false; @@ -7612,12 +7565,8 @@ void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTr { assert(childNode->OperIs(GT_CNS_VEC)); GenTreeVecCon* vecCon = childNode->AsVecCon(); - if (vecCon->IsAllBitsSet() || vecCon->IsZero()) - { - // do not enable embedded broadcast for all 1/0 vectors. 
- MakeSrcContained(parentNode, childNode); - return; - } + assert(!vecCon->IsAllBitsSet()); + assert(!vecCon->IsZero()); var_types simdType = parentNode->TypeGet(); var_types simdBaseType = parentNode->GetSimdBaseType(); CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); @@ -7676,6 +7625,10 @@ void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTr { broadcastName = NI_AVX512F_BroadcastScalarToVector512; } + else + { + assert(simdType == TYP_SIMD16); + } GenTree* constScalar = nullptr; switch (simdBaseType) { @@ -8071,8 +8024,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) (intrinsicId == NI_BMI2_X64_MultiplyNoFlags)) && IsContainableHWIntrinsicOp(node, op1, &supportsOp1RegOptional)) { - if (op1->OperIs(GT_CNS_VEC) && op1->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && - comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + if (op1->OperIs(GT_CNS_VEC) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && node->OperIsEmbBroadcastCompatible()) { MakeHWIntrinsicSrcContained(node, op1); From 3fe0a2fc321a07103168eb58598855401b2ef798 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 23 May 2023 11:04:12 -0700 Subject: [PATCH 41/44] Code changes based on review --- src/coreclr/jit/instr.cpp | 121 +++++++++------------------------ src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 53 ++++++--------- 3 files changed, 53 insertions(+), 123 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 85380796aad2b5..be105a896c1ea9 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -827,17 +827,17 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) assert(hwintrinsic->OperIsMemoryLoad()); assert(hwintrinsic->GetOperandCount() == 1); assert(varTypeIsFloating(simdBaseType)); - GenTree* broadcastScalar = hwintrinsic->Op(1); - assert(broadcastScalar->isContained()); - if (broadcastScalar->OperIs(GT_LCL_ADDR)) + GenTree* hwintrinsicChild = hwintrinsic->Op(1); + assert(hwintrinsicChild->isContained()); + if (hwintrinsicChild->OperIs(GT_LCL_ADDR, GT_CLS_VAR_ADDR, GT_CNS_INT, GT_LEA)) { addr = hwintrinsic->Op(1); break; } else { - assert(broadcastScalar->OperIs(GT_LCL_VAR)); - return OperandDesc(simdBaseType, broadcastScalar); + assert(hwintrinsicChild->OperIs(GT_LCL_VAR)); + return OperandDesc(simdBaseType, hwintrinsicChild); } } @@ -846,68 +846,33 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: { - assert(op->isContained()); + assert(hwintrinsic->isContained()); if (intrinsicId == NI_SSE3_MoveAndDuplicate) { assert(simdBaseType == TYP_DOUBLE); } - // if broadcast node is contained, should mean that we have some forms like - // broadcast -> CreateScalarUnsafe -> scalar. - // if so, directly emit scalar. - switch (simdBaseType) + // If broadcast node is contained, should mean that we have some forms like + // Broadcast -> CreateScalarUnsafe -> Scalar. + // If so, directly emit scalar. + // In the codes below, we specially handle the `Broadcast -> CNS_INT` form and + // handle other cases recursively. 
+ GenTree* hwintrinsicChild = hwintrinsic->Op(1); + if(hwintrinsicChild->OperIs(GT_CNS_INT)) { - case TYP_INT: - case TYP_UINT: - case TYP_LONG: - case TYP_ULONG: - { - // a special case is when the operand of CreateScalarUnsafe is in integer type, - // CreateScalarUnsafe node will be fold, so we directly match a pattern of - // broadcast -> LCL_VAR(TYP_(U)INT) - assert(hwintrinsic->Op(1)->OperIs(GT_LCL_VAR, GT_CNS_INT, GT_IND)); - GenTree* scalar = hwintrinsic->Op(1); - assert(scalar->isContained()); - if (scalar->OperIs(GT_LCL_VAR, GT_IND)) - { - // This handles the case: - // BroadcastScalarToVector* -> LCL_VAR/IND. - assert(scalar->isContained()); - return genOperandDesc(scalar); - } - else - { - // This handles the case: - // BroadcastScalarToVector* -> CNS_INT. - ssize_t scalarValue = scalar->AsIntCon()->IconValue(); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), + // a special case is when the operand of CreateScalarUnsafe is in integer type, + // CreateScalarUnsafe node will be fold, so we directly match a pattern of + // broadcast -> LCL_VAR(TYP_(U)INT) + assert(hwintrinsicChild->isContained()); + ssize_t scalarValue = hwintrinsicChild->AsIntCon()->IconValue(); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), genTypeSize(simdBaseType), simdBaseType); - return OperandDesc(compiler->eeFindJitDataOffs(cnum)); - } - } - - case TYP_FLOAT: - case TYP_DOUBLE: - { - assert(hwintrinsic->isContained()); - assert(hwintrinsic->Op(1)->OperIs(GT_HWINTRINSIC, GT_IND)); - GenTree* scalar = hwintrinsic->Op(1); - assert(scalar->isContained()); - if (scalar->OperIs(GT_HWINTRINSIC)) - { - // This handles the case: - // BroadcastScalarToVector* -> CreateScalarUnsafe -> LCL_VAR/CNS_DBL - return genOperandDesc(scalar->AsHWIntrinsic()->Op(1)); - } - else - { - // This handles the case: - // BroadcastScalarToVector* -> IND - return genOperandDesc(scalar); - } - } - - default: - unreached(); + return OperandDesc(compiler->eeFindJitDataOffs(cnum)); + } + else + { + // If the operand of broadcast is not a constant integer, + // we handle all the other cases recursively. + return genOperandDesc(hwintrinsicChild); } break; } @@ -1219,44 +1184,24 @@ void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenT #if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS) bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { + // To enable embedded broadcast, we need 3 things, + // 1. EVEX enabled. + // 2. Embedded broadcast compatible intrinsics + // 3. A contained broadcast scalar node if (!GetEmitter()->UseEvexEncoding()) { return false; } - - bool IsEmbBroadcastCompatible = instIsEmbeddedBroadcastCompatible(ins); - if (!IsEmbBroadcastCompatible) + if (!instIsEmbeddedBroadcastCompatible(ins)) { return false; } - - // Embedded broadcast can be applied when operands are in the following forms. - // 1. (contained)Broadcast -> ContainedNode if (!op->isContained() || !op->OperIsHWIntrinsic()) { return false; } - bool IsEmbBroadcastEnabled = false; - switch (op->OperGet()) - { - case GT_HWINTRINSIC: - { - if (op->isContained() && op->AsHWIntrinsic()->OperIsBroadcastScalar()) - { - IsEmbBroadcastEnabled = true; - } - break; - } - - default: - break; - } - - // to enable embedded broadcast, we need 3 things, - // 1. embedded broadcast compatible intrinsics - // 2. proper forms on the intrinsic operands. - // 3. EVEX enabled. 
- return IsEmbBroadcastCompatible && IsEmbBroadcastEnabled; + + return op->AsHWIntrinsic()->OperIsBroadcastScalar(); } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 4579763e210dd3..37dbedd12fb72f 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -110,7 +110,7 @@ class Lowering final : public Phase void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree* addr); void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node); #ifdef TARGET_XARCH - void MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTree* childNode); + void TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode); #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 2555b953eecddf..480a5e25445b2e 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7517,18 +7517,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre { GenTree* broadcastOperand = childNode->AsHWIntrinsic()->Op(1); bool childSupportsRegOptional; - if (broadcastOperand->OperIs(GT_LCL_VAR)) - { - const unsigned opLclNum = broadcastOperand->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - } - else if (broadcastOperand->OperIs(GT_HWINTRINSIC) && - broadcastOperand->AsHWIntrinsic()->Op(1)->OperIs(GT_LCL_VAR)) - { - assert(broadcastOperand->AsHWIntrinsic()->GetHWIntrinsicId() == NI_Vector128_CreateScalarUnsafe); - const unsigned opLclNum = broadcastOperand->AsHWIntrinsic()->Op(1)->AsLclVar()->GetLclNum(); - comp->lvaSetVarDoNotEnregister(opLclNum DEBUGARG(DoNotEnregisterReason::LiveInOutOfHandler)); - } if (IsContainableHWIntrinsicOp(childNode->AsHWIntrinsic(), broadcastOperand, &childSupportsRegOptional)) { return true; @@ -7540,8 +7528,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX_BroadcastScalarToVector128: case NI_AVX_BroadcastScalarToVector256: { - GenTree* childNodeOp = hwintrinsic->Op(1); - assert(childNodeOp->OperIs(GT_LCL_ADDR, GT_LCL_VAR)); return parentNode->OperIsEmbBroadcastCompatible(); } @@ -7554,19 +7540,18 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } //---------------------------------------------------------------------------------------------- -// MakeHWIntrInsicSrcContained: Unfold the eligible constant vector when embedded broadcast is -// available. +// TryFoldCnsVecForEmbeddedBroadcast: +// Unfold the eligible constant vector when embedded broadcast is +// available. 
// // Arguments: // parentNode - The hardware intrinsic node // childNode - The operand node to try contain // -void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTree* childNode) +void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeVecCon* childNode) { - assert(childNode->OperIs(GT_CNS_VEC)); - GenTreeVecCon* vecCon = childNode->AsVecCon(); - assert(!vecCon->IsAllBitsSet()); - assert(!vecCon->IsZero()); + assert(!childNode->IsAllBitsSet()); + assert(!childNode->IsZero()); var_types simdType = parentNode->TypeGet(); var_types simdBaseType = parentNode->GetSimdBaseType(); CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); @@ -7578,10 +7563,10 @@ void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTr case TYP_INT: case TYP_UINT: { - uint32_t firstElement = static_cast(vecCon->gtSimdVal.u32[0]); + uint32_t firstElement = static_cast(childNode->gtSimdVal.u32[0]); for (int i = 1; i < elementCount; i++) { - uint32_t elementToCheck = static_cast(vecCon->gtSimdVal.u32[i]); + uint32_t elementToCheck = static_cast(childNode->gtSimdVal.u32[i]); if (firstElement != elementToCheck) { isCreatedFromScalar = false; @@ -7597,10 +7582,10 @@ void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTr case TYP_ULONG: #endif // TARGET_AMD64 { - uint64_t firstElement = static_cast(vecCon->gtSimdVal.u64[0]); + uint64_t firstElement = static_cast(childNode->gtSimdVal.u64[0]); for (int i = 1; i < elementCount; i++) { - uint64_t elementToCheck = static_cast(vecCon->gtSimdVal.u64[i]); + uint64_t elementToCheck = static_cast(childNode->gtSimdVal.u64[i]); if (firstElement != elementToCheck) { isCreatedFromScalar = false; @@ -7634,38 +7619,38 @@ void Lowering::MakeHWIntrinsicSrcContained(GenTreeHWIntrinsic* parentNode, GenTr { case TYP_FLOAT: { - float scalar = static_cast(vecCon->gtSimdVal.f32[0]); + float scalar = static_cast(childNode->gtSimdVal.f32[0]); constScalar = comp->gtNewDconNode(scalar, simdBaseType); break; } case TYP_DOUBLE: { - double scalar = static_cast(vecCon->gtSimdVal.f64[0]); + double scalar = static_cast(childNode->gtSimdVal.f64[0]); constScalar = comp->gtNewDconNode(scalar, simdBaseType); break; } case TYP_INT: { - int32_t scalar = static_cast(vecCon->gtSimdVal.i32[0]); + int32_t scalar = static_cast(childNode->gtSimdVal.i32[0]); constScalar = comp->gtNewIconNode(scalar, simdBaseType); break; } case TYP_UINT: { - uint32_t scalar = static_cast(vecCon->gtSimdVal.u32[0]); + uint32_t scalar = static_cast(childNode->gtSimdVal.u32[0]); constScalar = comp->gtNewIconNode(scalar, TYP_INT); break; } #if defined(TARGET_AMD64) case TYP_LONG: { - int64_t scalar = static_cast(vecCon->gtSimdVal.i64[0]); + int64_t scalar = static_cast(childNode->gtSimdVal.i64[0]); constScalar = comp->gtNewIconNode(scalar, simdBaseType); break; } case TYP_ULONG: { - uint64_t scalar = static_cast(vecCon->gtSimdVal.u64[0]); + uint64_t scalar = static_cast(childNode->gtSimdVal.u64[0]); constScalar = comp->gtNewIconNode(scalar, TYP_LONG); break; } @@ -8009,11 +7994,11 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - if (op2->OperIs(GT_CNS_VEC) && op2->TypeIs(TYP_SIMD16, TYP_SIMD32, TYP_SIMD64) && + if (op2->OperIs(GT_CNS_VEC) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && node->OperIsEmbBroadcastCompatible()) { - MakeHWIntrinsicSrcContained(node, op2); + TryFoldCnsVecForEmbeddedBroadcast(node, 
op2->AsVecCon()); } else { @@ -8027,7 +8012,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (op1->OperIs(GT_CNS_VEC) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && node->OperIsEmbBroadcastCompatible()) { - MakeHWIntrinsicSrcContained(node, op1); + TryFoldCnsVecForEmbeddedBroadcast(node, op1->AsVecCon()); } else { From 6fb6e4894710089a8688491ddf71623e931481f2 Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 23 May 2023 11:16:51 -0700 Subject: [PATCH 42/44] apply format patch --- src/coreclr/jit/instr.cpp | 8 ++++---- src/coreclr/jit/lowerxarch.cpp | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index be105a896c1ea9..4b1b19e6ddb53e 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -857,15 +857,15 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) // In the codes below, we specially handle the `Broadcast -> CNS_INT` form and // handle other cases recursively. GenTree* hwintrinsicChild = hwintrinsic->Op(1); - if(hwintrinsicChild->OperIs(GT_CNS_INT)) + if (hwintrinsicChild->OperIs(GT_CNS_INT)) { // a special case is when the operand of CreateScalarUnsafe is in integer type, // CreateScalarUnsafe node will be fold, so we directly match a pattern of // broadcast -> LCL_VAR(TYP_(U)INT) assert(hwintrinsicChild->isContained()); ssize_t scalarValue = hwintrinsicChild->AsIntCon()->IconValue(); - UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), - genTypeSize(simdBaseType), simdBaseType); + UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType), + genTypeSize(simdBaseType), simdBaseType); return OperandDesc(compiler->eeFindJitDataOffs(cnum)); } else @@ -1200,7 +1200,7 @@ bool CodeGenInterface::IsEmbeddedBroadcastEnabled(instruction ins, GenTree* op) { return false; } - + return op->AsHWIntrinsic()->OperIsBroadcastScalar(); } #endif // TARGET_XARCH && FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 480a5e25445b2e..9f498b7342f7d5 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -7540,7 +7540,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre } //---------------------------------------------------------------------------------------------- -// TryFoldCnsVecForEmbeddedBroadcast: +// TryFoldCnsVecForEmbeddedBroadcast: // Unfold the eligible constant vector when embedded broadcast is // available. // @@ -7994,8 +7994,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if (IsContainableHWIntrinsicOp(node, op2, &supportsOp2RegOptional)) { - if (op2->OperIs(GT_CNS_VEC) && - comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + if (op2->OperIs(GT_CNS_VEC) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && node->OperIsEmbBroadcastCompatible()) { TryFoldCnsVecForEmbeddedBroadcast(node, op2->AsVecCon()); From 3b3d0d1de182535f0c4c48a800306e50fea0490d Mon Sep 17 00:00:00 2001 From: Ruihan-Yin Date: Tue, 23 May 2023 14:36:39 -0700 Subject: [PATCH 43/44] Code changes based on review: 1. deleted irrelevant comments. Move the contain check up to cover more cases. 
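Hoisting the containment assert ahead of the operand-shape dispatch means it now guards
every accepted shape, not just the folded-integer path. A runnable toy model of the
resulting control flow (illustrative only; the enum stands in for GenTree opers):

    #include <cassert>
    #include <cstdio>

    enum OperShape { SHAPE_CNS_INT, SHAPE_LCL_VAR, SHAPE_IND };

    static void ClassifyBroadcastOperand(OperShape shape, bool isContained)
    {
        assert(isContained); // hoisted: holds for every shape below
        if (shape == SHAPE_CNS_INT)
        {
            // folded integer CreateScalarUnsafe: emit the scalar into the
            // data section and address it from there
            printf("emit data-section constant\n");
        }
        else
        {
            // LCL_VAR/IND (and CreateScalarUnsafe) are classified recursively
            printf("recurse into operand\n");
        }
    }

    int main()
    {
        ClassifyBroadcastOperand(SHAPE_LCL_VAR, true);
        return 0;
    }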
---
 src/coreclr/jit/instr.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index 4b1b19e6ddb53e..93c4e601bb7811 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -765,9 +765,7 @@ void CodeGen::inst_RV_SH(
 //    logic for determining what "kind" of operand "op" is.
 //
 // Arguments:
-//    op - The operand node for which to obtain the descriptor
-//    instOptions - The optional parameter to track if embedded broadcast is enabled
-//    simdBaseType - The base data type of the emitting instruction.
+//    op - The operand node for which to obtain the descriptor.
 //
 // Return Value:
 //    The operand descriptor for "op".
@@ -857,12 +855,12 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op)
                 // In the codes below, we specially handle the `Broadcast -> CNS_INT` form and
                 // handle other cases recursively.
                 GenTree* hwintrinsicChild = hwintrinsic->Op(1);
+                assert(hwintrinsicChild->isContained());
                 if (hwintrinsicChild->OperIs(GT_CNS_INT))
                 {
                     // a special case is when the operand of CreateScalarUnsafe is in integer type,
                     // CreateScalarUnsafe node will be fold, so we directly match a pattern of
                     // broadcast -> LCL_VAR(TYP_(U)INT)
-                    assert(hwintrinsicChild->isContained());
                     ssize_t        scalarValue = hwintrinsicChild->AsIntCon()->IconValue();
                     UNATIVE_OFFSET cnum = emit->emitDataConst(&scalarValue, genTypeSize(simdBaseType),
                                                               genTypeSize(simdBaseType), simdBaseType);

From 36af7b77bc2d6724e91a742f666cd4b2e145f62e Mon Sep 17 00:00:00 2001
From: Ruihan-Yin
Date: Wed, 31 May 2023 16:06:43 -0700
Subject: [PATCH 44/44] Code changes based on review:

1. Updated the comment to keep up with the changes in instrDesc.
2. Removed an unneeded argument from an unrelated method.
---
 src/coreclr/jit/emit.h        | 4 ++--
 src/coreclr/jit/emitxarch.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index ee22b55e2f2993..ac0bcf0dffef36 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -871,8 +871,8 @@ class emitter
     ////////////////////////////////////////////////////////////////////////
     // Space taken up to here (with/without prev offset, assuming host==target):
-    // x86: 52/48 bits
-    // amd64: 53/48 bits
+    // x86: 53/49 bits
+    // amd64: 54/49 bits
     // arm: 54/50 bits
     // arm64: 57/52 bits
     // loongarch64: 53/48 bits

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index e2d887a3ea6b90..dd6800347a59c5 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -8535,7 +8535,7 @@ void emitter::emitIns_SIMD_R_R_R_S(
     assert((op2Reg != targetReg) || (op1Reg == targetReg));

     emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
-    emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs, INS_OPTS_NONE);
+    emitIns_R_R_S(ins, attr, targetReg, op2Reg, varx, offs);
 }

 //------------------------------------------------------------------------
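For reference, the net effect of the series can be pictured with compiler intrinsics.
A standalone C++ sketch (not JIT code; assumes an AVX-512F-capable compiler, e.g. built
with -mavx512f), where _mm512_set1_ps plays the role of BroadcastScalarToVector512 and
the add can encode with an embedded-broadcast memory operand such as
vaddps zmm0, zmm0, dword ptr [mem]{1to16} instead of loading a full 64-byte constant:

    #include <immintrin.h>

    __m512 AddBroadcast(__m512 v, const float* p)
    {
        // The scalar load can fold into the vaddps encoding via EVEX.b,
        // replicating *p across all 16 float lanes.
        return _mm512_add_ps(v, _mm512_set1_ps(*p));
    }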