From 0c4a5ed417989e9727a46fe2c929d577fd85c144 Mon Sep 17 00:00:00 2001
From: SingleAccretion
Date: Tue, 16 Aug 2022 23:17:59 +0300
Subject: [PATCH 1/3] Add a test

---
 .../JitBlue/Runtime_73951/Runtime_73951.cs    | 76 +++++++++++++++++++
 .../Runtime_73951/Runtime_73951.csproj        |  9 +++
 2 files changed, 85 insertions(+)
 create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs
 create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj

diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs
new file mode 100644
index 00000000000000..1f4f5895fdc4c6
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs
@@ -0,0 +1,76 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+public class Runtime_73951
+{
+    [ThreadStatic]
+    public static IRuntime s_rt;
+    [ThreadStatic]
+    public static S1 s_17;
+
+    public static ushort s_result;
+
+    public static int Main()
+    {
+        Problem(new Runtime());
+
+        return s_result == 0 ? 100 : 101;
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    public static void Problem(IRuntime rt)
+    {
+        s_rt = rt;
+        S0 vr21 = s_17.F1;
+        new S1(new object()).M105(vr21);
+
+        var vr22 = new C0(vr21.F3);
+        s_rt.Capture(vr22.F1);
+    }
+
+    public class C0
+    {
+        public ushort F1;
+        public C0(ushort f1)
+        {
+            F1 = f1;
+        }
+    }
+
+    public struct S0
+    {
+        public uint F1;
+        public int F2;
+        public byte F3;
+    }
+
+    public struct S1
+    {
+        public object F0;
+        public S0 F1;
+        public S1(object f0) : this()
+        {
+            F0 = f0;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public S1 M105(S0 arg0)
+        {
+            return this;
+        }
+    }
+
+    public interface IRuntime
+    {
+        void Capture(ushort value);
+    }
+
+    public class Runtime : IRuntime
+    {
+        public void Capture(ushort value) => s_result = value;
+    }
+}
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj
new file mode 100644
index 00000000000000..f492aeac9d056b
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
\ No newline at end of file

From 011f58c270bd01c1dbaf028a10672d604b8d8464 Mon Sep 17 00:00:00 2001
From: SingleAccretion
Date: Tue, 16 Aug 2022 22:49:34 +0300
Subject: [PATCH 2/3] Do not widen uses of normalize-on-load locals

Widening such uses breaks the assumption that all LCL_VAR uses of these
locals, when typed small, are normalized (and equivalent to each other).
Instead, explicitly check for the legality of widening the loads in LSRA
and codegen.
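
For readers skimming the diff, a rough C# sketch of the new legality rule
follows. The helper name and parameters are invented for illustration only
and are not JIT APIs; the real checks live in genPutArgStkFieldList and
BuildPutArgStk, and pointerSize corresponds to TARGET_POINTER_SIZE (4 on x86).

    static class WideningRule
    {
        // Sketch: when may a small-typed stack-argument load be widened to a full slot?
        public static bool CanLoadFullSlot(int nodeSize, int fieldSize, bool isLocalRead,
                                           bool isSpillTemp, int pointerSize = 4)
        {
            // Already a full pointer-sized value: it can be pushed as-is.
            if (nodeSize == pointerSize)
                return true;

            // Spill temps are allocated at least a slot wide, so over-reading them is safe.
            if (isSpillTemp)
                return true;

            // A local read may be widened only when the extra bytes land in padding,
            // i.e. the load already covers at least as many bytes as the outgoing field.
            return isLocalRead && (nodeSize >= fieldSize);
        }
    }
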
---
 src/coreclr/jit/codegenxarch.cpp | 38 +++++++++++++++++++++-----------
 src/coreclr/jit/lowerxarch.cpp   | 12 ----------
 src/coreclr/jit/lsraxarch.cpp    |  3 ++-
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index c788bbbe253a11..1ac2052c8b3805 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7892,9 +7892,9 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
 
     for (GenTreeFieldList::Use& use : fieldList->Uses())
     {
-        GenTree* const fieldNode   = use.GetNode();
-        const unsigned fieldOffset = use.GetOffset();
-        var_types      fieldType   = use.GetType();
+        GenTree* const  fieldNode   = use.GetNode();
+        const unsigned  fieldOffset = use.GetOffset();
+        const var_types fieldType   = use.GetType();
 
         // Long-typed nodes should have been handled by the decomposition pass, and lowering should have sorted the
         // field list in descending order by offset.
@@ -7917,8 +7917,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             int adjustment = roundUp(currentOffset - fieldOffset, 4);
             if (fieldIsSlot && !varTypeIsSIMD(fieldType))
             {
-                fieldType = genActualType(fieldType);
-                unsigned pushSize = genTypeSize(fieldType);
+                unsigned pushSize = genTypeSize(genActualType(fieldType));
                 assert((pushSize % 4) == 0);
                 adjustment -= pushSize;
                 while (adjustment != 0)
@@ -7967,12 +7966,21 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
            }

            bool canStoreWithPush = fieldIsSlot;
-            bool canLoadWithPush  = varTypeIsI(fieldNode) || genIsValidIntReg(argReg);
+            bool canLoadWithPush  = genIsValidIntReg(argReg);
+            if (argReg == REG_NA)
+            {
+                assert((genTypeSize(fieldNode) <= TARGET_POINTER_SIZE));
+                assert(genTypeSize(genActualType(fieldNode)) == genTypeSize(genActualType(fieldType)));
+
+                // We can widen local loads if the excess only affects padding bits.
+                canLoadWithPush = (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) || fieldNode->isUsedFromSpillTemp() ||
+                                  (fieldNode->OperIsLocalRead() && (genTypeSize(fieldNode) >= genTypeSize(fieldType)));
+            }

            if (canStoreWithPush && canLoadWithPush)
            {
                assert(m_pushStkArg);
-                assert(genTypeSize(fieldNode) == TARGET_POINTER_SIZE);
+                assert(genTypeSize(fieldNode) <= TARGET_POINTER_SIZE);
                inst_TT(INS_push, emitActualTypeSize(fieldNode), fieldNode);
                currentOffset -= TARGET_POINTER_SIZE;

@@ -7995,9 +8003,10 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
            }
            else
            {
-                // TODO-XArch-CQ: using "ins_Load" here is conservative, as it will always
-                // extend, which we can avoid if the field type is smaller than the node type.
-                inst_RV_TT(ins_Load(fieldNode->TypeGet()), emitTypeSize(fieldNode), intTmpReg, fieldNode);
+                // Use the smaller "mov" instruction in case we do not need a sign/zero-extending load.
+                instruction loadIns  = canLoadWithPush ? INS_mov : ins_Load(fieldNode->TypeGet());
+                emitAttr    loadSize = canLoadWithPush ? EA_PTRSIZE : emitTypeSize(fieldNode);
+                inst_RV_TT(loadIns, loadSize, intTmpReg, fieldNode);
            }

            argReg = intTmpReg;
@@ -8012,13 +8021,16 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
            else
 #endif // defined(FEATURE_SIMD)
            {
-                genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+                // Using wide stores here avoids having to reserve a byteable register when we could not
+                // use "push" due to the field node being an indirection (i. e. for "!canLoadFullSlot").
+                var_types storeType = canStoreFullSlot ? genActualType(fieldType) : fieldType;
+                genStoreRegToStackArg(storeType, argReg, fieldOffset - currentOffset);
            }

            if (m_pushStkArg)
            {
-                // We always push a slot-rounded size
-                currentOffset -= genTypeSize(fieldType);
+                // We always push a slot-rounded size.
+                currentOffset -= roundUp(genTypeSize(fieldType), TARGET_POINTER_SIZE);
            }
        }

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 4ef151f15b5f9c..7800ee166d2cc8 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -507,18 +507,6 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
            // registers to be consumed atomically by the call.
            if (varTypeIsIntegralOrI(fieldNode))
            {
-                // If we are loading from an in-memory local, we would like to use "push", but this
-                // is only legal if we can safely load all 4 bytes. Retype the local node here to
-                // TYP_INT for such legal cases to make downstream (LSRA & codegen) logic simpler.
-                // Retyping is ok because we model this node as STORE(LOAD).
-                // If the field came from promotion, we allow the padding to remain undefined, if
-                // from decomposition, the field type will be INT (naturally blocking the retyping).
-                if (varTypeIsSmall(fieldNode) && (genTypeSize(fieldType) <= genTypeSize(fieldNode)) &&
-                    fieldNode->OperIsLocalRead())
-                {
-                    fieldNode->ChangeType(TYP_INT);
-                }
-
                if (IsContainableImmed(putArgStk, fieldNode))
                {
                    MakeSrcContained(putArgStk, fieldNode);
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 9fdfe585c68e8e..11b962c76b4f36 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1514,7 +1514,8 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
        // field is not in the same slot. (Note that we store the fields in reverse order.)
        const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
        const bool canStoreWithPush = fieldIsSlot;
-        const bool canLoadWithPush = varTypeIsI(fieldNode);
+        const bool canLoadWithPush = (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) ||
+                                     (fieldNode->OperIsLocalRead() && (genTypeSize(fieldNode) >= genTypeSize(fieldType)));

        if ((!canStoreWithPush || !canLoadWithPush) && (intTemp == nullptr))
        {

From 797ed8ee41d2af55e449d3e632ce2301636b14a4 Mon Sep 17 00:00:00 2001
From: SingleAccretion
Date: Tue, 16 Aug 2022 22:59:15 +0300
Subject: [PATCH 3/3] "Load/StoreWithPush" -> "Load/StoreFullSlot"

The load variant in codegen now has a double purpose.
---
 src/coreclr/jit/codegenxarch.cpp | 12 ++++++------
 src/coreclr/jit/lsraxarch.cpp    | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 1ac2052c8b3805..7a8c6b362c2943 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7965,19 +7965,19 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
                }
            }

-            bool canStoreWithPush = fieldIsSlot;
-            bool canLoadWithPush  = genIsValidIntReg(argReg);
+            bool canStoreFullSlot = fieldIsSlot;
+            bool canLoadFullSlot  = genIsValidIntReg(argReg);
            if (argReg == REG_NA)
            {
                assert((genTypeSize(fieldNode) <= TARGET_POINTER_SIZE));
                assert(genTypeSize(genActualType(fieldNode)) == genTypeSize(genActualType(fieldType)));

                // We can widen local loads if the excess only affects padding bits.
-                canLoadWithPush = (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) || fieldNode->isUsedFromSpillTemp() ||
+                canLoadFullSlot = (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) || fieldNode->isUsedFromSpillTemp() ||
                                  (fieldNode->OperIsLocalRead() && (genTypeSize(fieldNode) >= genTypeSize(fieldType)));
            }

-            if (canStoreWithPush && canLoadWithPush)
+            if (canStoreFullSlot && canLoadFullSlot)
            {
                assert(m_pushStkArg);
                assert(genTypeSize(fieldNode) <= TARGET_POINTER_SIZE);
@@ -8004,8 +8004,8 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
            else
            {
                // Use the smaller "mov" instruction in case we do not need a sign/zero-extending load.
-                instruction loadIns  = canLoadWithPush ? INS_mov : ins_Load(fieldNode->TypeGet());
-                emitAttr    loadSize = canLoadWithPush ? EA_PTRSIZE : emitTypeSize(fieldNode);
+                instruction loadIns  = canLoadFullSlot ? INS_mov : ins_Load(fieldNode->TypeGet());
+                emitAttr    loadSize = canLoadFullSlot ? EA_PTRSIZE : emitTypeSize(fieldNode);
                inst_RV_TT(loadIns, loadSize, intTmpReg, fieldNode);
            }

diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 11b962c76b4f36..1c97652c5a4764 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1512,18 +1512,18 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)

        // We can treat as a slot any field that is stored at a slot boundary, where the previous
        // field is not in the same slot. (Note that we store the fields in reverse order.)
-        const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
-        const bool canStoreWithPush = fieldIsSlot;
-        const bool canLoadWithPush = (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) ||
+        const bool canStoreFullSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
+        const bool canLoadFullSlot =
+            (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) ||
            (fieldNode->OperIsLocalRead() && (genTypeSize(fieldNode) >= genTypeSize(fieldType)));

-        if ((!canStoreWithPush || !canLoadWithPush) && (intTemp == nullptr))
+        if ((!canStoreFullSlot || !canLoadFullSlot) && (intTemp == nullptr))
        {
            intTemp = buildInternalIntRegisterDefForNode(putArgStk);
        }

        // We can only store bytes using byteable registers.
-        if (!canStoreWithPush && varTypeIsByte(fieldType))
+        if (!canStoreFullSlot && varTypeIsByte(fieldType))
        {
            intTemp->registerAssignment &= allByteRegs();
        }
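
Illustration (not part of the patches): the "excess only affects padding bits"
argument above can be seen from C#. When a byte-sized field lives in a struct
that is padded out to a full 4-byte slot, a widened 4-byte read stays inside
the value and only the low byte is meaningful, which is what makes the wider
"push"/"mov" loads of small locals safe. The struct and names below are
invented for the demo.

    using System;
    using System.Runtime.CompilerServices;
    using System.Runtime.InteropServices;

    // Padded out to a full 4-byte slot, so a 4-byte read stays in bounds and the
    // three high bytes are padding that no consumer of the field inspects.
    [StructLayout(LayoutKind.Sequential, Size = 4)]
    struct PaddedByte
    {
        public byte Value;
    }

    class PaddingDemo
    {
        static void Main()
        {
            PaddedByte local = default;
            local.Value = 0x2A;

            // A widened (4-byte) load of the 1-byte field: only the low byte matters.
            uint widened = Unsafe.As<PaddedByte, uint>(ref local);

            Console.WriteLine($"field = 0x{local.Value:X2}, low byte of widened load = 0x{(widened & 0xFF):X2}");
        }
    }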