diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index c788bbbe253a11..7a8c6b362c2943 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -7892,9 +7892,9 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
 
     for (GenTreeFieldList::Use& use : fieldList->Uses())
     {
-        GenTree* const fieldNode   = use.GetNode();
-        const unsigned fieldOffset = use.GetOffset();
-        var_types      fieldType   = use.GetType();
+        GenTree* const  fieldNode   = use.GetNode();
+        const unsigned  fieldOffset = use.GetOffset();
+        const var_types fieldType   = use.GetType();
 
         // Long-typed nodes should have been handled by the decomposition pass, and lowering should have sorted the
         // field list in descending order by offset.
@@ -7917,8 +7917,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
         int adjustment = roundUp(currentOffset - fieldOffset, 4);
         if (fieldIsSlot && !varTypeIsSIMD(fieldType))
         {
-            fieldType         = genActualType(fieldType);
-            unsigned pushSize = genTypeSize(fieldType);
+            unsigned pushSize = genTypeSize(genActualType(fieldType));
             assert((pushSize % 4) == 0);
             adjustment -= pushSize;
             while (adjustment != 0)
@@ -7966,13 +7965,22 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             }
         }
 
-        bool canStoreWithPush = fieldIsSlot;
-        bool canLoadWithPush  = varTypeIsI(fieldNode) || genIsValidIntReg(argReg);
+        bool canStoreFullSlot = fieldIsSlot;
+        bool canLoadFullSlot  = genIsValidIntReg(argReg);
+        if (argReg == REG_NA)
+        {
+            assert((genTypeSize(fieldNode) <= TARGET_POINTER_SIZE));
+            assert(genTypeSize(genActualType(fieldNode)) == genTypeSize(genActualType(fieldType)));
+
+            // We can widen local loads if the excess only affects padding bits.
+            canLoadFullSlot = (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) || fieldNode->isUsedFromSpillTemp() ||
+                              (fieldNode->OperIsLocalRead() && (genTypeSize(fieldNode) >= genTypeSize(fieldType)));
+        }
 
-        if (canStoreWithPush && canLoadWithPush)
+        if (canStoreFullSlot && canLoadFullSlot)
        {
             assert(m_pushStkArg);
-            assert(genTypeSize(fieldNode) == TARGET_POINTER_SIZE);
+            assert(genTypeSize(fieldNode) <= TARGET_POINTER_SIZE);
             inst_TT(INS_push, emitActualTypeSize(fieldNode), fieldNode);
 
             currentOffset -= TARGET_POINTER_SIZE;
@@ -7995,9 +8003,10 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
                 }
                 else
                 {
-                    // TODO-XArch-CQ: using "ins_Load" here is conservative, as it will always
-                    // extend, which we can avoid if the field type is smaller than the node type.
-                    inst_RV_TT(ins_Load(fieldNode->TypeGet()), emitTypeSize(fieldNode), intTmpReg, fieldNode);
+                    // Use the smaller "mov" instruction in case we do not need a sign/zero-extending load.
+                    instruction loadIns  = canLoadFullSlot ? INS_mov : ins_Load(fieldNode->TypeGet());
+                    emitAttr    loadSize = canLoadFullSlot ? EA_PTRSIZE : emitTypeSize(fieldNode);
+                    inst_RV_TT(loadIns, loadSize, intTmpReg, fieldNode);
                 }
 
                 argReg = intTmpReg;
@@ -8012,13 +8021,16 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
             else
 #endif // defined(FEATURE_SIMD)
             {
-                genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+                // Using wide stores here avoids having to reserve a byteable register when we could not
+                // use "push" due to the field node being an indirection (i. e. for "!canLoadFullSlot").
+                var_types storeType = canStoreFullSlot ? genActualType(fieldType) : fieldType;
+                genStoreRegToStackArg(storeType, argReg, fieldOffset - currentOffset);
             }
 
             if (m_pushStkArg)
             {
-                // We always push a slot-rounded size
-                currentOffset -= genTypeSize(fieldType);
+                // We always push a slot-rounded size.
+                currentOffset -= roundUp(genTypeSize(fieldType), TARGET_POINTER_SIZE);
             }
         }
 
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 4ef151f15b5f9c..7800ee166d2cc8 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -507,18 +507,6 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
             // registers to be consumed atomically by the call.
             if (varTypeIsIntegralOrI(fieldNode))
             {
-                // If we are loading from an in-memory local, we would like to use "push", but this
-                // is only legal if we can safely load all 4 bytes. Retype the local node here to
-                // TYP_INT for such legal cases to make downstream (LSRA & codegen) logic simpler.
-                // Retyping is ok because we model this node as STORE(LOAD).
-                // If the field came from promotion, we allow the padding to remain undefined, if
-                // from decomposition, the field type will be INT (naturally blocking the retyping).
-                if (varTypeIsSmall(fieldNode) && (genTypeSize(fieldType) <= genTypeSize(fieldNode)) &&
-                    fieldNode->OperIsLocalRead())
-                {
-                    fieldNode->ChangeType(TYP_INT);
-                }
-
                 if (IsContainableImmed(putArgStk, fieldNode))
                 {
                     MakeSrcContained(putArgStk, fieldNode);
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 9fdfe585c68e8e..1c97652c5a4764 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1512,17 +1512,18 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
 
                 // We can treat as a slot any field that is stored at a slot boundary, where the previous
                 // field is not in the same slot. (Note that we store the fields in reverse order.)
-                const bool fieldIsSlot      = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
-                const bool canStoreWithPush = fieldIsSlot;
-                const bool canLoadWithPush  = varTypeIsI(fieldNode);
+                const bool canStoreFullSlot = ((fieldOffset % 4) == 0) && ((prevOffset - fieldOffset) >= 4);
+                const bool canLoadFullSlot =
+                    (genTypeSize(fieldNode) == TARGET_POINTER_SIZE) ||
+                    (fieldNode->OperIsLocalRead() && (genTypeSize(fieldNode) >= genTypeSize(fieldType)));
 
-                if ((!canStoreWithPush || !canLoadWithPush) && (intTemp == nullptr))
+                if ((!canStoreFullSlot || !canLoadFullSlot) && (intTemp == nullptr))
                 {
                     intTemp = buildInternalIntRegisterDefForNode(putArgStk);
                 }
 
                 // We can only store bytes using byteable registers.
-                if (!canStoreWithPush && varTypeIsByte(fieldType))
+                if (!canStoreFullSlot && varTypeIsByte(fieldType))
                 {
                     intTemp->registerAssignment &= allByteRegs();
                 }
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs
new file mode 100644
index 00000000000000..1f4f5895fdc4c6
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.cs
@@ -0,0 +1,76 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+public class Runtime_73951
+{
+    [ThreadStatic]
+    public static IRuntime s_rt;
+    [ThreadStatic]
+    public static S1 s_17;
+
+    public static ushort s_result;
+
+    public static int Main()
+    {
+        Problem(new Runtime());
+
+        return s_result == 0 ? 100 : 101;
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    public static void Problem(IRuntime rt)
+    {
+        s_rt = rt;
+        S0 vr21 = s_17.F1;
+        new S1(new object()).M105(vr21);
+
+        var vr22 = new C0(vr21.F3);
+        s_rt.Capture(vr22.F1);
+    }
+
+    public class C0
+    {
+        public ushort F1;
+        public C0(ushort f1)
+        {
+            F1 = f1;
+        }
+    }
+
+    public struct S0
+    {
+        public uint F1;
+        public int F2;
+        public byte F3;
+    }
+
+    public struct S1
+    {
+        public object F0;
+        public S0 F1;
+        public S1(object f0) : this()
+        {
+            F0 = f0;
+        }
+
+        [MethodImpl(MethodImplOptions.NoInlining)]
+        public S1 M105(S0 arg0)
+        {
+            return this;
+        }
+    }
+
+    public interface IRuntime
+    {
+        void Capture(ushort value);
+    }
+
+    public class Runtime : IRuntime
+    {
+        public void Capture(ushort value) => s_result = value;
+    }
+}
diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj
new file mode 100644
index 00000000000000..f492aeac9d056b
--- /dev/null
+++ b/src/tests/JIT/Regression/JitBlue/Runtime_73951/Runtime_73951.csproj
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
\ No newline at end of file