From 392c6528f0f9a3fd649c647cb4336687002f62d4 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 18 Jun 2024 16:20:58 -0700 Subject: [PATCH 01/23] Initial commit --- src/coreclr/inc/yieldprocessornormalized.h | 33 ++ .../Runtime/yieldprocessornormalized.cpp | 103 +----- .../Runtime/yieldprocessornormalized.h | 227 +----------- src/coreclr/vm/finalizerthread.h | 9 + src/coreclr/vm/synch.h | 2 + src/coreclr/vm/yieldprocessornormalized.cpp | 294 +--------------- .../vm/yieldprocessornormalizedshared.cpp | 330 ++++++++++++++++++ 7 files changed, 378 insertions(+), 620 deletions(-) create mode 100644 src/coreclr/vm/yieldprocessornormalizedshared.cpp diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 121e60b033356d..c8566e95749153 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -6,7 +6,11 @@ // Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where // the intention is to use the system-default implementation of YieldProcessor(). #define HAS_SYSTEM_YIELDPROCESSOR +#ifdef FEATURE_NATIVEAOT +FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } +#else FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } +#endif #ifdef YieldProcessor #undef YieldProcessor #endif @@ -20,6 +24,35 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } T() = delete; \ DISABLE_COPY(T) +#ifdef FEATURE_NATIVEAOT +#define static_assert_no_msg( cond ) static_assert( cond, #cond ) +// I haven't seen an equivalent for GetTickCount yet, defining it like this for now +unsigned int GetTickCount() { + return 0; +} +#define SIZE_T uintptr_t +// verify these are correct +typedef BYTE UINT8; +typedef ULONGLONG UINT64; + +template +T Min(T v1, T v2) +{ + // STATIC_CONTRACT_LEAF; + return v1 < v2 ? v1 : v2; +} + +template +T Max(T v1, T v2) +{ + // STATIC_CONTRACT_LEAF; + return v1 > v2 ? v1 : v2; +} + +void InitializeYieldProcessorNormalizedCrst(); +void EnsureYieldProcessorNormalizedInitialized(); +#endif + class YieldProcessorNormalization { public: diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp index 444d52b0114c03..a9eb72a147d6d0 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp @@ -14,105 +14,6 @@ #include "slist.h" #include "volatile.h" #include "yieldprocessornormalized.h" +#include "../../vm/synch.h" -#define ULONGLONG int64_t - -static Volatile s_isYieldProcessorNormalizedInitialized = false; -static CrstStatic s_initializeYieldProcessorNormalizedCrst; - -// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are -// tuned for Skylake processors -unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake -unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7; - -void InitializeYieldProcessorNormalizedCrst() -{ - WRAPPER_NO_CONTRACT; - s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized); -} - -static void InitializeYieldProcessorNormalized() -{ - WRAPPER_NO_CONTRACT; - - CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); - - if (s_isYieldProcessorNormalizedInitialized) - { - return; - } - - // Intel pre-Skylake processor: measured typically 14-17 cycles per yield - // Intel post-Skylake processor: measured typically 125-150 cycles per yield - const int MeasureDurationMs = 10; - const int NsPerSecond = 1000 * 1000 * 1000; - - ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency(); - - if (ticksPerSecond < 1000 / MeasureDurationMs) - { - // High precision clock not available or clock resolution is too low, resort to defaults - s_isYieldProcessorNormalizedInitialized = true; - return; - } - - // Measure the nanosecond delay per yield - ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs); - unsigned int yieldCount = 0; - ULONGLONG startTicks = PalQueryPerformanceCounter(); - ULONGLONG elapsedTicks; - do - { - // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask - // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the - // low microsecond range. - for (int i = 0; i < 1000; ++i) - { - System_YieldProcessor(); - } - yieldCount += 1000; - - ULONGLONG nowTicks = PalQueryPerformanceCounter(); - elapsedTicks = nowTicks - startTicks; - } while (elapsedTicks < measureDurationTicks); - double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond); - if (nsPerYield < 1) - { - nsPerYield = 1; - } - - // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this - // value is naturally limited to MinNsPerNormalizedYield. - int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5); - if (yieldsPerNormalizedYield < 1) - { - yieldsPerNormalizedYield = 1; - } - _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield); - - // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to - // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a - // better job of allowing other work to run. - int optimalMaxNormalizedYieldsPerSpinIteration = - (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); - if (optimalMaxNormalizedYieldsPerSpinIteration < 1) - { - optimalMaxNormalizedYieldsPerSpinIteration = 1; - } - - g_yieldsPerNormalizedYield = yieldsPerNormalizedYield; - g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration; - s_isYieldProcessorNormalizedInitialized = true; - - GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); -} - -void EnsureYieldProcessorNormalizedInitialized() -{ - WRAPPER_NO_CONTRACT; - - if (!s_isYieldProcessorNormalizedInitialized) - { - InitializeYieldProcessorNormalized(); - } -} +#include "../../vm/yieldprocessornormalizedshared.cpp" diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h index 8c74bf3cfe3002..6f29c02308df05 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h @@ -1,229 +1,4 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#pragma once - -#include - -// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where -// the intention is to use the system-default implementation of YieldProcessor(). -#define HAS_SYSTEM_YIELDPROCESSOR -FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } -#ifdef YieldProcessor -#undef YieldProcessor -#endif -#define YieldProcessor Dont_Use_YieldProcessor -#ifdef PalYieldProcessor -#undef PalYieldProcessor -#endif -#define PalYieldProcessor Dont_Use_PalYieldProcessor - -#define SIZE_T uintptr_t - -const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake -const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake - -extern unsigned int g_yieldsPerNormalizedYield; -extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration; - -void InitializeYieldProcessorNormalizedCrst(); -void EnsureYieldProcessorNormalizedInitialized(); - -class YieldProcessorNormalizationInfo -{ -private: - unsigned int yieldsPerNormalizedYield; - unsigned int optimalMaxNormalizedYieldsPerSpinIteration; - unsigned int optimalMaxYieldsPerSpinIteration; - -public: - YieldProcessorNormalizationInfo() - : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield), - optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration), - optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) - { - } - - friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); - friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int); - friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int); - friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int); -}; - -// See YieldProcessorNormalized() for preliminary info. Typical usage: -// if (!condition) -// { -// YieldProcessorNormalizationInfo normalizationInfo; -// do -// { -// YieldProcessorNormalized(normalizationInfo); -// } while (!condition); -// } -FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo) -{ - unsigned int n = normalizationInfo.yieldsPerNormalizedYield; - _ASSERTE(n != 0); - do - { - System_YieldProcessor(); - } while (--n != 0); -} - -// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the -// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following: -// - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value -// for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessary increase CPU usage -// and decrease scalability of the operation. -// while(!condition) -// { -// YieldProcessorNormalized(); -// } -// - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the -// condition, otherwise it may unnecessarily increase latency of the operation -// - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in -// yield count per iteration for each failed check of the condition, the progression can significantly magnify the second -// issue above on later iterations. -// - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each -// issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using -// System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method. -FORCEINLINE void YieldProcessorNormalized() -{ - YieldProcessorNormalized(YieldProcessorNormalizationInfo()); -} - -// See YieldProcessorNormalized(count) for preliminary info. Typical usage: -// if (!moreExpensiveCondition) -// { -// YieldProcessorNormalizationInfo normalizationInfo; -// do -// { -// YieldProcessorNormalized(normalizationInfo, 2); -// } while (!moreExpensiveCondition); -// } -FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count) -{ - _ASSERTE(count != 0); - - if (sizeof(SIZE_T) <= sizeof(unsigned int)) - { - // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield - // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized(). - const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield; - if (count > MaxCount) - { - count = MaxCount; - } - } - - SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; - _ASSERTE(n != 0); - do - { - System_YieldProcessor(); - } while (--n != 0); -} - -// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is -// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage: -// while(!moreExpensiveCondition) -// { -// YieldProcessorNormalized(2); -// } -FORCEINLINE void YieldProcessorNormalized(unsigned int count) -{ - YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count); -} - -// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary -// info. Typical usage: -// if (!condition) -// { -// YieldProcessorNormalizationInfo normalizationInfo; -// do -// { -// YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100); -// } while (!condition); -// } -FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount( - const YieldProcessorNormalizationInfo &normalizationInfo, - unsigned int preSkylakeCount) -{ - _ASSERTE(preSkylakeCount != 0); - - if (sizeof(SIZE_T) <= sizeof(unsigned int)) - { - // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield - // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized(). - const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield; - if (preSkylakeCount > MaxCount) - { - preSkylakeCount = MaxCount; - } - } - - const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; - SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; - if (n == 0) - { - n = 1; - } - do - { - System_YieldProcessor(); - } while (--n != 0); -} - -// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned -// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in -// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a -// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage: -// while(!condition) -// { -// YieldProcessorNormalizedForPreSkylakeCount(100); -// } -FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount) -{ - YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount); -} - -// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the -// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait -// iteration exponentially up to a limit. Typical usage: -// if (!conditionThatMayNotBeSatisfiedSoon) -// { -// YieldProcessorNormalizationInfo normalizationInfo; -// do -// { -// YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally -// } while (!conditionThatMayNotBeSatisfiedSoon); -// } -FORCEINLINE void YieldProcessorWithBackOffNormalized( - const YieldProcessorNormalizationInfo &normalizationInfo, - unsigned int spinIteration) -{ - // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in - // InitializeYieldProcessorNormalized() - const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration = - NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1; - _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); - - // This shift value should be adjusted based on the asserted condition below - const uint8_t MaxShift = 3; - static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); - - unsigned int n; - if (spinIteration <= MaxShift && - ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration) - { - n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; - } - else - { - n = normalizationInfo.optimalMaxYieldsPerSpinIteration; - } - _ASSERTE(n != 0); - do - { - System_YieldProcessor(); - } while (--n != 0); -} +#include "../../inc/yieldprocessornormalized.h" diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h index 5ea3cca02bebfe..3fb9822530a776 100644 --- a/src/coreclr/vm/finalizerthread.h +++ b/src/coreclr/vm/finalizerthread.h @@ -5,6 +5,13 @@ #ifndef _FINALIZER_THREAD_H_ #define _FINALIZER_THREAD_H_ +#ifdef FEATURE_NATIVEAOT +typedef void VOID; +GPTR_IMPL(Thread, g_pFinalizerThread); +// Global state variable indicating if the EE has been started up. +Volatile g_fEEStarted = FALSE; +#endif + class FinalizerThread { static BOOL fQuitFinalizer; @@ -50,6 +57,7 @@ class FinalizerThread static OBJECTREF GetNextFinalizableObject(); +#ifndef FEATURE_NATIVEAOT static void RaiseShutdownEvents() { WRAPPER_NO_CONTRACT; @@ -64,6 +72,7 @@ class FinalizerThread hEventFinalizerToShutDown->Wait(INFINITE, /*alertable*/ TRUE); } } +#endif static void FinalizerThreadWait(DWORD timeout = INFINITE); diff --git a/src/coreclr/vm/synch.h b/src/coreclr/vm/synch.h index 72e19f1c33b602..d07d89b2d5f772 100644 --- a/src/coreclr/vm/synch.h +++ b/src/coreclr/vm/synch.h @@ -134,6 +134,7 @@ class CLREvent : public CLREventBase }; +#ifndef FEATURE_NATIVEAOT // CLREventStatic // Same as CLREvent, but intended to be used for global variables. // Instances may leak their handle, because of the order in which @@ -142,6 +143,7 @@ class CLREvent : public CLREventBase class CLREventStatic : public CLREventBase { }; +#endif BOOL CLREventWaitWithTry(CLREventBase *pEvent, DWORD timeout, BOOL fAlertable, DWORD *pStatus); #endif diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp index 14166de34dd641..9b4c685be25460 100644 --- a/src/coreclr/vm/yieldprocessornormalized.cpp +++ b/src/coreclr/vm/yieldprocessornormalized.cpp @@ -7,296 +7,4 @@ #include "finalizerthread.h" -enum class NormalizationState : UINT8 -{ - Uninitialized, - Initialized, - Failed -}; - -static const int NsPerYieldMeasurementCount = 8; -static const unsigned int MeasurementPeriodMs = 4000; - -static const unsigned int NsPerS = 1000 * 1000 * 1000; - -static NormalizationState s_normalizationState = NormalizationState::Uninitialized; -static unsigned int s_previousNormalizationTimeMs; - -static UINT64 s_performanceCounterTicksPerS; -static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; -static int s_nextMeasurementIndex; -static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; - -static unsigned int DetermineMeasureDurationUs() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_PREEMPTIVE; - } - CONTRACTL_END; - - _ASSERTE(s_normalizationState != NormalizationState::Failed); - - // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration - // if the overhead seems high relative to the measure duration. - unsigned int measureDurationUs = 1; - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - UINT64 startTicks = li.QuadPart; - QueryPerformanceCounter(&li); - UINT64 elapsedTicks = li.QuadPart - startTicks; - if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration - { - measureDurationUs *= 4; - } - return measureDurationUs; -} - -static double MeasureNsPerYield(unsigned int measureDurationUs) -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_PREEMPTIVE; - } - CONTRACTL_END; - - _ASSERTE(s_normalizationState != NormalizationState::Failed); - - int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; - UINT64 ticksPerS = s_performanceCounterTicksPerS; - UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); - - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - UINT64 startTicks = li.QuadPart; - - for (int i = 0; i < yieldCount; ++i) - { - System_YieldProcessor(); - } - - QueryPerformanceCounter(&li); - UINT64 elapsedTicks = li.QuadPart - startTicks; - while (elapsedTicks < measureDurationTicks) - { - int nextYieldCount = - Max(4, - elapsedTicks == 0 - ? yieldCount / 4 - : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); - for (int i = 0; i < nextYieldCount; ++i) - { - System_YieldProcessor(); - } - - QueryPerformanceCounter(&li); - elapsedTicks = li.QuadPart - startTicks; - yieldCount += nextYieldCount; - } - - // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op - const double MinNsPerYield = 0.1; - - // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to - // really take this long. Limit the maximum to keep the recorded values reasonable. - const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1; - - return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); -} - -void YieldProcessorNormalization::PerformMeasurement() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_PREEMPTIVE; - } - CONTRACTL_END; - - _ASSERTE(s_isMeasurementScheduled); - - double latestNsPerYield; - if (s_normalizationState == NormalizationState::Initialized) - { - if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) - { - return; - } - - int nextMeasurementIndex = s_nextMeasurementIndex; - latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs()); - AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield); - if (++nextMeasurementIndex >= NsPerYieldMeasurementCount) - { - nextMeasurementIndex = 0; - } - s_nextMeasurementIndex = nextMeasurementIndex; - } - else if (s_normalizationState == NormalizationState::Uninitialized) - { - LARGE_INTEGER li; - if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000) - { - // High precision clock not available or clock resolution is too low, resort to defaults - s_normalizationState = NormalizationState::Failed; - return; - } - s_performanceCounterTicksPerS = li.QuadPart; - - unsigned int measureDurationUs = DetermineMeasureDurationUs(); - for (int i = 0; i < NsPerYieldMeasurementCount; ++i) - { - latestNsPerYield = MeasureNsPerYield(measureDurationUs); - AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield); - if (i == 0 || latestNsPerYield < s_establishedNsPerYield) - { - AtomicStore(&s_establishedNsPerYield, latestNsPerYield); - } - - if (i < NsPerYieldMeasurementCount - 1) - { - FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); - } - } - } - else - { - _ASSERTE(s_normalizationState == NormalizationState::Failed); - return; - } - - double establishedNsPerYield = s_nsPerYieldMeasurements[0]; - for (int i = 1; i < NsPerYieldMeasurementCount; ++i) - { - double nsPerYield = s_nsPerYieldMeasurements[i]; - if (nsPerYield < establishedNsPerYield) - { - establishedNsPerYield = nsPerYield; - } - } - if (establishedNsPerYield != s_establishedNsPerYield) - { - AtomicStore(&s_establishedNsPerYield, establishedNsPerYield); - } - - FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); - - // Calculate the number of yields required to span the duration of a normalized yield - unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); - _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield); - s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; - - // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to - // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a - // better job of allowing other work to run. - s_optimalMaxNormalizedYieldsPerSpinIteration = - Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); - _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); - - GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); - - s_previousNormalizationTimeMs = GetTickCount(); - s_normalizationState = NormalizationState::Initialized; - s_isMeasurementScheduled = false; -} - - -void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_ANY; - } - CONTRACTL_END; - - NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState); - if (normalizationState == NormalizationState::Initialized) - { - if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) - { - return; - } - } - else if (normalizationState == NormalizationState::Uninitialized) - { - } - else - { - _ASSERTE(normalizationState == NormalizationState::Failed); - return; - } - - // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below - if (s_isMeasurementScheduled || !g_fEEStarted) - { - return; - } - - s_isMeasurementScheduled = true; - FinalizerThread::EnableFinalization(); -} - - -void YieldProcessorNormalization::FireMeasurementEvents() -{ - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; - MODE_ANY; - } - CONTRACTL_END; - - if (!EventEnabledYieldProcessorMeasurement()) - { - return; - } - - // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the - // recorded information, so try to enumerate the array with some care. - double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield); - int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); - for (int i = 0; i < NsPerYieldMeasurementCount; ++i) - { - double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]); - if (nsPerYield != 0) // the array may not be fully initialized yet - { - FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield); - } - - if (++nextIndex >= NsPerYieldMeasurementCount) - { - nextIndex = 0; - } - } -} - -double YieldProcessorNormalization::AtomicLoad(double *valueRef) -{ - WRAPPER_NO_CONTRACT; - -#ifdef TARGET_64BIT - return VolatileLoadWithoutBarrier(valueRef); -#else - return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); -#endif -} - -void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) -{ - WRAPPER_NO_CONTRACT; - -#ifdef TARGET_64BIT - *valueRef = value; -#else - InterlockedExchangeT(valueRef, value); -#endif -} - +#include "yieldprocessornormalizedshared.cpp" \ No newline at end of file diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp new file mode 100644 index 00000000000000..b012d221ef2913 --- /dev/null +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -0,0 +1,330 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +#include "finalizerthread.h" + +enum class NormalizationState : UINT8 +{ + Uninitialized, + Initialized, + Failed +}; + +static const int NsPerYieldMeasurementCount = 8; +static const unsigned int MeasurementPeriodMs = 4000; + +static const unsigned int NsPerS = 1000 * 1000 * 1000; + +static NormalizationState s_normalizationState = NormalizationState::Uninitialized; +static unsigned int s_previousNormalizationTimeMs; + +static UINT64 s_performanceCounterTicksPerS; +static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; +static int s_nextMeasurementIndex; +static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; + +static LARGE_INTEGER li; + +static UINT64 GetPerformanceCounter() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; +#ifndef FEATURE_NATIVEAOT + MODE_PREEMPTIVE; +#endif + } + CONTRACTL_END; + +#ifdef FEATURE_NATIVEAOT + return PalQueryPerformanceCounter(); +#else + QueryPerformanceCounter(&li); + return li.QuadPart; +#endif +} + +static unsigned int DetermineMeasureDurationUs() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; +#ifndef FEATURE_NATIVEAOT + MODE_PREEMPTIVE; +#endif + } + CONTRACTL_END; + + _ASSERTE(s_normalizationState != NormalizationState::Failed); + + // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration + // if the overhead seems high relative to the measure duration. + unsigned int measureDurationUs = 1; + UINT64 startTicks = GetPerformanceCounter(); + UINT64 elapsedTicks = GetPerformanceCounter() - startTicks; + if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration + { + measureDurationUs *= 4; + } + return measureDurationUs; +} + +static double MeasureNsPerYield(unsigned int measureDurationUs) +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; +#ifndef FEATURE_NATIVEAOT + MODE_PREEMPTIVE; +#endif + } + CONTRACTL_END; + + _ASSERTE(s_normalizationState != NormalizationState::Failed); + + int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; + UINT64 ticksPerS = s_performanceCounterTicksPerS; + UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); + + UINT64 startTicks = GetPerformanceCounter(); + + for (int i = 0; i < yieldCount; ++i) + { + System_YieldProcessor(); + } + + UINT64 elapsedTicks = GetPerformanceCounter() - startTicks; + while (elapsedTicks < measureDurationTicks) + { + int nextYieldCount = + Max(4, + elapsedTicks == 0 + ? yieldCount / 4 + : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); + for (int i = 0; i < nextYieldCount; ++i) + { + System_YieldProcessor(); + } + + elapsedTicks = GetPerformanceCounter() - startTicks; + yieldCount += nextYieldCount; + } + + // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op + const double MinNsPerYield = 0.1; + + // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to + // really take this long. Limit the maximum to keep the recorded values reasonable. + const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1; + + return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); +} + +void YieldProcessorNormalization::PerformMeasurement() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; +#ifndef FEATURE_NATIVEAOT + MODE_PREEMPTIVE; +#endif + } + CONTRACTL_END; + + _ASSERTE(s_isMeasurementScheduled); + + double latestNsPerYield; + if (s_normalizationState == NormalizationState::Initialized) + { + if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) + { + return; + } + + int nextMeasurementIndex = s_nextMeasurementIndex; + latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs()); + AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield); + if (++nextMeasurementIndex >= NsPerYieldMeasurementCount) + { + nextMeasurementIndex = 0; + } + s_nextMeasurementIndex = nextMeasurementIndex; + } + else if (s_normalizationState == NormalizationState::Uninitialized) + { +#ifdef FEATURE_NATIVEAOT + if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000) +#else + if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000) +#endif + { + // High precision clock not available or clock resolution is too low, resort to defaults + s_normalizationState = NormalizationState::Failed; + return; + } + +#ifndef FEATURE_NATIVEAOT + s_performanceCounterTicksPerS = li.QuadPart; +#endif + + unsigned int measureDurationUs = DetermineMeasureDurationUs(); + for (int i = 0; i < NsPerYieldMeasurementCount; ++i) + { + latestNsPerYield = MeasureNsPerYield(measureDurationUs); + AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield); + if (i == 0 || latestNsPerYield < s_establishedNsPerYield) + { + AtomicStore(&s_establishedNsPerYield, latestNsPerYield); + } +#ifndef FEATURE_NATIVEAOT + if (i < NsPerYieldMeasurementCount - 1) + { + FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); + } +#endif + } + } + else + { + _ASSERTE(s_normalizationState == NormalizationState::Failed); + return; + } + + double establishedNsPerYield = s_nsPerYieldMeasurements[0]; + for (int i = 1; i < NsPerYieldMeasurementCount; ++i) + { + double nsPerYield = s_nsPerYieldMeasurements[i]; + if (nsPerYield < establishedNsPerYield) + { + establishedNsPerYield = nsPerYield; + } + } + if (establishedNsPerYield != s_establishedNsPerYield) + { + AtomicStore(&s_establishedNsPerYield, establishedNsPerYield); + } + +#ifndef FEATURE_NATIVEAOT + FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); +#endif + + // Calculate the number of yields required to span the duration of a normalized yield + unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); + _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield); + s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; + + // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to + // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a + // better job of allowing other work to run. + s_optimalMaxNormalizedYieldsPerSpinIteration = + Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); + _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); + + GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); + + s_previousNormalizationTimeMs = GetTickCount(); + s_normalizationState = NormalizationState::Initialized; + s_isMeasurementScheduled = false; +} + + +void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + } + CONTRACTL_END; + + NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState); + if (normalizationState == NormalizationState::Initialized) + { + if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) + { + return; + } + } + else if (normalizationState == NormalizationState::Uninitialized) + { + } + else + { + _ASSERTE(normalizationState == NormalizationState::Failed); + return; + } + + // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below + if (s_isMeasurementScheduled || !g_fEEStarted) + { + return; + } + + s_isMeasurementScheduled = true; + FinalizerThread::EnableFinalization(); +} + +// EventEnabledYieldProcessorMeasurement and FireEtwYieldProcessorMeasurement aren't available for AOT +#ifndef FEATURE_NATIVEAOT +void YieldProcessorNormalization::FireMeasurementEvents() +{ + CONTRACTL + { + NOTHROW; + GC_NOTRIGGER; + MODE_ANY; + } + CONTRACTL_END; + + if (!EventEnabledYieldProcessorMeasurement()) + { + return; + } + + // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the + // recorded information, so try to enumerate the array with some care. + double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield); + int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); + for (int i = 0; i < NsPerYieldMeasurementCount; ++i) + { + double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]); + if (nsPerYield != 0) // the array may not be fully initialized yet + { + FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield); + } + + if (++nextIndex >= NsPerYieldMeasurementCount) + { + nextIndex = 0; + } + } +} +#endif + +double YieldProcessorNormalization::AtomicLoad(double *valueRef) +{ + WRAPPER_NO_CONTRACT; + +#ifdef TARGET_64BIT + return VolatileLoadWithoutBarrier(valueRef); +#else + return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); +#endif +} + +void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) +{ + WRAPPER_NO_CONTRACT; + +#ifdef TARGET_64BIT + *valueRef = value; +#else + InterlockedExchangeT(valueRef, value); +#endif +} + From 9237b9ffd627b5dda34fc0f36421100f339f2cb2 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 19 Jun 2024 14:08:13 -0700 Subject: [PATCH 02/23] Use PalGetTickCount64 --- src/coreclr/inc/yieldprocessornormalized.h | 4 ---- .../nativeaot/Runtime/yieldprocessornormalized.h | 1 + src/coreclr/vm/yieldprocessornormalizedshared.cpp | 15 ++++++++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index c8566e95749153..3b60b3ea2dcccf 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -26,10 +26,6 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } #ifdef FEATURE_NATIVEAOT #define static_assert_no_msg( cond ) static_assert( cond, #cond ) -// I haven't seen an equivalent for GetTickCount yet, defining it like this for now -unsigned int GetTickCount() { - return 0; -} #define SIZE_T uintptr_t // verify these are correct typedef BYTE UINT8; diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h index 6f29c02308df05..5539ebf90561bc 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h @@ -1,4 +1,5 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#include "PalRedhawk.h" #include "../../inc/yieldprocessornormalized.h" diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index b012d221ef2913..226439411e34cb 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -25,6 +25,15 @@ static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPer static LARGE_INTEGER li; +inline unsigned int GetTickCountPortable() +{ +#ifdef FEATURE_NATIVEAOT + return (unsigned int)PalGetTickCount64(); +#else + return GetTickCount(); +#endif +} + static UINT64 GetPerformanceCounter() { CONTRACTL @@ -140,7 +149,7 @@ void YieldProcessorNormalization::PerformMeasurement() double latestNsPerYield; if (s_normalizationState == NormalizationState::Initialized) { - if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) + if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs) { return; } @@ -226,7 +235,7 @@ void YieldProcessorNormalization::PerformMeasurement() GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); - s_previousNormalizationTimeMs = GetTickCount(); + s_previousNormalizationTimeMs = GetTickCountPortable(); s_normalizationState = NormalizationState::Initialized; s_isMeasurementScheduled = false; } @@ -245,7 +254,7 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState); if (normalizationState == NormalizationState::Initialized) { - if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs) + if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs) { return; } From 830d8d091b91ad035b126c4fa43aa7e38e7a2f8d Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 19 Jun 2024 14:52:29 -0700 Subject: [PATCH 03/23] Add limits.h --- src/coreclr/inc/yieldprocessornormalized.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 3b60b3ea2dcccf..4ee1c6170961f9 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -3,6 +3,9 @@ #pragma once +// not sure if necessary +#include + // Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where // the intention is to use the system-default implementation of YieldProcessor(). #define HAS_SYSTEM_YIELDPROCESSOR @@ -15,6 +18,10 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } #undef YieldProcessor #endif #define YieldProcessor Dont_Use_YieldProcessor +#ifdef PalYieldProcessor +#undef PalYieldProcessor +#endif +#define PalYieldProcessor Dont_Use_PalYieldProcessor #define DISABLE_COPY(T) \ T(const T &) = delete; \ From d0a884c45521da997067b8faa25423fc4e15ad9a Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 19 Jun 2024 15:31:36 -0700 Subject: [PATCH 04/23] Declare g_pFinalizerThread for Windows only --- src/coreclr/vm/finalizerthread.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h index 3fb9822530a776..c70d578557e575 100644 --- a/src/coreclr/vm/finalizerthread.h +++ b/src/coreclr/vm/finalizerthread.h @@ -7,7 +7,9 @@ #ifdef FEATURE_NATIVEAOT typedef void VOID; +#ifdef TARGET_WINDOWS GPTR_IMPL(Thread, g_pFinalizerThread); +#endif // Global state variable indicating if the EE has been started up. Volatile g_fEEStarted = FALSE; #endif From 165bbb9d7c58e3cae4ec279e16821349788dea43 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 19 Jun 2024 16:19:40 -0700 Subject: [PATCH 05/23] PR comments --- src/coreclr/inc/yieldprocessornormalized.h | 8 -------- src/coreclr/vm/finalizerthread.h | 12 +++++++----- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 4 ++++ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 4ee1c6170961f9..0233146eb096aa 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -14,14 +14,6 @@ FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } #else FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } #endif -#ifdef YieldProcessor -#undef YieldProcessor -#endif -#define YieldProcessor Dont_Use_YieldProcessor -#ifdef PalYieldProcessor -#undef PalYieldProcessor -#endif -#define PalYieldProcessor Dont_Use_PalYieldProcessor #define DISABLE_COPY(T) \ T(const T &) = delete; \ diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h index c70d578557e575..bda1e5e1a758b9 100644 --- a/src/coreclr/vm/finalizerthread.h +++ b/src/coreclr/vm/finalizerthread.h @@ -7,11 +7,11 @@ #ifdef FEATURE_NATIVEAOT typedef void VOID; -#ifdef TARGET_WINDOWS -GPTR_IMPL(Thread, g_pFinalizerThread); -#endif -// Global state variable indicating if the EE has been started up. -Volatile g_fEEStarted = FALSE; + +// #ifdef TARGET_WINDOWS +// GPTR_IMPL(Thread, g_pFinalizerThread); +// #endif + #endif class FinalizerThread @@ -44,12 +44,14 @@ class FinalizerThread static void FinalizeAllObjects(); public: +#ifndef FEATURE_NATIVEAOT static Thread* GetFinalizerThread() { LIMITED_METHOD_CONTRACT; _ASSERTE(g_pFinalizerThread != 0); return g_pFinalizerThread; } +#endif static BOOL IsCurrentThreadFinalizer(); diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 226439411e34cb..f898ff0ea18b23 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -268,8 +268,12 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() return; } +#ifdef FEATURE_NATIVEAOT + if (s_isMeasurementScheduled) +#else // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below if (s_isMeasurementScheduled || !g_fEEStarted) +#endif { return; } From b089bac2619c69fbf3bc524b1a74e97ff52cb748 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 19 Jun 2024 17:37:44 -0700 Subject: [PATCH 06/23] Fix build/x86 --- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index f898ff0ea18b23..ec0be79165042b 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -325,9 +325,14 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef) #ifdef TARGET_64BIT return VolatileLoadWithoutBarrier(valueRef); +#else +#ifdef FEATURE_NATIVEAOT + // return *PalInterlockedCompareExchange(valueRef, 0.0, 0.0); + return *valueRef; // TODO: fix this #else return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); #endif +#endif } void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) @@ -336,8 +341,12 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) #ifdef TARGET_64BIT *valueRef = value; +#else +#ifdef FEATURE_NATIVEAOT + PalInterlockedExchangePointer(valueRef, value); // TODO: verify it works or fix it #else InterlockedExchangeT(valueRef, value); #endif +#endif } From f4ed8e8f2332ef1ab1171e5e16c7a151e22e4654 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Fri, 21 Jun 2024 12:38:19 -0700 Subject: [PATCH 07/23] Remove finalizer thread from native aot --- src/coreclr/inc/yieldprocessornormalized.h | 2 -- src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp | 3 --- src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 3 +++ src/coreclr/nativeaot/Runtime/startup.cpp | 2 -- src/coreclr/utilcode/yieldprocessornormalized.cpp | 4 +++- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 14 ++++---------- 6 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 0233146eb096aa..16978adbace6b8 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -44,8 +44,6 @@ T Max(T v1, T v2) return v1 > v2 ? v1 : v2; } -void InitializeYieldProcessorNormalizedCrst(); -void EnsureYieldProcessorNormalizedInitialized(); #endif class YieldProcessorNormalization diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index 7a4d8a853a8b74..0c6454407073e7 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -46,9 +46,6 @@ uint32_t WINAPI FinalizerStart(void* pContext) g_pFinalizerThread = PTR_Thread(pThread); - // We have some time until the first finalization request - use the time to calibrate normalized waits. - EnsureYieldProcessorNormalizedInitialized(); - // Wait for a finalization request. uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE); ASSERT(uResult == WAIT_OBJECT_0); diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp index c5bbcc22842776..7927de44195341 100644 --- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp @@ -37,6 +37,9 @@ #include "RhConfig.h" #include +// I believe there is a better way to use this file but doing this for now +#include "../../utilcode/yieldprocessornormalized.cpp" + FCIMPL0(void, RhDebugBreak) { PalDebugBreak(); diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp index f87bc947d970ac..947d8fac05a205 100644 --- a/src/coreclr/nativeaot/Runtime/startup.cpp +++ b/src/coreclr/nativeaot/Runtime/startup.cpp @@ -130,8 +130,6 @@ static bool InitDLL(HANDLE hPalInstance) #endif #endif // !USE_PORTABLE_HELPERS - InitializeYieldProcessorNormalizedCrst(); - #ifdef STRESS_LOG uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize(); uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel(); diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp index 020d8d7cc79e4e..4b5a5537e49f79 100644 --- a/src/coreclr/utilcode/yieldprocessornormalized.cpp +++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp @@ -1,7 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#ifndef FEATURE_NATIVEAOT #include "stdafx.h" +#endif #include "yieldprocessornormalized.h" bool YieldProcessorNormalization::s_isMeasurementScheduled; @@ -14,4 +16,4 @@ unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIte (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration / YieldProcessorNormalization::TargetNsPerNormalizedYield + 0.5 - ); + ); \ No newline at end of file diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index ec0be79165042b..ccbddd1d24b5dd 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -1,7 +1,9 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +#ifndef FEATURE_NATIVEAOT #include "finalizerthread.h" +#endif enum class NormalizationState : UINT8 { @@ -36,16 +38,6 @@ inline unsigned int GetTickCountPortable() static UINT64 GetPerformanceCounter() { - CONTRACTL - { - NOTHROW; - GC_NOTRIGGER; -#ifndef FEATURE_NATIVEAOT - MODE_PREEMPTIVE; -#endif - } - CONTRACTL_END; - #ifdef FEATURE_NATIVEAOT return PalQueryPerformanceCounter(); #else @@ -279,7 +271,9 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() } s_isMeasurementScheduled = true; +#ifndef FEATURE_NATIVEAOT FinalizerThread::EnableFinalization(); +#endif } // EventEnabledYieldProcessorMeasurement and FireEtwYieldProcessorMeasurement aren't available for AOT From b64656829fa81d845acbc9109bcf36d6e2c8e815 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Fri, 21 Jun 2024 15:06:15 -0700 Subject: [PATCH 08/23] Remove unnecessary code --- src/coreclr/inc/yieldprocessornormalized.h | 16 ++++------------ .../utilcode/yieldprocessornormalized.cpp | 2 +- src/coreclr/vm/finalizerthread.h | 13 ------------- 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 16978adbace6b8..532f167521dca6 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -3,11 +3,6 @@ #pragma once -// not sure if necessary -#include - -// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where -// the intention is to use the system-default implementation of YieldProcessor(). #define HAS_SYSTEM_YIELDPROCESSOR #ifdef FEATURE_NATIVEAOT FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } @@ -24,9 +19,7 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } DISABLE_COPY(T) #ifdef FEATURE_NATIVEAOT -#define static_assert_no_msg( cond ) static_assert( cond, #cond ) #define SIZE_T uintptr_t -// verify these are correct typedef BYTE UINT8; typedef ULONGLONG UINT64; @@ -43,7 +36,6 @@ T Max(T v1, T v2) // STATIC_CONTRACT_LEAF; return v1 > v2 ? v1 : v2; } - #endif class YieldProcessorNormalization @@ -295,10 +287,10 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized( { // This shift value should be adjusted based on the asserted conditions below const UINT8 MaxShift = 3; - static_assert_no_msg( - ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration); - static_assert_no_msg( - ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration); + static_assert( + ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); + static_assert( + ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); unsigned int n; if (spinIteration <= MaxShift && diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp index 4b5a5537e49f79..92d6ef943c9d7f 100644 --- a/src/coreclr/utilcode/yieldprocessornormalized.cpp +++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp @@ -16,4 +16,4 @@ unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIte (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration / YieldProcessorNormalization::TargetNsPerNormalizedYield + 0.5 - ); \ No newline at end of file + ); diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h index bda1e5e1a758b9..5ea3cca02bebfe 100644 --- a/src/coreclr/vm/finalizerthread.h +++ b/src/coreclr/vm/finalizerthread.h @@ -5,15 +5,6 @@ #ifndef _FINALIZER_THREAD_H_ #define _FINALIZER_THREAD_H_ -#ifdef FEATURE_NATIVEAOT -typedef void VOID; - -// #ifdef TARGET_WINDOWS -// GPTR_IMPL(Thread, g_pFinalizerThread); -// #endif - -#endif - class FinalizerThread { static BOOL fQuitFinalizer; @@ -44,14 +35,12 @@ class FinalizerThread static void FinalizeAllObjects(); public: -#ifndef FEATURE_NATIVEAOT static Thread* GetFinalizerThread() { LIMITED_METHOD_CONTRACT; _ASSERTE(g_pFinalizerThread != 0); return g_pFinalizerThread; } -#endif static BOOL IsCurrentThreadFinalizer(); @@ -61,7 +50,6 @@ class FinalizerThread static OBJECTREF GetNextFinalizableObject(); -#ifndef FEATURE_NATIVEAOT static void RaiseShutdownEvents() { WRAPPER_NO_CONTRACT; @@ -76,7 +64,6 @@ class FinalizerThread hEventFinalizerToShutDown->Wait(INFINITE, /*alertable*/ TRUE); } } -#endif static void FinalizerThreadWait(DWORD timeout = INFINITE); From 412715876eda72280ef9933b9446fd80233e4806 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Fri, 21 Jun 2024 19:23:28 -0700 Subject: [PATCH 09/23] PR comments + Fix InterlockedExchange --- src/coreclr/gc/env/gcenv.os.h | 6 ----- src/coreclr/inc/yieldprocessornormalized.h | 27 ++++++++----------- .../Runtime/yieldprocessornormalized.cpp | 1 - .../vm/yieldprocessornormalizedshared.cpp | 23 ++++++++-------- 4 files changed, 22 insertions(+), 35 deletions(-) diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h index 75015b77262dd9..17640543bd3814 100644 --- a/src/coreclr/gc/env/gcenv.os.h +++ b/src/coreclr/gc/env/gcenv.os.h @@ -6,12 +6,6 @@ #ifndef __GCENV_OS_H__ #define __GCENV_OS_H__ -#ifdef HAS_SYSTEM_YIELDPROCESSOR -// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC. -#undef YieldProcessor -#define YieldProcessor System_YieldProcessor -#endif - #define NUMA_NODE_UNDEFINED UINT16_MAX bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index); diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 532f167521dca6..42db5467f2344a 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -3,7 +3,6 @@ #pragma once -#define HAS_SYSTEM_YIELDPROCESSOR #ifdef FEATURE_NATIVEAOT FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } #else @@ -19,10 +18,6 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } DISABLE_COPY(T) #ifdef FEATURE_NATIVEAOT -#define SIZE_T uintptr_t -typedef BYTE UINT8; -typedef ULONGLONG UINT64; - template T Min(T v1, T v2) { @@ -162,9 +157,9 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo { _ASSERTE(count != 0); - if (sizeof(SIZE_T) <= sizeof(unsigned int)) + if (sizeof(size_t) <= sizeof(unsigned int)) { - // On platforms with a small SIZE_T, prevent overflow on the multiply below + // On platforms with a small size_t, prevent overflow on the multiply below const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield; if (count > MaxCount) { @@ -172,7 +167,7 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo } } - SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; + size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield; _ASSERTE(n != 0); do { @@ -207,9 +202,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount( { _ASSERTE(preSkylakeCount != 0); - if (sizeof(SIZE_T) <= sizeof(unsigned int)) + if (sizeof(size_t) <= sizeof(unsigned int)) { - // On platforms with a small SIZE_T, prevent overflow on the multiply below + // On platforms with a small size_t, prevent overflow on the multiply below const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield; if (preSkylakeCount > MaxCount) { @@ -218,7 +213,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount( } const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; - SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; + size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; if (n == 0) { n = 1; @@ -245,9 +240,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl _ASSERTE(preSkylakeCount != 0); - if (sizeof(SIZE_T) <= sizeof(unsigned int)) + if (sizeof(size_t) <= sizeof(unsigned int)) { - // On platforms with a small SIZE_T, prevent overflow on the multiply below + // On platforms with a small size_t, prevent overflow on the multiply below const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield; if (preSkylakeCount > MaxCount) { @@ -256,8 +251,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl } const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8; - SIZE_T n = - (SIZE_T)preSkylakeCount * + size_t n = + (size_t)preSkylakeCount * YieldProcessorNormalization::s_yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; if (n == 0) @@ -286,7 +281,7 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized( unsigned int spinIteration) { // This shift value should be adjusted based on the asserted conditions below - const UINT8 MaxShift = 3; + const uint8_t MaxShift = 3; static_assert( ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); static_assert( diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp index a9eb72a147d6d0..2e8a7b8a8ad461 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp @@ -14,6 +14,5 @@ #include "slist.h" #include "volatile.h" #include "yieldprocessornormalized.h" -#include "../../vm/synch.h" #include "../../vm/yieldprocessornormalizedshared.cpp" diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index ccbddd1d24b5dd..4d1ded67833760 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -5,7 +5,7 @@ #include "finalizerthread.h" #endif -enum class NormalizationState : UINT8 +enum class NormalizationState : uint8_t { Uninitialized, Initialized, @@ -20,7 +20,7 @@ static const unsigned int NsPerS = 1000 * 1000 * 1000; static NormalizationState s_normalizationState = NormalizationState::Uninitialized; static unsigned int s_previousNormalizationTimeMs; -static UINT64 s_performanceCounterTicksPerS; +static uint64_t s_performanceCounterTicksPerS; static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; static int s_nextMeasurementIndex; static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; @@ -36,7 +36,7 @@ inline unsigned int GetTickCountPortable() #endif } -static UINT64 GetPerformanceCounter() +static uint64_t GetPerformanceCounter() { #ifdef FEATURE_NATIVEAOT return PalQueryPerformanceCounter(); @@ -63,8 +63,8 @@ static unsigned int DetermineMeasureDurationUs() // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration // if the overhead seems high relative to the measure duration. unsigned int measureDurationUs = 1; - UINT64 startTicks = GetPerformanceCounter(); - UINT64 elapsedTicks = GetPerformanceCounter() - startTicks; + uint64_t startTicks = GetPerformanceCounter(); + uint64_t elapsedTicks = GetPerformanceCounter() - startTicks; if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration { measureDurationUs *= 4; @@ -87,17 +87,17 @@ static double MeasureNsPerYield(unsigned int measureDurationUs) _ASSERTE(s_normalizationState != NormalizationState::Failed); int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; - UINT64 ticksPerS = s_performanceCounterTicksPerS; - UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); + uint64_t ticksPerS = s_performanceCounterTicksPerS; + uint64_t measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); - UINT64 startTicks = GetPerformanceCounter(); + uint64_t startTicks = GetPerformanceCounter(); for (int i = 0; i < yieldCount; ++i) { System_YieldProcessor(); } - UINT64 elapsedTicks = GetPerformanceCounter() - startTicks; + uint64_t elapsedTicks = GetPerformanceCounter() - startTicks; while (elapsedTicks < measureDurationTicks) { int nextYieldCount = @@ -321,8 +321,7 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef) return VolatileLoadWithoutBarrier(valueRef); #else #ifdef FEATURE_NATIVEAOT - // return *PalInterlockedCompareExchange(valueRef, 0.0, 0.0); - return *valueRef; // TODO: fix this + return *PalInterlockedCompareExchange64((int64_t *)valueRef, 0, 0); #else return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); #endif @@ -337,7 +336,7 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) *valueRef = value; #else #ifdef FEATURE_NATIVEAOT - PalInterlockedExchangePointer(valueRef, value); // TODO: verify it works or fix it + PalInterlockedExchange64((int64_t *) valueRef, value); #else InterlockedExchangeT(valueRef, value); #endif From a86278282dc64dfcda5d93e955e1ace15f8d1e2e Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Mon, 24 Jun 2024 13:39:39 -0700 Subject: [PATCH 10/23] Add TODOs --- src/coreclr/inc/yieldprocessornormalized.h | 2 ++ src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 2 +- src/coreclr/vm/yieldprocessornormalized.cpp | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index 42db5467f2344a..b17659c983d729 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -21,6 +21,7 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } template T Min(T v1, T v2) { + // TODO: check it's safe to remove STATIC_CONTRACT_LEAF // STATIC_CONTRACT_LEAF; return v1 < v2 ? v1 : v2; } @@ -28,6 +29,7 @@ T Min(T v1, T v2) template T Max(T v1, T v2) { + // TODO: check it's safe to remove STATIC_CONTRACT_LEAF // STATIC_CONTRACT_LEAF; return v1 > v2 ? v1 : v2; } diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp index 7927de44195341..6dd20382c06dbd 100644 --- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp @@ -37,7 +37,7 @@ #include "RhConfig.h" #include -// I believe there is a better way to use this file but doing this for now +// TODO: Check if there's a better way to use this file #include "../../utilcode/yieldprocessornormalized.cpp" FCIMPL0(void, RhDebugBreak) diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp index 9b4c685be25460..258e30d634c7ce 100644 --- a/src/coreclr/vm/yieldprocessornormalized.cpp +++ b/src/coreclr/vm/yieldprocessornormalized.cpp @@ -7,4 +7,4 @@ #include "finalizerthread.h" -#include "yieldprocessornormalizedshared.cpp" \ No newline at end of file +#include "yieldprocessornormalizedshared.cpp" From 73d3d71b888869fbb85287c28eefb9a1db256b0d Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Mon, 24 Jun 2024 22:49:07 -0700 Subject: [PATCH 11/23] Use max/min and RhEnableFinalization --- src/coreclr/inc/yieldprocessornormalized.h | 18 ------------------ .../vm/yieldprocessornormalizedshared.cpp | 14 +++++++++----- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h index b17659c983d729..e37bf79f0c5089 100644 --- a/src/coreclr/inc/yieldprocessornormalized.h +++ b/src/coreclr/inc/yieldprocessornormalized.h @@ -17,24 +17,6 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); } T() = delete; \ DISABLE_COPY(T) -#ifdef FEATURE_NATIVEAOT -template -T Min(T v1, T v2) -{ - // TODO: check it's safe to remove STATIC_CONTRACT_LEAF - // STATIC_CONTRACT_LEAF; - return v1 < v2 ? v1 : v2; -} - -template -T Max(T v1, T v2) -{ - // TODO: check it's safe to remove STATIC_CONTRACT_LEAF - // STATIC_CONTRACT_LEAF; - return v1 > v2 ? v1 : v2; -} -#endif - class YieldProcessorNormalization { public: diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 4d1ded67833760..7a1a6665a98852 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -27,6 +27,8 @@ static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPer static LARGE_INTEGER li; +void RhEnableFinalization(); + inline unsigned int GetTickCountPortable() { #ifdef FEATURE_NATIVEAOT @@ -101,7 +103,7 @@ static double MeasureNsPerYield(unsigned int measureDurationUs) while (elapsedTicks < measureDurationTicks) { int nextYieldCount = - Max(4, + max(4, elapsedTicks == 0 ? yieldCount / 4 : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); @@ -121,7 +123,7 @@ static double MeasureNsPerYield(unsigned int measureDurationUs) // really take this long. Limit the maximum to keep the recorded values reasonable. const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1; - return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); + return max(MinNsPerYield, min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); } void YieldProcessorNormalization::PerformMeasurement() @@ -214,7 +216,7 @@ void YieldProcessorNormalization::PerformMeasurement() #endif // Calculate the number of yields required to span the duration of a normalized yield - unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); + unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield); s_yieldsPerNormalizedYield = yieldsPerNormalizedYield; @@ -222,7 +224,7 @@ void YieldProcessorNormalization::PerformMeasurement() // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a // better job of allowing other work to run. s_optimalMaxNormalizedYieldsPerSpinIteration = - Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); + max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration); GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); @@ -271,7 +273,9 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() } s_isMeasurementScheduled = true; -#ifndef FEATURE_NATIVEAOT +#ifdef FEATURE_NATIVEAOT + RhEnableFinalization(); +#else FinalizerThread::EnableFinalization(); #endif } From 0d226dab0218e3760da497a216fcf693a7d8fbea Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Mon, 24 Jun 2024 22:53:49 -0700 Subject: [PATCH 12/23] Remove TODO --- src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp index 6dd20382c06dbd..f6a5d809db9e03 100644 --- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp @@ -37,7 +37,6 @@ #include "RhConfig.h" #include -// TODO: Check if there's a better way to use this file #include "../../utilcode/yieldprocessornormalized.cpp" FCIMPL0(void, RhDebugBreak) From 4c315c81cbcde9c3d8a290118fce5ef2001bc590 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 25 Jun 2024 14:10:24 -0700 Subject: [PATCH 13/23] Fix Interlocked calls --- src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h | 7 +++++++ src/coreclr/vm/yieldprocessornormalizedshared.cpp | 7 +++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h index 187ad26fb8bf11..766feab6e1d8ab 100644 --- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h +++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h @@ -42,6 +42,13 @@ FORCEINLINE int32_t PalInterlockedExchange(_Inout_ int32_t volatile *pDst, int32 return _InterlockedExchange((long volatile *)pDst, iValue); } +EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t); +#pragma intrinsic(_InterlockedExchange) +FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue) +{ + return _InterlockedExchange64(pDst, iValue); +} + EXTERN_C long __PN__MACHINECALL_CDECL_OR_DEFAULT _InterlockedCompareExchange(long volatile *, long, long); #pragma intrinsic(_InterlockedCompareExchange) FORCEINLINE int32_t PalInterlockedCompareExchange(_Inout_ int32_t volatile *pDst, int32_t iValue, int32_t iComparand) diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 7a1a6665a98852..1d4406e6cbba41 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -325,7 +325,9 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef) return VolatileLoadWithoutBarrier(valueRef); #else #ifdef FEATURE_NATIVEAOT - return *PalInterlockedCompareExchange64((int64_t *)valueRef, 0, 0); + static_assert(sizeof(int64_t) == sizeof(double)); + int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0); + return *(double*)(int64_t*)(&intRes); #else return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); #endif @@ -340,7 +342,8 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) *valueRef = value; #else #ifdef FEATURE_NATIVEAOT - PalInterlockedExchange64((int64_t *) valueRef, value); + static_assert(sizeof(int64_t) == sizeof(double)); + PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value); #else InterlockedExchangeT(valueRef, value); #endif From 17dac7e4a9e422b88c7a119d3825b96ab769cbf6 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 25 Jun 2024 21:01:15 -0700 Subject: [PATCH 14/23] Fix static_assert --- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 1d4406e6cbba41..7e2a6caf548d30 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -325,7 +325,7 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef) return VolatileLoadWithoutBarrier(valueRef); #else #ifdef FEATURE_NATIVEAOT - static_assert(sizeof(int64_t) == sizeof(double)); + static_assert(sizeof(int64_t) == sizeof(double), ""); int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0); return *(double*)(int64_t*)(&intRes); #else @@ -342,7 +342,7 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value) *valueRef = value; #else #ifdef FEATURE_NATIVEAOT - static_assert(sizeof(int64_t) == sizeof(double)); + static_assert(sizeof(int64_t) == sizeof(double), ""); PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value); #else InterlockedExchangeT(valueRef, value); From e09fa54dd976b2e63fed36dee54026e7b328d403 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 26 Jun 2024 13:26:54 -0700 Subject: [PATCH 15/23] Add PerformMeasurement call --- src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index 0c6454407073e7..d05b880fe9b730 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -55,6 +55,11 @@ uint32_t WINAPI FinalizerStart(void* pContext) UInt32_BOOL fResult = PalSetEvent(hFinalizerEvent); ASSERT(fResult); + if (YieldProcessorNormalization::IsMeasurementScheduled()) + { + YieldProcessorNormalization::PerformMeasurement(); + } + // Run the managed portion of the finalizer. This call will never return. ProcessFinalizers(); From 73f3fef2ce0e03c8cec24bc227b20d02bd957ed5 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 26 Jun 2024 15:17:00 -0700 Subject: [PATCH 16/23] Add YieldProcessorMeasurement --- .../nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst | 1 + src/coreclr/vm/yieldprocessornormalizedshared.cpp | 5 ----- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst index 901af659ff84b6..0f4c932719a399 100644 --- a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst +++ b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst @@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount ThreadRunning WaitHandleWaitStart WaitHandleWaitStop +YieldProcessorMeasurement diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 7e2a6caf548d30..8aebd1959bd8cc 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -211,9 +211,7 @@ void YieldProcessorNormalization::PerformMeasurement() AtomicStore(&s_establishedNsPerYield, establishedNsPerYield); } -#ifndef FEATURE_NATIVEAOT FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); -#endif // Calculate the number of yields required to span the duration of a normalized yield unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); @@ -280,8 +278,6 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary() #endif } -// EventEnabledYieldProcessorMeasurement and FireEtwYieldProcessorMeasurement aren't available for AOT -#ifndef FEATURE_NATIVEAOT void YieldProcessorNormalization::FireMeasurementEvents() { CONTRACTL @@ -315,7 +311,6 @@ void YieldProcessorNormalization::FireMeasurementEvents() } } } -#endif double YieldProcessorNormalization::AtomicLoad(double *valueRef) { From e07035e60840a0b5ec38cc5d84fd1fa28c0ac815 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Wed, 26 Jun 2024 17:02:50 -0700 Subject: [PATCH 17/23] Nit --- src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h | 2 +- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h index 766feab6e1d8ab..1fb290e41dcf29 100644 --- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h +++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h @@ -43,7 +43,7 @@ FORCEINLINE int32_t PalInterlockedExchange(_Inout_ int32_t volatile *pDst, int32 } EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t); -#pragma intrinsic(_InterlockedExchange) +#pragma intrinsic(_InterlockedExchange64) FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue) { return _InterlockedExchange64(pDst, iValue); diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 8aebd1959bd8cc..0d6464bbe43d42 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -183,12 +183,10 @@ void YieldProcessorNormalization::PerformMeasurement() { AtomicStore(&s_establishedNsPerYield, latestNsPerYield); } -#ifndef FEATURE_NATIVEAOT if (i < NsPerYieldMeasurementCount - 1) { FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield); } -#endif } } else From 9a76a1825731a78bf729457a33219565b86f9c45 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Thu, 27 Jun 2024 12:53:46 -0700 Subject: [PATCH 18/23] Fix PerformMeasurement --- .../nativeaot/Runtime/FinalizerHelpers.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index d05b880fe9b730..ecafa5e4c75523 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -46,6 +46,11 @@ uint32_t WINAPI FinalizerStart(void* pContext) g_pFinalizerThread = PTR_Thread(pThread); + if (YieldProcessorNormalization::IsMeasurementScheduled()) + { + YieldProcessorNormalization::PerformMeasurement(); + } + // Wait for a finalization request. uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE); ASSERT(uResult == WAIT_OBJECT_0); @@ -55,11 +60,6 @@ uint32_t WINAPI FinalizerStart(void* pContext) UInt32_BOOL fResult = PalSetEvent(hFinalizerEvent); ASSERT(fResult); - if (YieldProcessorNormalization::IsMeasurementScheduled()) - { - YieldProcessorNormalization::PerformMeasurement(); - } - // Run the managed portion of the finalizer. This call will never return. ProcessFinalizers(); @@ -186,6 +186,11 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest() // Indicate that the current round of finalizations is complete. EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount) { + if (YieldProcessorNormalization::IsMeasurementScheduled()) + { + YieldProcessorNormalization::PerformMeasurement(); + } + FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId()); g_FinalizerDoneEvent.Set(); } From 6519b0b3e8a9636bfba54d755c9d3a56b8278153 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Thu, 27 Jun 2024 13:19:44 -0700 Subject: [PATCH 19/23] Move PerformMeasurement --- src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index ecafa5e4c75523..cf289143afbc91 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -186,13 +186,13 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest() // Indicate that the current round of finalizations is complete. EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount) { + FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId()); + g_FinalizerDoneEvent.Set(); + if (YieldProcessorNormalization::IsMeasurementScheduled()) { YieldProcessorNormalization::PerformMeasurement(); } - - FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId()); - g_FinalizerDoneEvent.Set(); } // From 9606eb93142b9c3dc0dbd60033f0b146078adb94 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 2 Jul 2024 14:40:44 -0700 Subject: [PATCH 20/23] Fix PalInterlockedExchange64 --- .../Runtime/windows/PalRedhawkInline.h | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h index 1fb290e41dcf29..0f05e6143b3a48 100644 --- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h +++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h @@ -42,13 +42,6 @@ FORCEINLINE int32_t PalInterlockedExchange(_Inout_ int32_t volatile *pDst, int32 return _InterlockedExchange((long volatile *)pDst, iValue); } -EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t); -#pragma intrinsic(_InterlockedExchange64) -FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue) -{ - return _InterlockedExchange64(pDst, iValue); -} - EXTERN_C long __PN__MACHINECALL_CDECL_OR_DEFAULT _InterlockedCompareExchange(long volatile *, long, long); #pragma intrinsic(_InterlockedCompareExchange) FORCEINLINE int32_t PalInterlockedCompareExchange(_Inout_ int32_t volatile *pDst, int32_t iValue, int32_t iComparand) @@ -63,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD return _InterlockedCompareExchange64(pDst, iValue, iComparand); } +#ifdef HOST_X86 +FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue) +{ + int64_t iOldValue; + do { + iOldValue = *pDst; + } while (PalInterlockedCompareExchange64(Target, + iValue, + iOldValue) != iOldValue); + return iOldValue; +} +#else // HOST_X86 +EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t); +#pragma intrinsic(_InterlockedExchange64) +FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue) +{ + return _InterlockedExchange64(pDst, iValue); +} +#endif // HOST_X86 + #if defined(HOST_AMD64) || defined(HOST_ARM64) EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *); #pragma intrinsic(_InterlockedCompareExchange128) From 234d61bace73373cc0288b5aee4f032a534314f5 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 2 Jul 2024 15:42:20 -0700 Subject: [PATCH 21/23] PR comments --- src/coreclr/nativeaot/Runtime/Crst.h | 1 - src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp | 5 ----- src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 2 -- src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp | 2 ++ src/coreclr/utilcode/yieldprocessornormalized.cpp | 3 --- src/coreclr/vm/synch.h | 2 -- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 7 +------ 7 files changed, 3 insertions(+), 19 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h index 31bf8fde9eec8a..4ab9db08e0f5e3 100644 --- a/src/coreclr/nativeaot/Runtime/Crst.h +++ b/src/coreclr/nativeaot/Runtime/Crst.h @@ -20,7 +20,6 @@ enum CrstType CrstRestrictedCallouts, CrstGcStressControl, CrstThreadStore, - CrstYieldProcessorNormalized, CrstEventPipe, CrstEventPipeConfig, CrstGcEvent, diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp index cf289143afbc91..3af6a3fbf21751 100644 --- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp @@ -46,11 +46,6 @@ uint32_t WINAPI FinalizerStart(void* pContext) g_pFinalizerThread = PTR_Thread(pThread); - if (YieldProcessorNormalization::IsMeasurementScheduled()) - { - YieldProcessorNormalization::PerformMeasurement(); - } - // Wait for a finalization request. uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE); ASSERT(uResult == WAIT_OBJECT_0); diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp index f6a5d809db9e03..c5bbcc22842776 100644 --- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp @@ -37,8 +37,6 @@ #include "RhConfig.h" #include -#include "../../utilcode/yieldprocessornormalized.cpp" - FCIMPL0(void, RhDebugBreak) { PalDebugBreak(); diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp index 2e8a7b8a8ad461..efaf4e8bb20704 100644 --- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp +++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp @@ -15,4 +15,6 @@ #include "volatile.h" #include "yieldprocessornormalized.h" +#include "../../utilcode/yieldprocessornormalized.cpp" + #include "../../vm/yieldprocessornormalizedshared.cpp" diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp index 92d6ef943c9d7f..c6aaaa19557fa7 100644 --- a/src/coreclr/utilcode/yieldprocessornormalized.cpp +++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp @@ -1,9 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#ifndef FEATURE_NATIVEAOT -#include "stdafx.h" -#endif #include "yieldprocessornormalized.h" bool YieldProcessorNormalization::s_isMeasurementScheduled; diff --git a/src/coreclr/vm/synch.h b/src/coreclr/vm/synch.h index d07d89b2d5f772..72e19f1c33b602 100644 --- a/src/coreclr/vm/synch.h +++ b/src/coreclr/vm/synch.h @@ -134,7 +134,6 @@ class CLREvent : public CLREventBase }; -#ifndef FEATURE_NATIVEAOT // CLREventStatic // Same as CLREvent, but intended to be used for global variables. // Instances may leak their handle, because of the order in which @@ -143,7 +142,6 @@ class CLREvent : public CLREventBase class CLREventStatic : public CLREventBase { }; -#endif BOOL CLREventWaitWithTry(CLREventBase *pEvent, DWORD timeout, BOOL fAlertable, DWORD *pStatus); #endif diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index 0d6464bbe43d42..ff1343d4fce9ae 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -1,10 +1,6 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -#ifndef FEATURE_NATIVEAOT -#include "finalizerthread.h" -#endif - enum class NormalizationState : uint8_t { Uninitialized, @@ -25,8 +21,6 @@ static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount]; static int s_nextMeasurementIndex; static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield; -static LARGE_INTEGER li; - void RhEnableFinalization(); inline unsigned int GetTickCountPortable() @@ -43,6 +37,7 @@ static uint64_t GetPerformanceCounter() #ifdef FEATURE_NATIVEAOT return PalQueryPerformanceCounter(); #else + LARGE_INTEGER li; QueryPerformanceCounter(&li); return li.QuadPart; #endif From 51c457344d84932985edfb9218ee8e977e192a8b Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 2 Jul 2024 17:04:34 -0700 Subject: [PATCH 22/23] Fix build --- src/coreclr/vm/yieldprocessornormalizedshared.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp index ff1343d4fce9ae..05daee21947376 100644 --- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp +++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp @@ -157,6 +157,7 @@ void YieldProcessorNormalization::PerformMeasurement() #ifdef FEATURE_NATIVEAOT if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000) #else + LARGE_INTEGER li; if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000) #endif { From e8e1290d0b4985111cd985be31e377fd403b16c6 Mon Sep 17 00:00:00 2001 From: Eduardo Manuel Velarde Polar Date: Tue, 2 Jul 2024 17:43:13 -0700 Subject: [PATCH 23/23] Fix PalInterlocked --- src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h index 0f05e6143b3a48..1f2a74dcd15100 100644 --- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h +++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h @@ -62,7 +62,7 @@ FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int int64_t iOldValue; do { iOldValue = *pDst; - } while (PalInterlockedCompareExchange64(Target, + } while (PalInterlockedCompareExchange64(pDst, iValue, iOldValue) != iOldValue); return iOldValue;