From 392c6528f0f9a3fd649c647cb4336687002f62d4 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 18 Jun 2024 16:20:58 -0700
Subject: [PATCH 01/23] Initial commit

---
 src/coreclr/inc/yieldprocessornormalized.h    |  33 ++
 .../Runtime/yieldprocessornormalized.cpp      | 103 +-----
 .../Runtime/yieldprocessornormalized.h        | 227 +-----------
 src/coreclr/vm/finalizerthread.h              |   9 +
 src/coreclr/vm/synch.h                        |   2 +
 src/coreclr/vm/yieldprocessornormalized.cpp   | 294 +---------------
 .../vm/yieldprocessornormalizedshared.cpp     | 330 ++++++++++++++++++
 7 files changed, 378 insertions(+), 620 deletions(-)
 create mode 100644 src/coreclr/vm/yieldprocessornormalizedshared.cpp
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 121e60b033356d..c8566e95749153 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -6,7 +6,11 @@
 // Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
 // the intention is to use the system-default implementation of YieldProcessor().
 #define HAS_SYSTEM_YIELDPROCESSOR
+#ifdef FEATURE_NATIVEAOT
+FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); } // NativeAOT builds yield via PalYieldProcessor()
+#else
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#endif // FEATURE_NATIVEAOT
 #ifdef YieldProcessor
 #undef YieldProcessor
 #endif
@@ -20,6 +24,35 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
     T() = delete; \
     DISABLE_COPY(T)
 
+#ifdef FEATURE_NATIVEAOT
+#define static_assert_no_msg( cond ) static_assert( cond, #cond )
+// TODO: no GetTickCount equivalent is wired up for NativeAOT yet. This stub always returns 0, so elapsed-time checks built on it never see time advance.
+inline unsigned int GetTickCount() { // inline: defined in a header, so it must not emit one external definition per including translation unit
+    return 0;
+}
+#define SIZE_T uintptr_t
+// TODO: verify that these typedefs have the widths the shared code expects (UINT8: 8-bit unsigned, UINT64: 64-bit unsigned)
+typedef BYTE UINT8;
+typedef ULONGLONG UINT64;
+
+template <typename T>
+T Min(T v1, T v2)
+{
+    // Returns v1 when v1 < v2, otherwise v2 (so v2 on a tie). Contract annotation is commented out here: STATIC_CONTRACT_LEAF;
+    return v1 < v2 ? v1 : v2;
+}
+
+template <typename T>
+T Max(T v1, T v2)
+{
+    // Returns v1 when v1 > v2, otherwise v2 (so v2 on a tie). Contract annotation is commented out here: STATIC_CONTRACT_LEAF;
+    return v1 > v2 ? v1 : v2;
+}
+
+void InitializeYieldProcessorNormalizedCrst();
+void EnsureYieldProcessorNormalizedInitialized();
+#endif
+
 class YieldProcessorNormalization
 {
 public:
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
index 444d52b0114c03..a9eb72a147d6d0 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -14,105 +14,6 @@
 #include "slist.h"
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
+#include "../../vm/synch.h"
 
-#define ULONGLONG int64_t
-
-static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
-static CrstStatic s_initializeYieldProcessorNormalizedCrst;
-
-// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
-// tuned for Skylake processors
-unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
-unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
-
-void InitializeYieldProcessorNormalizedCrst()
-{
-    WRAPPER_NO_CONTRACT;
-    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
-}
-
-static void InitializeYieldProcessorNormalized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);
-
-    if (s_isYieldProcessorNormalizedInitialized)
-    {
-        return;
-    }
-
-    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
-    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
-    const int MeasureDurationMs = 10;
-    const int NsPerSecond = 1000 * 1000 * 1000;
-
-    ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
-
-    if (ticksPerSecond < 1000 / MeasureDurationMs)
-    {
-        // High precision clock not available or clock resolution is too low, resort to defaults
-        s_isYieldProcessorNormalizedInitialized = true;
-        return;
-    }
-
-    // Measure the nanosecond delay per yield
-    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
-    unsigned int yieldCount = 0;
-      ULONGLONG startTicks = PalQueryPerformanceCounter();
-    ULONGLONG elapsedTicks;
-    do
-    {
-        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
-        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
-        // low microsecond range.
-        for (int i = 0; i < 1000; ++i)
-        {
-            System_YieldProcessor();
-        }
-        yieldCount += 1000;
-
-        ULONGLONG nowTicks = PalQueryPerformanceCounter();
-        elapsedTicks = nowTicks - startTicks;
-    } while (elapsedTicks < measureDurationTicks);
-    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
-    if (nsPerYield < 1)
-    {
-        nsPerYield = 1;
-    }
-
-    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
-    // value is naturally limited to MinNsPerNormalizedYield.
-    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
-    if (yieldsPerNormalizedYield < 1)
-    {
-        yieldsPerNormalizedYield = 1;
-    }
-    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    int optimalMaxNormalizedYieldsPerSpinIteration =
-        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
-    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
-    {
-        optimalMaxNormalizedYieldsPerSpinIteration = 1;
-    }
-
-    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
-    s_isYieldProcessorNormalizedInitialized = true;
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-}
-
-void EnsureYieldProcessorNormalizedInitialized()
-{
-    WRAPPER_NO_CONTRACT;
-
-    if (!s_isYieldProcessorNormalizedInitialized)
-    {
-        InitializeYieldProcessorNormalized();
-    }
-}
+#include "../../vm/yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
index 8c74bf3cfe3002..6f29c02308df05 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
@@ -1,229 +1,4 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#pragma once
-
-#include <limits.h>
-
-// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
-// the intention is to use the system-default implementation of YieldProcessor().
-#define HAS_SYSTEM_YIELDPROCESSOR
-FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
-#ifdef YieldProcessor
-#undef YieldProcessor
-#endif
-#define YieldProcessor Dont_Use_YieldProcessor
-#ifdef PalYieldProcessor
-#undef PalYieldProcessor
-#endif
-#define PalYieldProcessor Dont_Use_PalYieldProcessor
-
-#define SIZE_T uintptr_t
-
-const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
-const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
-
-extern unsigned int g_yieldsPerNormalizedYield;
-extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
-
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
-
-class YieldProcessorNormalizationInfo
-{
-private:
-    unsigned int yieldsPerNormalizedYield;
-    unsigned int optimalMaxNormalizedYieldsPerSpinIteration;
-    unsigned int optimalMaxYieldsPerSpinIteration;
-
-public:
-    YieldProcessorNormalizationInfo()
-        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
-        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
-        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-    }
-
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &);
-    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
-    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
-};
-
-// See YieldProcessorNormalized() for preliminary info. Typical usage:
-//     if (!condition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalized(normalizationInfo);
-//         } while (!condition);
-//     }
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
-{
-    unsigned int n = normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
-// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
-//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
-//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessary increase CPU usage
-//     and decrease scalability of the operation.
-//         while(!condition)
-//         {
-//             YieldProcessorNormalized();
-//         }
-//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
-//     condition, otherwise it may unnecessarily increase latency of the operation
-//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
-//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
-//     issue above on later iterations.
-//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
-//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
-//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
-FORCEINLINE void YieldProcessorNormalized()
-{
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo());
-}
-
-// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
-//     if (!moreExpensiveCondition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalized(normalizationInfo, 2);
-//         } while (!moreExpensiveCondition);
-//     }
-FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
-{
-    _ASSERTE(count != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
-        if (count > MaxCount)
-        {
-            count = MaxCount;
-        }
-    }
-
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
-// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
-//     while(!moreExpensiveCondition)
-//     {
-//         YieldProcessorNormalized(2);
-//     }
-FORCEINLINE void YieldProcessorNormalized(unsigned int count)
-{
-    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count);
-}
-
-// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
-// info. Typical usage:
-//     if (!condition)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
-//         } while (!condition);
-//     }
-FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int preSkylakeCount)
-{
-    _ASSERTE(preSkylakeCount != 0);
-
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
-    {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
-        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
-        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
-        if (preSkylakeCount > MaxCount)
-        {
-            preSkylakeCount = MaxCount;
-        }
-    }
-
-    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
-    if (n == 0)
-    {
-        n = 1;
-    }
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
-
-// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
-// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
-// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
-// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
-//     while(!condition)
-//     {
-//         YieldProcessorNormalizedForPreSkylakeCount(100);
-//     }
-FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
-{
-    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount);
-}
-
-// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
-// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
-// iteration exponentially up to a limit. Typical usage:
-//     if (!conditionThatMayNotBeSatisfiedSoon)
-//     {
-//         YieldProcessorNormalizationInfo normalizationInfo;
-//         do
-//         {
-//             YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
-//         } while (!conditionThatMayNotBeSatisfiedSoon);
-//     }
-FORCEINLINE void YieldProcessorWithBackOffNormalized(
-    const YieldProcessorNormalizationInfo &normalizationInfo,
-    unsigned int spinIteration)
-{
-    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
-    // InitializeYieldProcessorNormalized()
-    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
-        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1;
-    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    // This shift value should be adjusted based on the asserted condition below
-    const uint8_t MaxShift = 3;
-    static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
-
-    unsigned int n;
-    if (spinIteration <= MaxShift &&
-        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
-    {
-        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield;
-    }
-    else
-    {
-        n = normalizationInfo.optimalMaxYieldsPerSpinIteration;
-    }
-    _ASSERTE(n != 0);
-    do
-    {
-        System_YieldProcessor();
-    } while (--n != 0);
-}
+#include "../../inc/yieldprocessornormalized.h"
diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h
index 5ea3cca02bebfe..3fb9822530a776 100644
--- a/src/coreclr/vm/finalizerthread.h
+++ b/src/coreclr/vm/finalizerthread.h
@@ -5,6 +5,13 @@
 #ifndef _FINALIZER_THREAD_H_
 #define _FINALIZER_THREAD_H_
 
+#ifdef FEATURE_NATIVEAOT
+typedef void VOID;
+GPTR_IMPL(Thread, g_pFinalizerThread);
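+// Global state variable indicating if the EE has been started up. NOTE(review): this is a definition in a header; confirm only one NativeAOT translation unit includes it.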
+Volatile<BOOL> g_fEEStarted = FALSE;
+#endif
+
 class FinalizerThread
 {
     static BOOL fQuitFinalizer;
@@ -50,6 +57,7 @@ class FinalizerThread
 
     static OBJECTREF GetNextFinalizableObject();
 
+#ifndef FEATURE_NATIVEAOT
     static void RaiseShutdownEvents()
     {
         WRAPPER_NO_CONTRACT;
@@ -64,6 +72,7 @@ class FinalizerThread
             hEventFinalizerToShutDown->Wait(INFINITE, /*alertable*/ TRUE);
         }
     }
+#endif
 
     static void FinalizerThreadWait(DWORD timeout = INFINITE);
 
diff --git a/src/coreclr/vm/synch.h b/src/coreclr/vm/synch.h
index 72e19f1c33b602..d07d89b2d5f772 100644
--- a/src/coreclr/vm/synch.h
+++ b/src/coreclr/vm/synch.h
@@ -134,6 +134,7 @@ class CLREvent : public CLREventBase
 };
 
 
+#ifndef FEATURE_NATIVEAOT
 // CLREventStatic
 //   Same as CLREvent, but intended to be used for global variables.
 //   Instances may leak their handle, because of the order in which
@@ -142,6 +143,7 @@ class CLREvent : public CLREventBase
 class CLREventStatic : public CLREventBase
 {
 };
+#endif
 
 BOOL CLREventWaitWithTry(CLREventBase *pEvent, DWORD timeout, BOOL fAlertable, DWORD *pStatus);
 #endif
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 14166de34dd641..9b4c685be25460 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -7,296 +7,4 @@
 
 #include "finalizerthread.h"
 
-enum class NormalizationState : UINT8
-{
-    Uninitialized,
-    Initialized,
-    Failed
-};
-
-static const int NsPerYieldMeasurementCount = 8;
-static const unsigned int MeasurementPeriodMs = 4000;
-
-static const unsigned int NsPerS = 1000 * 1000 * 1000;
-
-static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
-static unsigned int s_previousNormalizationTimeMs;
-
-static UINT64 s_performanceCounterTicksPerS;
-static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
-static int s_nextMeasurementIndex;
-static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
-
-static unsigned int DetermineMeasureDurationUs()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
-    // if the overhead seems high relative to the measure duration.
-    unsigned int measureDurationUs = 1;
-    LARGE_INTEGER li;
-    QueryPerformanceCounter(&li);
-    UINT64 startTicks = li.QuadPart;
-    QueryPerformanceCounter(&li);
-    UINT64 elapsedTicks = li.QuadPart - startTicks;
-    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
-    {
-        measureDurationUs *= 4;
-    }
-    return measureDurationUs;
-}
-
-static double MeasureNsPerYield(unsigned int measureDurationUs)
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
-    UINT64 ticksPerS = s_performanceCounterTicksPerS;
-    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
-
-    LARGE_INTEGER li;
-    QueryPerformanceCounter(&li);
-    UINT64 startTicks = li.QuadPart;
-
-    for (int i = 0; i < yieldCount; ++i)
-    {
-        System_YieldProcessor();
-    }
-
-    QueryPerformanceCounter(&li);
-    UINT64 elapsedTicks = li.QuadPart - startTicks;
-    while (elapsedTicks < measureDurationTicks)
-    {
-        int nextYieldCount =
-            Max(4,
-                elapsedTicks == 0
-                    ? yieldCount / 4
-                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
-        for (int i = 0; i < nextYieldCount; ++i)
-        {
-            System_YieldProcessor();
-        }
-
-        QueryPerformanceCounter(&li);
-        elapsedTicks = li.QuadPart - startTicks;
-        yieldCount += nextYieldCount;
-    }
-
-    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
-    const double MinNsPerYield = 0.1;
-
-    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
-    // really take this long. Limit the maximum to keep the recorded values reasonable.
-    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
-
-    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
-}
-
-void YieldProcessorNormalization::PerformMeasurement()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_PREEMPTIVE;
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_isMeasurementScheduled);
-
-    double latestNsPerYield;
-    if (s_normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-
-        int nextMeasurementIndex = s_nextMeasurementIndex;
-        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
-        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
-        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
-        {
-            nextMeasurementIndex = 0;
-        }
-        s_nextMeasurementIndex = nextMeasurementIndex;
-    }
-    else if (s_normalizationState == NormalizationState::Uninitialized)
-    {
-        LARGE_INTEGER li;
-        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
-        {
-            // High precision clock not available or clock resolution is too low, resort to defaults
-            s_normalizationState = NormalizationState::Failed;
-            return;
-        }
-        s_performanceCounterTicksPerS = li.QuadPart;
-
-        unsigned int measureDurationUs = DetermineMeasureDurationUs();
-        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-        {
-            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
-            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
-            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
-            {
-                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
-            }
-
-            if (i < NsPerYieldMeasurementCount - 1)
-            {
-                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-            }
-        }
-    }
-    else
-    {
-        _ASSERTE(s_normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
-    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = s_nsPerYieldMeasurements[i];
-        if (nsPerYield < establishedNsPerYield)
-        {
-            establishedNsPerYield = nsPerYield;
-        }
-    }
-    if (establishedNsPerYield != s_establishedNsPerYield)
-    {
-        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
-    }
-
-    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-
-    // Calculate the number of yields required to span the duration of a normalized yield
-    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
-    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
-    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    s_optimalMaxNormalizedYieldsPerSpinIteration =
-        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
-    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-
-    s_previousNormalizationTimeMs = GetTickCount();
-    s_normalizationState = NormalizationState::Initialized;
-    s_isMeasurementScheduled = false;
-}
-
-
-void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
-    if (normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-    }
-    else if (normalizationState == NormalizationState::Uninitialized)
-    {
-    }
-    else
-    {
-        _ASSERTE(normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-    // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
-    if (s_isMeasurementScheduled || !g_fEEStarted)
-    {
-        return;
-    }
-
-    s_isMeasurementScheduled = true;
-    FinalizerThread::EnableFinalization();
-}
-
-
-void YieldProcessorNormalization::FireMeasurementEvents()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    if (!EventEnabledYieldProcessorMeasurement())
-    {
-        return;
-    }
-
-    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
-    // recorded information, so try to enumerate the array with some care.
-    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
-    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
-    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
-        if (nsPerYield != 0) // the array may not be fully initialized yet
-        {
-            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
-        }
-
-        if (++nextIndex >= NsPerYieldMeasurementCount)
-        {
-            nextIndex = 0;
-        }
-    }
-}
-
-double YieldProcessorNormalization::AtomicLoad(double *valueRef)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    return VolatileLoadWithoutBarrier(valueRef);
-#else
-    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
-#endif
-}
-
-void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    *valueRef = value;
-#else
-    InterlockedExchangeT(valueRef, value);
-#endif
-}
-
+#include "yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
new file mode 100644
index 00000000000000..b012d221ef2913
--- /dev/null
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -0,0 +1,330 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+#include "finalizerthread.h"
+
+enum class NormalizationState : UINT8
+{
+    Uninitialized,
+    Initialized,
+    Failed
+};
+
+static const int NsPerYieldMeasurementCount = 8;
+static const unsigned int MeasurementPeriodMs = 4000;
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs;
+
+static UINT64 s_performanceCounterTicksPerS;
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
+static int s_nextMeasurementIndex;
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
+
+static LARGE_INTEGER li;
+
+static UINT64 GetPerformanceCounter()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+#ifdef FEATURE_NATIVEAOT
+    return PalQueryPerformanceCounter();
+#else
+    QueryPerformanceCounter(&li);
+    return li.QuadPart;
+#endif
+}
+
+static unsigned int DetermineMeasureDurationUs()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+    // if the overhead seems high relative to the measure duration.
+    unsigned int measureDurationUs = 1;
+    UINT64 startTicks = GetPerformanceCounter();
+    UINT64 elapsedTicks = GetPerformanceCounter() - startTicks;
+    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
+    {
+        measureDurationUs *= 4;
+    }
+    return measureDurationUs;
+}
+
+static double MeasureNsPerYield(unsigned int measureDurationUs)
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
+    UINT64 ticksPerS = s_performanceCounterTicksPerS;
+    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
+
+    UINT64 startTicks = GetPerformanceCounter();
+
+    for (int i = 0; i < yieldCount; ++i)
+    {
+        System_YieldProcessor();
+    }
+
+    UINT64 elapsedTicks = GetPerformanceCounter() - startTicks;
+    while (elapsedTicks < measureDurationTicks)
+    {
+        int nextYieldCount =
+            Max(4,
+                elapsedTicks == 0
+                    ? yieldCount / 4
+                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
+        for (int i = 0; i < nextYieldCount; ++i)
+        {
+            System_YieldProcessor();
+        }
+
+        elapsedTicks = GetPerformanceCounter() - startTicks;
+        yieldCount += nextYieldCount;
+    }
+
+    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+    const double MinNsPerYield = 0.1;
+
+    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+    // really take this long. Limit the maximum to keep the recorded values reasonable.
+    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
+}
+
+void YieldProcessorNormalization::PerformMeasurement()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+#ifndef FEATURE_NATIVEAOT
+        MODE_PREEMPTIVE;
+#endif
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_isMeasurementScheduled);
+
+    double latestNsPerYield;
+    if (s_normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return;
+        }
+
+        int nextMeasurementIndex = s_nextMeasurementIndex;
+        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
+        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+        {
+            nextMeasurementIndex = 0;
+        }
+        s_nextMeasurementIndex = nextMeasurementIndex;
+    }
+    else if (s_normalizationState == NormalizationState::Uninitialized)
+    {
+#ifdef FEATURE_NATIVEAOT
+        if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000)
+#else
+        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
+#endif
+        {
+            // High precision clock not available or clock resolution is too low, resort to defaults
+            s_normalizationState = NormalizationState::Failed;
+            return;
+        }
+
+#ifndef FEATURE_NATIVEAOT
+        s_performanceCounterTicksPerS = li.QuadPart;
+#endif
+
+        unsigned int measureDurationUs = DetermineMeasureDurationUs();
+        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+        {
+            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
+            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
+            {
+                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
+            }
+#ifndef FEATURE_NATIVEAOT
+            if (i < NsPerYieldMeasurementCount - 1)
+            {
+                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+            }
+#endif
+        }
+    }
+    else
+    {
+        _ASSERTE(s_normalizationState == NormalizationState::Failed);
+        return;
+    }
+
+    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
+    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+    {
+        double nsPerYield = s_nsPerYieldMeasurements[i];
+        if (nsPerYield < establishedNsPerYield)
+        {
+            establishedNsPerYield = nsPerYield;
+        }
+    }
+    if (establishedNsPerYield != s_establishedNsPerYield)
+    {
+        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+    }
+
+#ifndef FEATURE_NATIVEAOT
+    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+#endif
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    s_optimalMaxNormalizedYieldsPerSpinIteration =
+        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
+
+    s_previousNormalizationTimeMs = GetTickCount();
+    s_normalizationState = NormalizationState::Initialized;
+    s_isMeasurementScheduled = false;
+}
+
+
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+    if (normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return;
+        }
+    }
+    else if (normalizationState == NormalizationState::Uninitialized)
+    {
+    }
+    else
+    {
+        _ASSERTE(normalizationState == NormalizationState::Failed);
+        return;
+    }
+
+    // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
+    if (s_isMeasurementScheduled || !g_fEEStarted)
+    {
+        return;
+    }
+
+    s_isMeasurementScheduled = true;
+    FinalizerThread::EnableFinalization();
+}
+
+// EventEnabledYieldProcessorMeasurement and FireEtwYieldProcessorMeasurement aren't available for AOT
+#ifndef FEATURE_NATIVEAOT
+void YieldProcessorNormalization::FireMeasurementEvents()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    if (!EventEnabledYieldProcessorMeasurement())
+    {
+        return;
+    }
+
+    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+    // recorded information, so try to enumerate the array with some care.
+    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
+    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
+    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
+    {
+        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        if (nsPerYield != 0) // the array may not be fully initialized yet
+        {
+            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+        }
+
+        if (++nextIndex >= NsPerYieldMeasurementCount)
+        {
+            nextIndex = 0;
+        }
+    }
+}
+#endif
+
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    return VolatileLoadWithoutBarrier(valueRef);
+#else
+    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
+#endif
+}
+
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    *valueRef = value;
+#else
+    InterlockedExchangeT(valueRef, value);
+#endif
+}
+

From 9237b9ffd627b5dda34fc0f36421100f339f2cb2 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 19 Jun 2024 14:08:13 -0700
Subject: [PATCH 02/23] Use PalGetTickCount64

---
 src/coreclr/inc/yieldprocessornormalized.h        |  4 ----
 .../nativeaot/Runtime/yieldprocessornormalized.h  |  1 +
 src/coreclr/vm/yieldprocessornormalizedshared.cpp | 15 ++++++++++++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index c8566e95749153..3b60b3ea2dcccf 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -26,10 +26,6 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
 
 #ifdef FEATURE_NATIVEAOT
 #define static_assert_no_msg( cond ) static_assert( cond, #cond )
-// I haven't seen an equivalent for GetTickCount yet, defining it like this for now
-unsigned int GetTickCount() {
-    return 0;
-}
 #define SIZE_T uintptr_t
 // verify these are correct
 typedef BYTE UINT8;
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
index 6f29c02308df05..5539ebf90561bc 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
@@ -1,4 +1,5 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#include "PalRedhawk.h"
 #include "../../inc/yieldprocessornormalized.h"
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index b012d221ef2913..226439411e34cb 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -25,6 +25,15 @@ static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPer
 
 static LARGE_INTEGER li;
 
+inline unsigned int GetTickCountPortable()
+{
+#ifdef FEATURE_NATIVEAOT
+    return (unsigned int)PalGetTickCount64();
+#else
+    return GetTickCount();
+#endif
+}
+
 static UINT64 GetPerformanceCounter()
 {
     CONTRACTL
@@ -140,7 +149,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     double latestNsPerYield;
     if (s_normalizationState == NormalizationState::Initialized)
     {
-        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
         {
             return;
         }
@@ -226,7 +235,7 @@ void YieldProcessorNormalization::PerformMeasurement()
 
     GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
 
-    s_previousNormalizationTimeMs = GetTickCount();
+    s_previousNormalizationTimeMs = GetTickCountPortable();
     s_normalizationState = NormalizationState::Initialized;
     s_isMeasurementScheduled = false;
 }
@@ -245,7 +254,7 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
     NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
     if (normalizationState == NormalizationState::Initialized)
     {
-        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
         {
             return;
         }

From 830d8d091b91ad035b126c4fa43aa7e38e7a2f8d Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 19 Jun 2024 14:52:29 -0700
Subject: [PATCH 03/23] Add limits.h

---
 src/coreclr/inc/yieldprocessornormalized.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 3b60b3ea2dcccf..4ee1c6170961f9 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,6 +3,9 @@
 
 #pragma once
 
+// not sure if necessary
+#include <limits.h>
+
 // Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
 // the intention is to use the system-default implementation of YieldProcessor().
 #define HAS_SYSTEM_YIELDPROCESSOR
@@ -15,6 +18,10 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
 #undef YieldProcessor
 #endif
 #define YieldProcessor Dont_Use_YieldProcessor
+#ifdef PalYieldProcessor
+#undef PalYieldProcessor
+#endif
+#define PalYieldProcessor Dont_Use_PalYieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \

From d0a884c45521da997067b8faa25423fc4e15ad9a Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 19 Jun 2024 15:31:36 -0700
Subject: [PATCH 04/23] Declare g_pFinalizerThread for Windows only

---
 src/coreclr/vm/finalizerthread.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h
index 3fb9822530a776..c70d578557e575 100644
--- a/src/coreclr/vm/finalizerthread.h
+++ b/src/coreclr/vm/finalizerthread.h
@@ -7,7 +7,9 @@
 
 #ifdef FEATURE_NATIVEAOT
 typedef void VOID;
+#ifdef TARGET_WINDOWS
 GPTR_IMPL(Thread, g_pFinalizerThread);
+#endif
 // Global state variable indicating if the EE has been started up.
 Volatile<BOOL> g_fEEStarted = FALSE;
 #endif

From 165bbb9d7c58e3cae4ec279e16821349788dea43 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 19 Jun 2024 16:19:40 -0700
Subject: [PATCH 05/23] PR comments

---
 src/coreclr/inc/yieldprocessornormalized.h        |  8 --------
 src/coreclr/vm/finalizerthread.h                  | 12 +++++++-----
 src/coreclr/vm/yieldprocessornormalizedshared.cpp |  4 ++++
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 4ee1c6170961f9..0233146eb096aa 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -14,14 +14,6 @@ FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
 #else
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
 #endif
-#ifdef YieldProcessor
-#undef YieldProcessor
-#endif
-#define YieldProcessor Dont_Use_YieldProcessor
-#ifdef PalYieldProcessor
-#undef PalYieldProcessor
-#endif
-#define PalYieldProcessor Dont_Use_PalYieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \
diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h
index c70d578557e575..bda1e5e1a758b9 100644
--- a/src/coreclr/vm/finalizerthread.h
+++ b/src/coreclr/vm/finalizerthread.h
@@ -7,11 +7,11 @@
 
 #ifdef FEATURE_NATIVEAOT
 typedef void VOID;
-#ifdef TARGET_WINDOWS
-GPTR_IMPL(Thread, g_pFinalizerThread);
-#endif
-// Global state variable indicating if the EE has been started up.
-Volatile<BOOL> g_fEEStarted = FALSE;
+
+// #ifdef TARGET_WINDOWS
+// GPTR_IMPL(Thread, g_pFinalizerThread);
+// #endif
+
 #endif
 
 class FinalizerThread
@@ -44,12 +44,14 @@ class FinalizerThread
     static void FinalizeAllObjects();
 
 public:
+#ifndef FEATURE_NATIVEAOT
     static Thread* GetFinalizerThread()
     {
         LIMITED_METHOD_CONTRACT;
         _ASSERTE(g_pFinalizerThread != 0);
         return g_pFinalizerThread;
     }
+#endif
 
     static BOOL IsCurrentThreadFinalizer();
 
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 226439411e34cb..f898ff0ea18b23 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -268,8 +268,12 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
         return;
     }
 
+#ifdef FEATURE_NATIVEAOT
+    if (s_isMeasurementScheduled)
+#else
     // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
     if (s_isMeasurementScheduled || !g_fEEStarted)
+#endif
     {
         return;
     }

From b089bac2619c69fbf3bc524b1a74e97ff52cb748 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 19 Jun 2024 17:37:44 -0700
Subject: [PATCH 06/23] Fix build/x86

---
 src/coreclr/vm/yieldprocessornormalizedshared.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index f898ff0ea18b23..ec0be79165042b 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -325,9 +325,14 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef)
 
 #ifdef TARGET_64BIT
     return VolatileLoadWithoutBarrier(valueRef);
+#else
+#ifdef FEATURE_NATIVEAOT
+    // return *PalInterlockedCompareExchange(valueRef, 0.0, 0.0);
+    return *valueRef; // TODO: fix this
 #else
     return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
 #endif
+#endif
 }
 
 void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
@@ -336,8 +341,12 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
 
 #ifdef TARGET_64BIT
     *valueRef = value;
+#else
+#ifdef FEATURE_NATIVEAOT
+    PalInterlockedExchangePointer(valueRef, value); // TODO: verify it works or fix it
 #else
     InterlockedExchangeT(valueRef, value);
 #endif
+#endif
 }
 

From f4ed8e8f2332ef1ab1171e5e16c7a151e22e4654 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Fri, 21 Jun 2024 12:38:19 -0700
Subject: [PATCH 07/23] Remove finalizer thread from native aot

---
 src/coreclr/inc/yieldprocessornormalized.h         |  2 --
 src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp |  3 ---
 src/coreclr/nativeaot/Runtime/MiscHelpers.cpp      |  3 +++
 src/coreclr/nativeaot/Runtime/startup.cpp          |  2 --
 src/coreclr/utilcode/yieldprocessornormalized.cpp  |  4 +++-
 src/coreclr/vm/yieldprocessornormalizedshared.cpp  | 14 ++++----------
 6 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 0233146eb096aa..16978adbace6b8 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -44,8 +44,6 @@ T Max(T v1, T v2)
     return v1 > v2 ? v1 : v2;
 }
 
-void InitializeYieldProcessorNormalizedCrst();
-void EnsureYieldProcessorNormalizedInitialized();
 #endif
 
 class YieldProcessorNormalization
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index 7a4d8a853a8b74..0c6454407073e7 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -46,9 +46,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
-    // We have some time until the first finalization request - use the time to calibrate normalized waits.
-    EnsureYieldProcessorNormalizedInitialized();
-
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
index c5bbcc22842776..7927de44195341 100644
--- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
@@ -37,6 +37,9 @@
 #include "RhConfig.h"
 #include <minipal/cpuid.h>
 
+// I believe there is a better way to use this file but doing this for now
+#include "../../utilcode/yieldprocessornormalized.cpp"
+
 FCIMPL0(void, RhDebugBreak)
 {
     PalDebugBreak();
diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp
index f87bc947d970ac..947d8fac05a205 100644
--- a/src/coreclr/nativeaot/Runtime/startup.cpp
+++ b/src/coreclr/nativeaot/Runtime/startup.cpp
@@ -130,8 +130,6 @@ static bool InitDLL(HANDLE hPalInstance)
 #endif
 #endif // !USE_PORTABLE_HELPERS
 
-    InitializeYieldProcessorNormalizedCrst();
-
 #ifdef STRESS_LOG
     uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
     uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 020d8d7cc79e4e..4b5a5537e49f79 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -1,7 +1,9 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#ifndef FEATURE_NATIVEAOT
 #include "stdafx.h"
+#endif
 #include "yieldprocessornormalized.h"
 
 bool YieldProcessorNormalization::s_isMeasurementScheduled;
@@ -14,4 +16,4 @@ unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIte
         (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration /
         YieldProcessorNormalization::TargetNsPerNormalizedYield +
         0.5
-    );
+    );
\ No newline at end of file
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index ec0be79165042b..ccbddd1d24b5dd 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -1,7 +1,9 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#ifndef FEATURE_NATIVEAOT
 #include "finalizerthread.h"
+#endif
 
 enum class NormalizationState : UINT8
 {
@@ -36,16 +38,6 @@ inline unsigned int GetTickCountPortable()
 
 static UINT64 GetPerformanceCounter()
 {
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
 #ifdef FEATURE_NATIVEAOT
     return PalQueryPerformanceCounter();
 #else
@@ -279,7 +271,9 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
     }
 
     s_isMeasurementScheduled = true;
+#ifndef FEATURE_NATIVEAOT
     FinalizerThread::EnableFinalization();
+#endif
 }
 
 // EventEnabledYieldProcessorMeasurement and FireEtwYieldProcessorMeasurement aren't available for AOT

From b64656829fa81d845acbc9109bcf36d6e2c8e815 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Fri, 21 Jun 2024 15:06:15 -0700
Subject: [PATCH 08/23] Remove unnecessary code

---
 src/coreclr/inc/yieldprocessornormalized.h       | 16 ++++------------
 .../utilcode/yieldprocessornormalized.cpp        |  2 +-
 src/coreclr/vm/finalizerthread.h                 | 13 -------------
 3 files changed, 5 insertions(+), 26 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 16978adbace6b8..532f167521dca6 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,6 @@
 
 #pragma once
 
-// not sure if necessary
-#include <limits.h>
-
-// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
-// the intention is to use the system-default implementation of YieldProcessor().
 #define HAS_SYSTEM_YIELDPROCESSOR
 #ifdef FEATURE_NATIVEAOT
 FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
@@ -24,9 +19,7 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
     DISABLE_COPY(T)
 
 #ifdef FEATURE_NATIVEAOT
-#define static_assert_no_msg( cond ) static_assert( cond, #cond )
 #define SIZE_T uintptr_t
-// verify these are correct
 typedef BYTE UINT8;
 typedef ULONGLONG UINT64;
 
@@ -43,7 +36,6 @@ T Max(T v1, T v2)
     // STATIC_CONTRACT_LEAF;
     return v1 > v2 ? v1 : v2;
 }
-
 #endif
 
 class YieldProcessorNormalization
@@ -295,10 +287,10 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
 {
     // This shift value should be adjusted based on the asserted conditions below
     const UINT8 MaxShift = 3;
-    static_assert_no_msg(
-        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-    static_assert_no_msg(
-        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
+    static_assert(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 4b5a5537e49f79..92d6ef943c9d7f 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -16,4 +16,4 @@ unsigned int YieldProcessorNormalization::s_optimalMaxNormalizedYieldsPerSpinIte
         (double)YieldProcessorNormalization::TargetMaxNsPerSpinIteration /
         YieldProcessorNormalization::TargetNsPerNormalizedYield +
         0.5
-    );
\ No newline at end of file
+    );
diff --git a/src/coreclr/vm/finalizerthread.h b/src/coreclr/vm/finalizerthread.h
index bda1e5e1a758b9..5ea3cca02bebfe 100644
--- a/src/coreclr/vm/finalizerthread.h
+++ b/src/coreclr/vm/finalizerthread.h
@@ -5,15 +5,6 @@
 #ifndef _FINALIZER_THREAD_H_
 #define _FINALIZER_THREAD_H_
 
-#ifdef FEATURE_NATIVEAOT
-typedef void VOID;
-
-// #ifdef TARGET_WINDOWS
-// GPTR_IMPL(Thread, g_pFinalizerThread);
-// #endif
-
-#endif
-
 class FinalizerThread
 {
     static BOOL fQuitFinalizer;
@@ -44,14 +35,12 @@ class FinalizerThread
     static void FinalizeAllObjects();
 
 public:
-#ifndef FEATURE_NATIVEAOT
     static Thread* GetFinalizerThread()
     {
         LIMITED_METHOD_CONTRACT;
         _ASSERTE(g_pFinalizerThread != 0);
         return g_pFinalizerThread;
     }
-#endif
 
     static BOOL IsCurrentThreadFinalizer();
 
@@ -61,7 +50,6 @@ class FinalizerThread
 
     static OBJECTREF GetNextFinalizableObject();
 
-#ifndef FEATURE_NATIVEAOT
     static void RaiseShutdownEvents()
     {
         WRAPPER_NO_CONTRACT;
@@ -76,7 +64,6 @@ class FinalizerThread
             hEventFinalizerToShutDown->Wait(INFINITE, /*alertable*/ TRUE);
         }
     }
-#endif
 
     static void FinalizerThreadWait(DWORD timeout = INFINITE);
 

From 412715876eda72280ef9933b9446fd80233e4806 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Fri, 21 Jun 2024 19:23:28 -0700
Subject: [PATCH 09/23] PR comments + Fix InterlockedExchange

---
 src/coreclr/gc/env/gcenv.os.h                 |  6 -----
 src/coreclr/inc/yieldprocessornormalized.h    | 27 ++++++++-----------
 .../Runtime/yieldprocessornormalized.cpp      |  1 -
 .../vm/yieldprocessornormalizedshared.cpp     | 23 ++++++++--------
 4 files changed, 22 insertions(+), 35 deletions(-)

diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h
index 75015b77262dd9..17640543bd3814 100644
--- a/src/coreclr/gc/env/gcenv.os.h
+++ b/src/coreclr/gc/env/gcenv.os.h
@@ -6,12 +6,6 @@
 #ifndef __GCENV_OS_H__
 #define __GCENV_OS_H__
 
-#ifdef HAS_SYSTEM_YIELDPROCESSOR
-// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
-#undef YieldProcessor
-#define YieldProcessor System_YieldProcessor
-#endif
-
 #define NUMA_NODE_UNDEFINED UINT16_MAX
 
 bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 532f167521dca6..42db5467f2344a 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#define HAS_SYSTEM_YIELDPROCESSOR
 #ifdef FEATURE_NATIVEAOT
 FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
 #else
@@ -19,10 +18,6 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
     DISABLE_COPY(T)
 
 #ifdef FEATURE_NATIVEAOT
-#define SIZE_T uintptr_t
-typedef BYTE UINT8;
-typedef ULONGLONG UINT64;
-
 template <typename T>
 T Min(T v1, T v2)
 {
@@ -162,9 +157,9 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 {
     _ASSERTE(count != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
@@ -172,7 +167,7 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
         }
     }
 
-    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
+    size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
     _ASSERTE(n != 0);
     do
     {
@@ -207,9 +202,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 {
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -218,7 +213,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
     {
         n = 1;
@@ -245,9 +240,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
 
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(SIZE_T) <= sizeof(unsigned int))
+    if (sizeof(size_t) <= sizeof(unsigned int))
     {
-        // On platforms with a small SIZE_T, prevent overflow on the multiply below
+        // On platforms with a small size_t, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -256,8 +251,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    SIZE_T n =
-        (SIZE_T)preSkylakeCount *
+    size_t n =
+        (size_t)preSkylakeCount *
         YieldProcessorNormalization::s_yieldsPerNormalizedYield /
         PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
@@ -286,7 +281,7 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     unsigned int spinIteration)
 {
     // This shift value should be adjusted based on the asserted conditions below
-    const UINT8 MaxShift = 3;
+    const uint8_t MaxShift = 3;
     static_assert(
         ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
     static_assert(
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
index a9eb72a147d6d0..2e8a7b8a8ad461 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -14,6 +14,5 @@
 #include "slist.h"
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
-#include "../../vm/synch.h"
 
 #include "../../vm/yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index ccbddd1d24b5dd..4d1ded67833760 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -5,7 +5,7 @@
 #include "finalizerthread.h"
 #endif
 
-enum class NormalizationState : UINT8
+enum class NormalizationState : uint8_t
 {
     Uninitialized,
     Initialized,
@@ -20,7 +20,7 @@ static const unsigned int NsPerS = 1000 * 1000 * 1000;
 static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
 static unsigned int s_previousNormalizationTimeMs;
 
-static UINT64 s_performanceCounterTicksPerS;
+static uint64_t s_performanceCounterTicksPerS;
 static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
 static int s_nextMeasurementIndex;
 static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
@@ -36,7 +36,7 @@ inline unsigned int GetTickCountPortable()
 #endif
 }
 
-static UINT64 GetPerformanceCounter()
+static uint64_t GetPerformanceCounter()
 {
 #ifdef FEATURE_NATIVEAOT
     return PalQueryPerformanceCounter();
@@ -63,8 +63,8 @@ static unsigned int DetermineMeasureDurationUs()
     // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
     // if the overhead seems high relative to the measure duration.
     unsigned int measureDurationUs = 1;
-    UINT64 startTicks = GetPerformanceCounter();
-    UINT64 elapsedTicks = GetPerformanceCounter() - startTicks;
+    uint64_t startTicks = GetPerformanceCounter();
+    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
     if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
     {
         measureDurationUs *= 4;
@@ -87,17 +87,17 @@ static double MeasureNsPerYield(unsigned int measureDurationUs)
     _ASSERTE(s_normalizationState != NormalizationState::Failed);
 
     int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
-    UINT64 ticksPerS = s_performanceCounterTicksPerS;
-    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
+    uint64_t ticksPerS = s_performanceCounterTicksPerS;
+    uint64_t measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
 
-    UINT64 startTicks = GetPerformanceCounter();
+    uint64_t startTicks = GetPerformanceCounter();
 
     for (int i = 0; i < yieldCount; ++i)
     {
         System_YieldProcessor();
     }
 
-    UINT64 elapsedTicks = GetPerformanceCounter() - startTicks;
+    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
     while (elapsedTicks < measureDurationTicks)
     {
         int nextYieldCount =
@@ -321,8 +321,7 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef)
     return VolatileLoadWithoutBarrier(valueRef);
 #else
 #ifdef FEATURE_NATIVEAOT
-    // return *PalInterlockedCompareExchange(valueRef, 0.0, 0.0);
-    return *valueRef; // TODO: fix this
+    return *PalInterlockedCompareExchange64((int64_t *)valueRef, 0, 0);
 #else
     return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
 #endif
@@ -337,7 +336,7 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
     *valueRef = value;
 #else
 #ifdef FEATURE_NATIVEAOT
-    PalInterlockedExchangePointer(valueRef, value); // TODO: verify it works or fix it
+    PalInterlockedExchange64((int64_t *) valueRef, value);
 #else
     InterlockedExchangeT(valueRef, value);
 #endif

From a86278282dc64dfcda5d93e955e1ace15f8d1e2e Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Mon, 24 Jun 2024 13:39:39 -0700
Subject: [PATCH 10/23] Add TODOs

---
 src/coreclr/inc/yieldprocessornormalized.h    | 2 ++
 src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 2 +-
 src/coreclr/vm/yieldprocessornormalized.cpp   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index 42db5467f2344a..b17659c983d729 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -21,6 +21,7 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
 template <typename T>
 T Min(T v1, T v2)
 {
+    // TODO: check it's safe to remove STATIC_CONTRACT_LEAF
     // STATIC_CONTRACT_LEAF;
     return v1 < v2 ? v1 : v2;
 }
@@ -28,6 +29,7 @@ T Min(T v1, T v2)
 template <typename T>
 T Max(T v1, T v2)
 {
+    // TODO: check it's safe to remove STATIC_CONTRACT_LEAF
     // STATIC_CONTRACT_LEAF;
     return v1 > v2 ? v1 : v2;
 }
diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
index 7927de44195341..6dd20382c06dbd 100644
--- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
@@ -37,7 +37,7 @@
 #include "RhConfig.h"
 #include <minipal/cpuid.h>
 
-// I believe there is a better way to use this file but doing this for now
+// TODO: Check if there's a better way to use this file
 #include "../../utilcode/yieldprocessornormalized.cpp"
 
 FCIMPL0(void, RhDebugBreak)
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 9b4c685be25460..258e30d634c7ce 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -7,4 +7,4 @@
 
 #include "finalizerthread.h"
 
-#include "yieldprocessornormalizedshared.cpp"
\ No newline at end of file
+#include "yieldprocessornormalizedshared.cpp"

From 73d3d71b888869fbb85287c28eefb9a1db256b0d Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Mon, 24 Jun 2024 22:49:07 -0700
Subject: [PATCH 11/23] Use max/min and RhEnableFinalization

---
 src/coreclr/inc/yieldprocessornormalized.h     | 18 ------------------
 .../vm/yieldprocessornormalizedshared.cpp      | 14 +++++++++-----
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index b17659c983d729..e37bf79f0c5089 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -17,24 +17,6 @@ FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
     T() = delete; \
     DISABLE_COPY(T)
 
-#ifdef FEATURE_NATIVEAOT
-template <typename T>
-T Min(T v1, T v2)
-{
-    // TODO: check it's safe to remove STATIC_CONTRACT_LEAF
-    // STATIC_CONTRACT_LEAF;
-    return v1 < v2 ? v1 : v2;
-}
-
-template <typename T>
-T Max(T v1, T v2)
-{
-    // TODO: check it's safe to remove STATIC_CONTRACT_LEAF
-    // STATIC_CONTRACT_LEAF;
-    return v1 > v2 ? v1 : v2;
-}
-#endif
-
 class YieldProcessorNormalization
 {
 public:
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 4d1ded67833760..7a1a6665a98852 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -27,6 +27,8 @@ static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPer
 
 static LARGE_INTEGER li;
 
+void RhEnableFinalization();
+
 inline unsigned int GetTickCountPortable()
 {
 #ifdef FEATURE_NATIVEAOT
@@ -101,7 +103,7 @@ static double MeasureNsPerYield(unsigned int measureDurationUs)
     while (elapsedTicks < measureDurationTicks)
     {
         int nextYieldCount =
-            Max(4,
+            max(4,
                 elapsedTicks == 0
                     ? yieldCount / 4
                     : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
@@ -121,7 +123,7 @@ static double MeasureNsPerYield(unsigned int measureDurationUs)
     // really take this long. Limit the maximum to keep the recorded values reasonable.
     const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
 
-    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
+    return max(MinNsPerYield, min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
 }
 
 void YieldProcessorNormalization::PerformMeasurement()
@@ -214,7 +216,7 @@ void YieldProcessorNormalization::PerformMeasurement()
 #endif
 
     // Calculate the number of yields required to span the duration of a normalized yield
-    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
+    unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
     _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
     s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
 
@@ -222,7 +224,7 @@ void YieldProcessorNormalization::PerformMeasurement()
     // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
     // better job of allowing other work to run.
     s_optimalMaxNormalizedYieldsPerSpinIteration =
-        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
+        max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
     _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
@@ -271,7 +273,9 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
     }
 
     s_isMeasurementScheduled = true;
-#ifndef FEATURE_NATIVEAOT
+#ifdef FEATURE_NATIVEAOT
+    RhEnableFinalization();
+#else
     FinalizerThread::EnableFinalization();
 #endif
 }

From 0d226dab0218e3760da497a216fcf693a7d8fbea Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Mon, 24 Jun 2024 22:53:49 -0700
Subject: [PATCH 12/23] Remove TODO

---
 src/coreclr/nativeaot/Runtime/MiscHelpers.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
index 6dd20382c06dbd..f6a5d809db9e03 100644
--- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
@@ -37,7 +37,6 @@
 #include "RhConfig.h"
 #include <minipal/cpuid.h>
 
-// TODO: Check if there's a better way to use this file
 #include "../../utilcode/yieldprocessornormalized.cpp"
 
 FCIMPL0(void, RhDebugBreak)

From 4c315c81cbcde9c3d8a290118fce5ef2001bc590 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 25 Jun 2024 14:10:24 -0700
Subject: [PATCH 13/23] Fix Interlocked calls

---
 src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h | 7 +++++++
 src/coreclr/vm/yieldprocessornormalizedshared.cpp        | 7 +++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 187ad26fb8bf11..766feab6e1d8ab 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -42,6 +42,13 @@ FORCEINLINE int32_t PalInterlockedExchange(_Inout_ int32_t volatile *pDst, int32
     return _InterlockedExchange((long volatile *)pDst, iValue);
 }
 
+EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
+#pragma intrinsic(_InterlockedExchange)
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{
+    return _InterlockedExchange64(pDst, iValue);
+}
+
 EXTERN_C long __PN__MACHINECALL_CDECL_OR_DEFAULT _InterlockedCompareExchange(long volatile *, long, long);
 #pragma intrinsic(_InterlockedCompareExchange)
 FORCEINLINE int32_t PalInterlockedCompareExchange(_Inout_ int32_t volatile *pDst, int32_t iValue, int32_t iComparand)
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 7a1a6665a98852..1d4406e6cbba41 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -325,7 +325,9 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef)
     return VolatileLoadWithoutBarrier(valueRef);
 #else
 #ifdef FEATURE_NATIVEAOT
-    return *PalInterlockedCompareExchange64((int64_t *)valueRef, 0, 0);
+    static_assert(sizeof(int64_t) == sizeof(double));
+    int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0);
+    return *(double*)(int64_t*)(&intRes);
 #else
     return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
 #endif
@@ -340,7 +342,8 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
     *valueRef = value;
 #else
 #ifdef FEATURE_NATIVEAOT
-    PalInterlockedExchange64((int64_t *) valueRef, value);
+    static_assert(sizeof(int64_t) == sizeof(double));
+    PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value);
 #else
     InterlockedExchangeT(valueRef, value);
 #endif

From 17dac7e4a9e422b88c7a119d3825b96ab769cbf6 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 25 Jun 2024 21:01:15 -0700
Subject: [PATCH 14/23] Fix static_assert

---
 src/coreclr/vm/yieldprocessornormalizedshared.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 1d4406e6cbba41..7e2a6caf548d30 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -325,7 +325,7 @@ double YieldProcessorNormalization::AtomicLoad(double *valueRef)
     return VolatileLoadWithoutBarrier(valueRef);
 #else
 #ifdef FEATURE_NATIVEAOT
-    static_assert(sizeof(int64_t) == sizeof(double));
+    static_assert(sizeof(int64_t) == sizeof(double), "");
     int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0);
     return *(double*)(int64_t*)(&intRes);
 #else
@@ -342,7 +342,7 @@ void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
     *valueRef = value;
 #else
 #ifdef FEATURE_NATIVEAOT
-    static_assert(sizeof(int64_t) == sizeof(double));
+    static_assert(sizeof(int64_t) == sizeof(double), "");
     PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value);
 #else
     InterlockedExchangeT(valueRef, value);

From e09fa54dd976b2e63fed36dee54026e7b328d403 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 26 Jun 2024 13:26:54 -0700
Subject: [PATCH 15/23] Add PerformMeasurement call

---
 src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index 0c6454407073e7..d05b880fe9b730 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -55,6 +55,11 @@ uint32_t WINAPI FinalizerStart(void* pContext)
     UInt32_BOOL fResult = PalSetEvent(hFinalizerEvent);
     ASSERT(fResult);
 
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        YieldProcessorNormalization::PerformMeasurement();
+    }
+
     // Run the managed portion of the finalizer. This call will never return.
 
     ProcessFinalizers();

From 73f3fef2ce0e03c8cec24bc227b20d02bd957ed5 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 26 Jun 2024 15:17:00 -0700
Subject: [PATCH 16/23] Add YieldProcessorMeasurement

---
 .../nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst   | 1 +
 src/coreclr/vm/yieldprocessornormalizedshared.cpp            | 5 -----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
index 901af659ff84b6..0f4c932719a399 100644
--- a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
+++ b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
@@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount
 ThreadRunning
 WaitHandleWaitStart
 WaitHandleWaitStop
+YieldProcessorMeasurement
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 7e2a6caf548d30..8aebd1959bd8cc 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -211,9 +211,7 @@ void YieldProcessorNormalization::PerformMeasurement()
         AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
     }
 
-#ifndef FEATURE_NATIVEAOT
     FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-#endif
 
     // Calculate the number of yields required to span the duration of a normalized yield
     unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
@@ -280,8 +278,6 @@ void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
 #endif
 }
 
-// EventEnabledYieldProcessorMeasurement and FireEtwYieldProcessorMeasurement aren't available for AOT
-#ifndef FEATURE_NATIVEAOT
 void YieldProcessorNormalization::FireMeasurementEvents()
 {
     CONTRACTL
@@ -315,7 +311,6 @@ void YieldProcessorNormalization::FireMeasurementEvents()
         }
     }
 }
-#endif
 
 double YieldProcessorNormalization::AtomicLoad(double *valueRef)
 {

From e07035e60840a0b5ec38cc5d84fd1fa28c0ac815 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Wed, 26 Jun 2024 17:02:50 -0700
Subject: [PATCH 17/23] Nit

---
 src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h | 2 +-
 src/coreclr/vm/yieldprocessornormalizedshared.cpp        | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 766feab6e1d8ab..1fb290e41dcf29 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -43,7 +43,7 @@ FORCEINLINE int32_t PalInterlockedExchange(_Inout_ int32_t volatile *pDst, int32
 }
 
 EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
-#pragma intrinsic(_InterlockedExchange)
+#pragma intrinsic(_InterlockedExchange64)
 FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
 {
     return _InterlockedExchange64(pDst, iValue);
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 8aebd1959bd8cc..0d6464bbe43d42 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -183,12 +183,10 @@ void YieldProcessorNormalization::PerformMeasurement()
             {
                 AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
             }
-#ifndef FEATURE_NATIVEAOT
             if (i < NsPerYieldMeasurementCount - 1)
             {
                 FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
             }
-#endif
         }
     }
     else

From 9a76a1825731a78bf729457a33219565b86f9c45 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Thu, 27 Jun 2024 12:53:46 -0700
Subject: [PATCH 18/23] Fix PerformMeasurement

---
 .../nativeaot/Runtime/FinalizerHelpers.cpp        | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index d05b880fe9b730..ecafa5e4c75523 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -46,6 +46,11 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        YieldProcessorNormalization::PerformMeasurement();
+    }
+
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
@@ -55,11 +60,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)
     UInt32_BOOL fResult = PalSetEvent(hFinalizerEvent);
     ASSERT(fResult);
 
-    if (YieldProcessorNormalization::IsMeasurementScheduled())
-    {
-        YieldProcessorNormalization::PerformMeasurement();
-    }
-
     // Run the managed portion of the finalizer. This call will never return.
 
     ProcessFinalizers();
@@ -186,6 +186,11 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest()
 // Indicate that the current round of finalizations is complete.
 EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
 {
+    if (YieldProcessorNormalization::IsMeasurementScheduled())
+    {
+        YieldProcessorNormalization::PerformMeasurement();
+    }
+
     FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
     g_FinalizerDoneEvent.Set();
 }

From 6519b0b3e8a9636bfba54d755c9d3a56b8278153 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Thu, 27 Jun 2024 13:19:44 -0700
Subject: [PATCH 19/23] Move PerformMeasurement

---
 src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index ecafa5e4c75523..cf289143afbc91 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -186,13 +186,13 @@ EXTERN_C UInt32_BOOL QCALLTYPE RhpWaitForFinalizerRequest()
 // Indicate that the current round of finalizations is complete.
 EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
 {
+    FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
+    g_FinalizerDoneEvent.Set();
+
     if (YieldProcessorNormalization::IsMeasurementScheduled())
     {
         YieldProcessorNormalization::PerformMeasurement();
     }
-
-    FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
-    g_FinalizerDoneEvent.Set();
 }
 
 //

From 9606eb93142b9c3dc0dbd60033f0b146078adb94 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 2 Jul 2024 14:40:44 -0700
Subject: [PATCH 20/23] Fix PalInterlockedExchange64

---
 .../Runtime/windows/PalRedhawkInline.h        | 27 ++++++++++++++-----
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 1fb290e41dcf29..0f05e6143b3a48 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -42,13 +42,6 @@ FORCEINLINE int32_t PalInterlockedExchange(_Inout_ int32_t volatile *pDst, int32
     return _InterlockedExchange((long volatile *)pDst, iValue);
 }
 
-EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
-#pragma intrinsic(_InterlockedExchange64)
-FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
-{
-    return _InterlockedExchange64(pDst, iValue);
-}
-
 EXTERN_C long __PN__MACHINECALL_CDECL_OR_DEFAULT _InterlockedCompareExchange(long volatile *, long, long);
 #pragma intrinsic(_InterlockedCompareExchange)
 FORCEINLINE int32_t PalInterlockedCompareExchange(_Inout_ int32_t volatile *pDst, int32_t iValue, int32_t iComparand)
@@ -63,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
     return _InterlockedCompareExchange64(pDst, iValue, iComparand);
 }
 
+#ifdef HOST_X86
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{   // HOST_X86 variant: performs the 64-bit exchange as a compare-exchange retry loop and returns the prior value.
+    int64_t iOldValue; // value observed at *pDst that the successful compare-exchange replaced
+    do {
+        iOldValue = *pDst; // snapshot the current value to use as the comparand
+    } while (PalInterlockedCompareExchange64(pDst, // was 'Target', which is not declared anywhere in this function
+                                          iValue,
+                                          iOldValue) != iOldValue);
+    return iOldValue;
+}
+#else // HOST_X86
+EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
+#pragma intrinsic(_InterlockedExchange64)
+FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
+{
+    return _InterlockedExchange64(pDst, iValue);
+}
+#endif // HOST_X86
+
 #if defined(HOST_AMD64) || defined(HOST_ARM64)
 EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
 #pragma intrinsic(_InterlockedCompareExchange128)

From 234d61bace73373cc0288b5aee4f032a534314f5 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 2 Jul 2024 15:42:20 -0700
Subject: [PATCH 21/23] PR comments

---
 src/coreclr/nativeaot/Runtime/Crst.h                       | 1 -
 src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp         | 5 -----
 src/coreclr/nativeaot/Runtime/MiscHelpers.cpp              | 2 --
 src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp | 2 ++
 src/coreclr/utilcode/yieldprocessornormalized.cpp          | 3 ---
 src/coreclr/vm/synch.h                                     | 2 --
 src/coreclr/vm/yieldprocessornormalizedshared.cpp          | 7 +------
 7 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h
index 31bf8fde9eec8a..4ab9db08e0f5e3 100644
--- a/src/coreclr/nativeaot/Runtime/Crst.h
+++ b/src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,7 +20,6 @@ enum CrstType
     CrstRestrictedCallouts,
     CrstGcStressControl,
     CrstThreadStore,
-    CrstYieldProcessorNormalized,
     CrstEventPipe,
     CrstEventPipeConfig,
     CrstGcEvent,
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index cf289143afbc91..3af6a3fbf21751 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -46,11 +46,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
-    if (YieldProcessorNormalization::IsMeasurementScheduled())
-    {
-        YieldProcessorNormalization::PerformMeasurement();
-    }
-
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
diff --git a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
index f6a5d809db9e03..c5bbcc22842776 100644
--- a/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/MiscHelpers.cpp
@@ -37,8 +37,6 @@
 #include "RhConfig.h"
 #include <minipal/cpuid.h>
 
-#include "../../utilcode/yieldprocessornormalized.cpp"
-
 FCIMPL0(void, RhDebugBreak)
 {
     PalDebugBreak();
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
index 2e8a7b8a8ad461..efaf4e8bb20704 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,4 +15,6 @@
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
 
+#include "../../utilcode/yieldprocessornormalized.cpp"
+
 #include "../../vm/yieldprocessornormalizedshared.cpp"
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index 92d6ef943c9d7f..c6aaaa19557fa7 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -1,9 +1,6 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#ifndef FEATURE_NATIVEAOT
-#include "stdafx.h"
-#endif
 #include "yieldprocessornormalized.h"
 
 bool YieldProcessorNormalization::s_isMeasurementScheduled;
diff --git a/src/coreclr/vm/synch.h b/src/coreclr/vm/synch.h
index d07d89b2d5f772..72e19f1c33b602 100644
--- a/src/coreclr/vm/synch.h
+++ b/src/coreclr/vm/synch.h
@@ -134,7 +134,6 @@ class CLREvent : public CLREventBase
 };
 
 
-#ifndef FEATURE_NATIVEAOT
 // CLREventStatic
 //   Same as CLREvent, but intended to be used for global variables.
 //   Instances may leak their handle, because of the order in which
@@ -143,7 +142,6 @@ class CLREvent : public CLREventBase
 class CLREventStatic : public CLREventBase
 {
 };
-#endif
 
 BOOL CLREventWaitWithTry(CLREventBase *pEvent, DWORD timeout, BOOL fAlertable, DWORD *pStatus);
 #endif
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index 0d6464bbe43d42..ff1343d4fce9ae 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -1,10 +1,6 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#ifndef FEATURE_NATIVEAOT
-#include "finalizerthread.h"
-#endif
-
 enum class NormalizationState : uint8_t
 {
     Uninitialized,
@@ -25,8 +21,6 @@ static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
 static int s_nextMeasurementIndex;
 static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
 
-static LARGE_INTEGER li;
-
 void RhEnableFinalization();
 
 inline unsigned int GetTickCountPortable()
@@ -43,6 +37,7 @@ static uint64_t GetPerformanceCounter()
 #ifdef FEATURE_NATIVEAOT
     return PalQueryPerformanceCounter();
 #else
+    LARGE_INTEGER li;
     QueryPerformanceCounter(&li);
     return li.QuadPart;
 #endif

From 51c457344d84932985edfb9218ee8e977e192a8b Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 2 Jul 2024 17:04:34 -0700
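Subject: [PATCH 22/23] Fix build

Declare a local LARGE_INTEGER in the non-NativeAOT branch of
YieldProcessorNormalization::PerformMeasurement(). The file-scope `li`
that QueryPerformanceFrequency() wrote into was removed earlier in this
series, which left that branch referring to an undeclared variable.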
---
 src/coreclr/vm/yieldprocessornormalizedshared.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
index ff1343d4fce9ae..05daee21947376 100644
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
@@ -157,6 +157,7 @@ void YieldProcessorNormalization::PerformMeasurement()
 #ifdef FEATURE_NATIVEAOT
         if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000)
 #else
+        LARGE_INTEGER li;
         if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
 #endif
         {

From e8e1290d0b4985111cd985be31e377fd403b16c6 Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Tue, 2 Jul 2024 17:43:13 -0700
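Subject: [PATCH 23/23] Fix PalInterlocked

In the Windows PalInterlockedExchange64 retry loop, pass `pDst` to
PalInterlockedCompareExchange64 instead of `Target`, so the
compare-exchange operates on the same location the old value was read
from (`*pDst`).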
---
 src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 0f05e6143b3a48..1f2a74dcd15100 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -62,7 +62,7 @@ FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int
     int64_t iOldValue;
     do {
         iOldValue = *pDst;
-    } while (PalInterlockedCompareExchange64(Target,
+    } while (PalInterlockedCompareExchange64(pDst,
                                           iValue,
                                           iOldValue) != iOldValue);
     return iOldValue;