
Commit b94bd12

Auto merge of #142978 - Kobzol:query-hit, r=oli-obk
Add new self-profiling event to cheaply aggregate query cache hit counts

Self-profiling can record various kinds of events, some of which are not enabled by default, such as query cache hits. Rustc currently records cache hits as "instant" measureme events, which record the thread ID and the current timestamp and construct an individual event for each such cache hit. This is incredibly expensive: in a small hello-world benchmark that just depends on serde, it makes compilation with nightly go from ~3s (with `-Zself-profile`) to ~15s (with `-Zself-profile -Zself-profile-events=default,query-cache-hit`).

We'd like to add query cache hits to rustc-perf (rust-lang/rustc-perf#2168), but there we only need the actual cache hit counts, not the timestamp/thread ID metadata associated with them.

This PR adds a new `query-cache-hit-count` event. Instead of generating individual instant events, it simply aggregates cache hit counts per *query invocation* (so a combination of a query and its arguments, if I understand it correctly) using an atomic counter. At the end of the compilation session, these counts are dumped to the self-profile log using integer events (in a similar fashion to how we record artifact sizes). I suppose we could dedup the query invocations in rustc directly, but I don't think it's really required. In local experiments with the hello world + serde case, the query invocation records generated ~30 KiB more data in the self-profile, a ~10% increase in this case.

With this PR, the overhead of `-Zself-profile` seems to be the same as before, at least on my machine, so I also enabled query cache hit counts by default when self-profiling is enabled.

We should also modify `analyzeme`, specifically [this](https://github.com/rust-lang/measureme/blob/master/analyzeme/src/analysis.rs#L139), and make it load the integer events with query cache hit counts. I can do that as a follow-up; it's not required to be done in sync with this PR, and it doesn't require changes in rustc.

CC `@cjgillot`

r? `@oli-obk`
2 parents f51c987 + b49ca02 commit b94bd12
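As a rough, standalone illustration of the trade-off described in the commit message, the difference between the two recording strategies boils down to the sketch below. The types and function names are made up for the example; they are not measureme's or rustc's API.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

// Hypothetical stand-in for an "instant" event: every cache hit records
// per-hit metadata (thread ID and timestamp) as its own entry.
struct InstantEvent {
    thread_id: u32,
    timestamp: Instant,
}

// Old approach: each cache hit constructs and stores a full event record.
fn record_instant_hit(log: &mut Vec<InstantEvent>, thread_id: u32) {
    log.push(InstantEvent { thread_id, timestamp: Instant::now() });
}

// New approach: each cache hit is a single relaxed atomic increment; the
// per-invocation totals are written out once, at the end of the session.
fn record_counted_hit(counter: &AtomicU64) {
    counter.fetch_add(1, Ordering::Relaxed);
}
```

The second variant loses the per-hit timestamps and thread IDs, which is exactly the trade-off the PR makes for the default configuration.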

2 files changed: 94 additions and 7 deletions


compiler/rustc_data_structures/src/profiling.rs

Lines changed: 93 additions & 7 deletions
@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::Ordering;
 use std::time::{Duration, Instant};
 use std::{fs, process};

@@ -99,12 +100,15 @@ use tracing::warn;

 use crate::fx::FxHashMap;
 use crate::outline;
+use crate::sync::AtomicU64;

 bitflags::bitflags! {
     #[derive(Clone, Copy)]
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// per each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +117,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;

         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
                         Self::QUERY_PROVIDERS.bits() |
                         Self::QUERY_BLOCKED.bits() |
                         Self::INCR_CACHE_LOADS.bits() |
                         Self::INCR_RESULT_HASHING.bits() |
-                        Self::ARTIFACT_SIZES.bits();
+                        Self::ARTIFACT_SIZES.bits() |
+                        Self::QUERY_CACHE_HIT_COUNTS.bits();

         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }

@@ -134,6 +142,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HITS),
     ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +420,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }

-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
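A side note on the hot-path check above: `intersects` is true if *any* bit of the combined mask is set, while `contains` requires *all* of them, so a single `intersects` test on `QUERY_CACHE_HIT_COMBINED` covers both event kinds with one branch. Below is a minimal sketch of that distinction using the `bitflags` crate directly, outside rustc, with only the relevant flags.

```rust
use bitflags::bitflags;

bitflags! {
    #[derive(Clone, Copy)]
    struct EventFilter: u16 {
        const QUERY_CACHE_HITS = 1 << 2;
        const QUERY_CACHE_HIT_COUNTS = 1 << 10;
        const QUERY_CACHE_HIT_COMBINED =
            Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
    }
}

fn main() {
    // Only the aggregated-count flag is enabled (the default configuration).
    let mask = EventFilter::QUERY_CACHE_HIT_COUNTS;
    // `intersects` fires because at least one bit of the combined mask is set...
    assert!(mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED));
    // ...whereas `contains` would require both bits to be set.
    assert!(!mask.contains(EventFilter::QUERY_CACHE_HIT_COMBINED));
}
```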
@@ -489,6 +509,35 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }

+    /// Store query cache hits to the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not **per query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let hit_count = hit_count.load(Ordering::Relaxed);
+                // No need to record empty cache hit counts
+                if hit_count > 0 {
+                    let event_id =
+                        builder.from_label(StringId::new_virtual(query_invocation as u64));
+                    profiler.profiler.record_integer_event(
+                        profiler.query_cache_hit_count_event_kind,
+                        event_id,
+                        thread_id,
+                        hit_count,
+                    );
+                }
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
@@ -537,13 +586,28 @@ pub struct SelfProfiler {

     string_cache: RwLock<FxHashMap<String, StringId>>,

+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
     incremental_result_hashing_event_kind: StringId,
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation
+    query_cache_hit_count_event_kind: StringId,
 }

 impl SelfProfiler {
@@ -573,6 +637,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");

         let mut event_filter_mask = EventFilter::empty();

@@ -618,6 +683,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }

@@ -627,6 +694,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }

+    /// Store a cache hit of a query invocation
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
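The fast-path/slow-path split in `increment_query_cache_hit_counters` relies on rustc's upgradable read lock (`upgradable_read` plus `with_upgraded`). As a rough std-only analogue, here is a sketch built around a hypothetical `HitCounters` type: std's `RwLock` has no upgradable guard, so the slow path re-acquires the lock for writing and re-checks the length before growing the vector.

```rust
use std::sync::RwLock;
use std::sync::atomic::{AtomicU64, Ordering};

// Hypothetical type for illustration; not rustc's SelfProfiler.
struct HitCounters {
    counts: RwLock<Vec<AtomicU64>>,
}

impl HitCounters {
    fn increment(&self, index: usize) {
        // Fast path: the slot already exists, so a shared lock plus a relaxed
        // atomic increment is enough.
        {
            let counts = self.counts.read().unwrap();
            if let Some(slot) = counts.get(index) {
                slot.fetch_add(1, Ordering::Relaxed);
                return;
            }
        }
        // Slow path: grow the vector under an exclusive lock; re-check the length,
        // since another thread may have grown it in the meantime.
        let mut counts = self.counts.write().unwrap();
        if counts.len() <= index {
            counts.resize_with(index + 1, || AtomicU64::new(0));
        }
        counts[index].fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    let counters = HitCounters { counts: RwLock::new(Vec::new()) };
    counters.increment(3);
    counters.increment(3);
    assert_eq!(counters.counts.read().unwrap()[3].load(Ordering::Relaxed), 2);
}
```

Relaxed ordering suffices because the counters are independent and the totals are only read once all increments are done, when they are dumped at the end of the session.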

compiler/rustc_query_impl/src/profiling_support.rs

Lines changed: 1 addition & 0 deletions
@@ -259,4 +259,5 @@ pub fn alloc_self_profile_query_strings(tcx: TyCtxt<'_>) {
     for alloc in super::ALLOC_SELF_PROFILE_QUERY_STRINGS.iter() {
         alloc(tcx, &mut string_cache)
     }
+    tcx.sess.prof.store_query_cache_hits();
 }
