
Commit b94bd12

Auto merge of #142978 - Kobzol:query-hit, r=oli-obk
Add new self-profiling event to cheaply aggregate query cache hit counts

Self-profiling can record various kinds of events, some of which are not enabled by default, such as query cache hits. Rustc currently records cache hits as "instant" measureme events, which record the thread ID and the current timestamp and construct an individual event for each such cache hit. This is incredibly expensive: in a small hello-world benchmark that just depends on serde, it makes compilation with nightly go from ~3s (with `-Zself-profile`) to ~15s (with `-Zself-profile -Zself-profile-events=default,query-cache-hit`).

We'd like to add query cache hits to rustc-perf (rust-lang/rustc-perf#2168), but there we only need the actual cache hit counts, not the timestamp/thread ID metadata associated with them.

This PR adds a new `query-cache-hit-count` event. Instead of generating individual instant events, it simply aggregates cache hit counts per *query invocation* (so a combination of a query and its arguments, if I understand it correctly) using an atomic counter. At the end of the compilation session, these counts are dumped to the self-profile log using integer events (in a similar fashion to how we record artifact sizes). I suppose we could dedup the query invocations in rustc directly, but I don't think it's really required. In local experiments with the hello world + serde case, the query invocation records generated ~30 KiB more data in the self-profile, a ~10% increase in this case.

With this PR, the overhead of `-Zself-profile` seems to be the same as before, at least on my machine, so I also enabled query cache hit counts by default when self-profiling is enabled.

We should also modify `analyzeme`, specifically [this](https://github.com/rust-lang/measureme/blob/master/analyzeme/src/analysis.rs#L139), and make it load the integer events with query cache hit counts. I can do that as a follow-up; it's not required to be done in sync with this PR, and it doesn't require changes in rustc.

CC `@cjgillot`

r? `@oli-obk`
2 parents f51c987 + b49ca02 commit b94bd12
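As a rough, standalone illustration of the trade-off described in the commit message, the difference between the two recording strategies boils down to the sketch below. The types and function names are made up for the example; they are not measureme's or rustc's API.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

// Hypothetical stand-in for an "instant" event: every cache hit records
// per-hit metadata (thread ID and timestamp) as its own entry.
struct InstantEvent {
    thread_id: u32,
    timestamp: Instant,
}

// Old approach: each cache hit constructs and stores a full event record.
fn record_instant_hit(log: &mut Vec<InstantEvent>, thread_id: u32) {
    log.push(InstantEvent { thread_id, timestamp: Instant::now() });
}

// New approach: each cache hit is a single relaxed atomic increment; the
// per-invocation totals are written out once, at the end of the session.
fn record_counted_hit(counter: &AtomicU64) {
    counter.fetch_add(1, Ordering::Relaxed);
}
```

The second variant loses the per-hit timestamps and thread IDs, which is exactly the trade-off the PR makes for the default configuration.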

2 files changed: 94 additions and 7 deletions


compiler/rustc_data_structures/src/profiling.rs

Lines changed: 93 additions & 7 deletions
@@ -88,6 +88,7 @@ use std::fmt::Display;
 use std::intrinsics::unlikely;
 use std::path::Path;
 use std::sync::Arc;
+use std::sync::atomic::Ordering;
 use std::time::{Duration, Instant};
 use std::{fs, process};

@@ -99,12 +100,15 @@ use tracing::warn;

 use crate::fx::FxHashMap;
 use crate::outline;
+use crate::sync::AtomicU64;

 bitflags::bitflags! {
     #[derive(Clone, Copy)]
     struct EventFilter: u16 {
         const GENERIC_ACTIVITIES = 1 << 0;
         const QUERY_PROVIDERS = 1 << 1;
+        /// Store detailed instant events, including timestamp and thread ID,
+        /// per each query cache hit. Note that this is quite expensive.
         const QUERY_CACHE_HITS = 1 << 2;
         const QUERY_BLOCKED = 1 << 3;
         const INCR_CACHE_LOADS = 1 << 4;
@@ -113,16 +117,20 @@ bitflags::bitflags! {
         const FUNCTION_ARGS = 1 << 6;
         const LLVM = 1 << 7;
         const INCR_RESULT_HASHING = 1 << 8;
-        const ARTIFACT_SIZES = 1 << 9;
+        const ARTIFACT_SIZES = 1 << 9;
+        /// Store aggregated counts of cache hits per query invocation.
+        const QUERY_CACHE_HIT_COUNTS = 1 << 10;

         const DEFAULT = Self::GENERIC_ACTIVITIES.bits() |
                         Self::QUERY_PROVIDERS.bits() |
                         Self::QUERY_BLOCKED.bits() |
                         Self::INCR_CACHE_LOADS.bits() |
                         Self::INCR_RESULT_HASHING.bits() |
-                        Self::ARTIFACT_SIZES.bits();
+                        Self::ARTIFACT_SIZES.bits() |
+                        Self::QUERY_CACHE_HIT_COUNTS.bits();

         const ARGS = Self::QUERY_KEYS.bits() | Self::FUNCTION_ARGS.bits();
+        const QUERY_CACHE_HIT_COMBINED = Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
     }
 }

@@ -134,6 +142,7 @@ const EVENT_FILTERS_BY_NAME: &[(&str, EventFilter)] = &[
     ("generic-activity", EventFilter::GENERIC_ACTIVITIES),
     ("query-provider", EventFilter::QUERY_PROVIDERS),
     ("query-cache-hit", EventFilter::QUERY_CACHE_HITS),
+    ("query-cache-hit-count", EventFilter::QUERY_CACHE_HITS),
     ("query-blocked", EventFilter::QUERY_BLOCKED),
     ("incr-cache-load", EventFilter::INCR_CACHE_LOADS),
     ("query-keys", EventFilter::QUERY_KEYS),
@@ -411,13 +420,24 @@ impl SelfProfilerRef {
         #[inline(never)]
         #[cold]
         fn cold_call(profiler_ref: &SelfProfilerRef, query_invocation_id: QueryInvocationId) {
-            profiler_ref.instant_query_event(
-                |profiler| profiler.query_cache_hit_event_kind,
-                query_invocation_id,
-            );
+            if profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+                profiler_ref
+                    .profiler
+                    .as_ref()
+                    .unwrap()
+                    .increment_query_cache_hit_counters(QueryInvocationId(query_invocation_id.0));
+            }
+            if unlikely(profiler_ref.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+                profiler_ref.instant_query_event(
+                    |profiler| profiler.query_cache_hit_event_kind,
+                    query_invocation_id,
+                );
+            }
         }

-        if unlikely(self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HITS)) {
+        // We check both kinds of query cache hit events at once, to reduce overhead in the
+        // common case (with self-profile disabled).
+        if unlikely(self.event_filter_mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED)) {
             cold_call(self, query_invocation_id);
         }
     }
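A side note on the hot-path check above: `intersects` is true if *any* bit of the combined mask is set, while `contains` requires *all* of them, so a single `intersects` test on `QUERY_CACHE_HIT_COMBINED` covers both event kinds with one branch. Below is a minimal sketch of that distinction using the `bitflags` crate directly, outside rustc, with only the relevant flags.

```rust
use bitflags::bitflags;

bitflags! {
    #[derive(Clone, Copy)]
    struct EventFilter: u16 {
        const QUERY_CACHE_HITS = 1 << 2;
        const QUERY_CACHE_HIT_COUNTS = 1 << 10;
        const QUERY_CACHE_HIT_COMBINED =
            Self::QUERY_CACHE_HITS.bits() | Self::QUERY_CACHE_HIT_COUNTS.bits();
    }
}

fn main() {
    // Only the aggregated-count flag is enabled (the default configuration).
    let mask = EventFilter::QUERY_CACHE_HIT_COUNTS;
    // `intersects` fires because at least one bit of the combined mask is set...
    assert!(mask.intersects(EventFilter::QUERY_CACHE_HIT_COMBINED));
    // ...whereas `contains` would require both bits to be set.
    assert!(!mask.contains(EventFilter::QUERY_CACHE_HIT_COMBINED));
}
```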
@@ -489,6 +509,35 @@ impl SelfProfilerRef {
         self.profiler.as_ref().map(|p| p.get_or_alloc_cached_string(s))
     }

+    /// Store query cache hits to the self-profile log.
+    /// Should be called once at the end of the compilation session.
+    ///
+    /// The cache hits are stored per **query invocation**, not **per query kind/type**.
+    /// `analyzeme` can later deduplicate individual query labels from the QueryInvocationId event
+    /// IDs.
+    pub fn store_query_cache_hits(&self) {
+        if self.event_filter_mask.contains(EventFilter::QUERY_CACHE_HIT_COUNTS) {
+            let profiler = self.profiler.as_ref().unwrap();
+            let query_hits = profiler.query_hits.read();
+            let builder = EventIdBuilder::new(&profiler.profiler);
+            let thread_id = get_thread_id();
+            for (query_invocation, hit_count) in query_hits.iter().enumerate() {
+                let hit_count = hit_count.load(Ordering::Relaxed);
+                // No need to record empty cache hit counts
+                if hit_count > 0 {
+                    let event_id =
+                        builder.from_label(StringId::new_virtual(query_invocation as u64));
+                    profiler.profiler.record_integer_event(
+                        profiler.query_cache_hit_count_event_kind,
+                        event_id,
+                        thread_id,
+                        hit_count,
+                    );
+                }
+            }
+        }
+    }
+
     #[inline]
     pub fn enabled(&self) -> bool {
         self.profiler.is_some()
@@ -537,13 +586,28 @@ pub struct SelfProfiler {

     string_cache: RwLock<FxHashMap<String, StringId>>,

+    /// Recording individual query cache hits as "instant" measureme events
+    /// is incredibly expensive. Instead of doing that, we simply aggregate
+    /// cache hit *counts* per query invocation, and then store the final count
+    /// of cache hits per invocation at the end of the compilation session.
+    ///
+    /// With this approach, we don't know the individual thread IDs and timestamps
+    /// of cache hits, but it has very little overhead on top of `-Zself-profile`.
+    /// Recording the cache hits as individual events made compilation 3-5x slower.
+    ///
+    /// Query invocation IDs should be monotonic integers, so we can store them in a vec,
+    /// rather than using a hashmap.
+    query_hits: RwLock<Vec<AtomicU64>>,
+
     query_event_kind: StringId,
     generic_activity_event_kind: StringId,
     incremental_load_result_event_kind: StringId,
     incremental_result_hashing_event_kind: StringId,
     query_blocked_event_kind: StringId,
     query_cache_hit_event_kind: StringId,
     artifact_size_event_kind: StringId,
+    /// Total cache hits per query invocation
+    query_cache_hit_count_event_kind: StringId,
 }

 impl SelfProfiler {
@@ -573,6 +637,7 @@ impl SelfProfiler {
         let query_blocked_event_kind = profiler.alloc_string("QueryBlocked");
         let query_cache_hit_event_kind = profiler.alloc_string("QueryCacheHit");
         let artifact_size_event_kind = profiler.alloc_string("ArtifactSize");
+        let query_cache_hit_count_event_kind = profiler.alloc_string("QueryCacheHitCount");

         let mut event_filter_mask = EventFilter::empty();

@@ -618,6 +683,8 @@ impl SelfProfiler {
             query_blocked_event_kind,
             query_cache_hit_event_kind,
             artifact_size_event_kind,
+            query_cache_hit_count_event_kind,
+            query_hits: Default::default(),
         })
     }

@@ -627,6 +694,25 @@ impl SelfProfiler {
         self.profiler.alloc_string(s)
     }

+    /// Store a cache hit of a query invocation
+    pub fn increment_query_cache_hit_counters(&self, id: QueryInvocationId) {
+        // Fast path: assume that the query was already encountered before, and just record
+        // a cache hit.
+        let mut guard = self.query_hits.upgradable_read();
+        let query_hits = &guard;
+        let index = id.0 as usize;
+        if index < query_hits.len() {
+            // We only want to increment the count, no other synchronization is required
+            query_hits[index].fetch_add(1, Ordering::Relaxed);
+        } else {
+            // If not, we need to extend the query hit map to the highest observed ID
+            guard.with_upgraded(|vec| {
+                vec.resize_with(index + 1, || AtomicU64::new(0));
+                vec[index] = AtomicU64::from(1);
+            });
+        }
+    }
+
     /// Gets a `StringId` for the given string. This method makes sure that
     /// any strings going through it will only be allocated once in the
     /// profiling data.
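The fast-path/slow-path split in `increment_query_cache_hit_counters` relies on rustc's upgradable read lock (`upgradable_read` plus `with_upgraded`). As a rough std-only analogue, here is a sketch built around a hypothetical `HitCounters` type: std's `RwLock` has no upgradable guard, so the slow path re-acquires the lock for writing and re-checks the length before growing the vector.

```rust
use std::sync::RwLock;
use std::sync::atomic::{AtomicU64, Ordering};

// Hypothetical type for illustration; not rustc's SelfProfiler.
struct HitCounters {
    counts: RwLock<Vec<AtomicU64>>,
}

impl HitCounters {
    fn increment(&self, index: usize) {
        // Fast path: the slot already exists, so a shared lock plus a relaxed
        // atomic increment is enough.
        {
            let counts = self.counts.read().unwrap();
            if let Some(slot) = counts.get(index) {
                slot.fetch_add(1, Ordering::Relaxed);
                return;
            }
        }
        // Slow path: grow the vector under an exclusive lock; re-check the length,
        // since another thread may have grown it in the meantime.
        let mut counts = self.counts.write().unwrap();
        if counts.len() <= index {
            counts.resize_with(index + 1, || AtomicU64::new(0));
        }
        counts[index].fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    let counters = HitCounters { counts: RwLock::new(Vec::new()) };
    counters.increment(3);
    counters.increment(3);
    assert_eq!(counters.counts.read().unwrap()[3].load(Ordering::Relaxed), 2);
}
```

Relaxed ordering suffices because the counters are independent and the totals are only read once all increments are done, when they are dumped at the end of the session.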

compiler/rustc_query_impl/src/profiling_support.rs

Lines changed: 1 addition & 0 deletions
@@ -259,4 +259,5 @@ pub fn alloc_self_profile_query_strings(tcx: TyCtxt<'_>) {
     for alloc in super::ALLOC_SELF_PROFILE_QUERY_STRINGS.iter() {
         alloc(tcx, &mut string_cache)
     }
+    tcx.sess.prof.store_query_cache_hits();
 }
