Skip to content

Commit eed0152

Browse files
Stephane EranianIngo Molnar
authored andcommitted
perf_events: Fix time tracking in samples
This patch corrects time tracking in samples. Without this patch both time_enabled and time_running are bogus when user asks for PERF_SAMPLE_READ. One uses PERF_SAMPLE_READ to sample the values of other counters in each sample. Because of multiplexing, it is necessary to know both time_enabled, time_running to be able to scale counts correctly. In this second version of the patch, we maintain a shadow copy of ctx->time which allows us to compute ctx->time without calling update_context_time() from NMI context. We avoid the issue that update_context_time() must always be called with ctx->lock held. We do not keep shadow copies of the other event timings because if the lead event is overflowing then it is active and thus it's been scheduled in via event_sched_in() in which case neither tstamp_stopped, tstamp_running can be modified. This timing logic only applies to samples when PERF_SAMPLE_READ is used. Note that this patch does not address timing issues related to sampling inheritance between tasks. This will be addressed in a future patch. With this patch, the libpfm4 example task_smpl now reports correct counts (shown on 2.4GHz Core 2): $ task_smpl -p 2400000000 -e unhalted_core_cycles:u,instructions_retired:u,baclears noploop 5 noploop for 5 seconds IIP:0x000000004006d6 PID:5596 TID:5596 TIME:466,210,211,430 STREAM_ID:33 PERIOD:2,400,000,000 ENA=1,010,157,814 RUN=1,010,157,814 NR=3 2,400,000,254 unhalted_core_cycles:u (33) 2,399,273,744 instructions_retired:u (34) 53,340 baclears (35) Signed-off-by: Stephane Eranian <[email protected]> Signed-off-by: Peter Zijlstra <[email protected]> LKML-Reference: <[email protected]> Signed-off-by: Ingo Molnar <[email protected]>
1 parent 7e55055 commit eed0152

File tree

2 files changed

+44
-8
lines changed

2 files changed

+44
-8
lines changed

include/linux/perf_event.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,16 @@ struct perf_event {
747747
u64 tstamp_running;
748748
u64 tstamp_stopped;
749749

750+
/*
751+
* timestamp shadows the actual context timing but it can
752+
* be safely used in NMI interrupt context. It reflects the
753+
* context time as it was when the event was last scheduled in.
754+
*
755+
* ctx_time already accounts for ctx->timestamp. Therefore to
756+
* compute ctx_time for a sample, simply add perf_clock().
757+
*/
758+
u64 shadow_ctx_time;
759+
750760
struct perf_event_attr attr;
751761
struct hw_perf_event hw;
752762

kernel/perf_event.c

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,8 @@ event_sched_in(struct perf_event *event,
674674

675675
event->tstamp_running += ctx->time - event->tstamp_stopped;
676676

677+
event->shadow_ctx_time = ctx->time - ctx->timestamp;
678+
677679
if (!is_software_event(event))
678680
cpuctx->active_oncpu++;
679681
ctx->nr_active++;
@@ -3396,19 +3398,20 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
33963398
}
33973399

33983400
static void perf_output_read_one(struct perf_output_handle *handle,
3399-
struct perf_event *event)
3401+
struct perf_event *event,
3402+
u64 enabled, u64 running)
34003403
{
34013404
u64 read_format = event->attr.read_format;
34023405
u64 values[4];
34033406
int n = 0;
34043407

34053408
values[n++] = perf_event_count(event);
34063409
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3407-
values[n++] = event->total_time_enabled +
3410+
values[n++] = enabled +
34083411
atomic64_read(&event->child_total_time_enabled);
34093412
}
34103413
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3411-
values[n++] = event->total_time_running +
3414+
values[n++] = running +
34123415
atomic64_read(&event->child_total_time_running);
34133416
}
34143417
if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
34213424
* XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
34223425
*/
34233426
static void perf_output_read_group(struct perf_output_handle *handle,
3424-
struct perf_event *event)
3427+
struct perf_event *event,
3428+
u64 enabled, u64 running)
34253429
{
34263430
struct perf_event *leader = event->group_leader, *sub;
34273431
u64 read_format = event->attr.read_format;
@@ -3431,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
34313435
values[n++] = 1 + leader->nr_siblings;
34323436

34333437
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3434-
values[n++] = leader->total_time_enabled;
3438+
values[n++] = enabled;
34353439

34363440
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3437-
values[n++] = leader->total_time_running;
3441+
values[n++] = running;
34383442

34393443
if (leader != event)
34403444
leader->pmu->read(leader);
@@ -3459,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
34593463
}
34603464
}
34613465

3466+
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3467+
PERF_FORMAT_TOTAL_TIME_RUNNING)
3468+
34623469
static void perf_output_read(struct perf_output_handle *handle,
34633470
struct perf_event *event)
34643471
{
3472+
u64 enabled = 0, running = 0, now, ctx_time;
3473+
u64 read_format = event->attr.read_format;
3474+
3475+
/*
3476+
* compute total_time_enabled, total_time_running
3477+
* based on snapshot values taken when the event
3478+
* was last scheduled in.
3479+
*
3480+
* we cannot simply called update_context_time()
3481+
* because of locking issue as we are called in
3482+
* NMI context
3483+
*/
3484+
if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3485+
now = perf_clock();
3486+
ctx_time = event->shadow_ctx_time + now;
3487+
enabled = ctx_time - event->tstamp_enabled;
3488+
running = ctx_time - event->tstamp_running;
3489+
}
3490+
34653491
if (event->attr.read_format & PERF_FORMAT_GROUP)
3466-
perf_output_read_group(handle, event);
3492+
perf_output_read_group(handle, event, enabled, running);
34673493
else
3468-
perf_output_read_one(handle, event);
3494+
perf_output_read_one(handle, event, enabled, running);
34693495
}
34703496

34713497
void perf_output_sample(struct perf_output_handle *handle,

0 commit comments

Comments
 (0)