
Commit 9e63020

Stephane Eranian authored and Ingo Molnar committed
perf: Use hrtimers for event multiplexing
The current scheme of using the timer tick was fine for per-thread events. However, it was causing bias issues in system-wide mode (including for uncore PMUs): event groups would not get their fair share of runtime on the PMU. With tickless kernels, if a core is idle there is no timer tick, and thus no event rotation (multiplexing). However, there are events (especially uncore events) which do count even while cores are asleep.

This patch changes the timer source for multiplexing. It introduces a per-PMU per-CPU hrtimer. The advantage is that even when a core goes idle, it will come back to service the hrtimer, so multiplexing of system-wide events works much better.

The per-PMU implementation (suggested by PeterZ) enables adjusting the multiplexing interval per PMU. The preferred interval is stashed into the struct pmu. If not set, it is forced to the default interval value.

To minimize the impact of the hrtimer, it is turned on and off on demand. When the PMU on a CPU is overcommitted, the hrtimer is activated; it is stopped when the PMU is not overcommitted. For this to work properly, the order of initialization in start_kernel() had to be changed so that hrtimers_init() runs before perf_event_init().

The default interval in milliseconds is set to a timer tick, just as in the old code. A sysctl to tune it will be provided in a separate patch.

Signed-off-by: Stephane Eranian <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent ab57384 commit 9e63020
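
A quick worked example of the default interval described above (not part of the commit itself): the multiplexing interval defaults to one timer tick expressed in milliseconds, so a HZ=1000 kernel rotates roughly every 1 ms and a HZ=250 kernel every 4 ms. The sketch below only restates the conversion performed by __perf_cpu_hrtimer_init() in the diff; the helper name demo_default_interval() is hypothetical.

#include <linux/ktime.h>

/* one timer tick, in milliseconds, mirroring the patch's PERF_CPU_HRTIMER */
#define PERF_CPU_HRTIMER (1000 / HZ)

/* hypothetical helper: same ktime conversion as __perf_cpu_hrtimer_init() */
static ktime_t demo_default_interval(void)
{
	return ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER);
}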

File tree

include/linux/perf_event.h
init/main.c
kernel/events/core.c

3 files changed: 109 additions, 10 deletions

include/linux/perf_event.h (2 additions, 1 deletion)

@@ -501,8 +501,9 @@ struct perf_cpu_context {
 	struct perf_event_context *task_ctx;
 	int active_oncpu;
 	int exclusive;
+	struct hrtimer hrtimer;
+	ktime_t hrtimer_interval;
 	struct list_head rotation_list;
-	int jiffies_interval;
 	struct pmu *unique_pmu;
 	struct perf_cgroup *cgrp;
 };

init/main.c (1 addition, 1 deletion)

@@ -542,7 +542,6 @@ asmlinkage void __init start_kernel(void)
 	if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
 		local_irq_disable();
 	idr_init_cache();
-	perf_event_init();
 	rcu_init();
 	tick_nohz_init();
 	radix_tree_init();
@@ -555,6 +554,7 @@ asmlinkage void __init start_kernel(void)
 	softirq_init();
 	timekeeping_init();
 	time_init();
+	perf_event_init();
 	profile_init();
 	call_function_init();
 	WARN(!irqs_disabled(), "Interrupts were enabled early\n");

kernel/events/core.c (106 additions, 8 deletions)
@@ -170,6 +170,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly =
 	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -658,6 +660,98 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disabled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+	struct perf_cpu_context *cpuctx;
+	enum hrtimer_restart ret = HRTIMER_NORESTART;
+	int rotations = 0;
+
+	WARN_ON(!irqs_disabled());
+
+	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+	rotations = perf_rotate_context(cpuctx);
+
+	/*
+	 * arm timer if needed
+	 */
+	if (rotations) {
+		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+		ret = HRTIMER_RESTART;
+	}
+
+	return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (WARN_ON(cpu != smp_processor_id()))
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		if (pmu->task_ctx_nr == perf_sw_context)
+			continue;
+
+		hrtimer_cancel(&cpuctx->hrtimer);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	/* no multiplexing needed for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	cpuctx->hrtimer_interval =
+		ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER);
+
+	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	/* not for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	if (hrtimer_active(hr))
+		return;
+
+	if (!hrtimer_callback_running(hr))
+		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+					 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1506,6 +1600,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
+		perf_cpu_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1552,6 +1647,8 @@ group_sched_in(struct perf_event *group_event,
 
 	pmu->cancel_txn(pmu);
 
+	perf_cpu_hrtimer_restart(cpuctx);
+
 	return -EAGAIN;
 }
 
@@ -1807,8 +1904,10 @@ static int __perf_event_enable(void *info)
 	 * If this event can't go on and it's part of a
 	 * group, then the whole group has to come off.
 	 */
-	if (leader != event)
+	if (leader != event) {
 		group_sched_out(leader, cpuctx, ctx);
+		perf_cpu_hrtimer_restart(cpuctx);
+	}
 	if (leader->attr.pinned) {
 		update_group_times(leader);
 		leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2654,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0, remove = 1;
@@ -2594,6 +2693,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
 	if (remove)
 		list_del_init(&cpuctx->rotation_list);
+
+	return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2726,6 @@ void perf_event_task_tick(void)
 		ctx = cpuctx->task_ctx;
 		if (ctx)
 			perf_adjust_freq_unthr_context(ctx, throttled);
-
-		if (cpuctx->jiffies_interval == 1 ||
-		    !(jiffies % cpuctx->jiffies_interval))
-			perf_rotate_context(cpuctx);
 	}
 }
 
@@ -6001,7 +6098,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
-		cpuctx->jiffies_interval = 1;
+
+		__perf_cpu_hrtimer_init(cpuctx, cpu);
+
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
 		cpuctx->unique_pmu = pmu;
 	}
@@ -7387,7 +7486,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE:
 		perf_event_exit_cpu(cpu);
 		break;
-
 	default:
 		break;
 	}
