@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-			       struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
 		if (has_branch_stack(event)) {
 			static_key_slow_dec_deferred(&perf_sched_events);
 			/* is system-wide event */
-			if (!(event->attach_state & PERF_ATTACH_TASK))
+			if (!(event->attach_state & PERF_ATTACH_TASK)) {
 				atomic_dec(&per_cpu(perf_branch_stack_events,
 						    event->cpu));
+			}
 		}
 	}
 
 	if (event->rb) {
-		ring_buffer_put(event->rb);
-		event->rb = NULL;
+		struct ring_buffer *rb;
+
+		/*
+		 * Can happen when we close an event with re-directed output.
+		 *
+		 * Since we have a 0 refcount, perf_mmap_close() will skip
+		 * over us; possibly making our ring_buffer_put() the last.
+		 */
+		mutex_lock(&event->mmap_mutex);
+		rb = event->rb;
+		if (rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* could be last */
+		}
+		mutex_unlock(&event->mmap_mutex);
 	}
 
 	if (is_cgroup_event(event))
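/*
 * Annotation, not part of the patch: a minimal userspace sketch of the shape
 * used by the free_event() hunk above -- unpublish the buffer pointer under a
 * mutex, then drop a reference that may well be the last one (the list detach
 * is omitted here). All names (struct buf, struct evt, buf_put, evt_free) are
 * invented for illustration; the kernel uses event->mmap_mutex,
 * rcu_assign_pointer(), ring_buffer_detach() and ring_buffer_put().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct buf {
	atomic_int refcount;			/* stands in for rb->refcount */
};

struct evt {
	pthread_mutex_t mmap_mutex;		/* stands in for event->mmap_mutex */
	struct buf *rb;				/* stands in for event->rb */
};

static void buf_put(struct buf *b)
{
	/* Free only when the previous value was 1, i.e. we dropped the last ref. */
	if (atomic_fetch_sub(&b->refcount, 1) == 1)
		free(b);
}

static void evt_free(struct evt *e)
{
	struct buf *rb;

	/*
	 * Unpublish under the mutex, then put. A concurrent "close" that saw
	 * our refcount already at zero will have skipped us, so this put can
	 * be the one that actually frees the buffer.
	 */
	pthread_mutex_lock(&e->mmap_mutex);
	rb = e->rb;
	if (rb) {
		e->rb = NULL;
		buf_put(rb);			/* could be last */
	}
	pthread_mutex_unlock(&e->mmap_mutex);

	pthread_mutex_destroy(&e->mmap_mutex);
	free(e);
}

int main(void)
{
	struct evt *e = malloc(sizeof(*e));
	struct buf *b = malloc(sizeof(*b));

	pthread_mutex_init(&e->mmap_mutex, NULL);
	atomic_init(&b->refcount, 1);
	e->rb = b;

	evt_free(e);				/* drops the last buffer reference */
	return 0;
}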
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	unsigned int events = POLL_HUP;
 
 	/*
-	 * Race between perf_event_set_output() and perf_poll(): perf_poll()
-	 * grabs the rb reference but perf_event_set_output() overrides it.
-	 * Here is the timeline for two threads T1, T2:
-	 * t0: T1, rb = rcu_dereference(event->rb)
-	 * t1: T2, old_rb = event->rb
-	 * t2: T2, event->rb = new rb
-	 * t3: T2, ring_buffer_detach(old_rb)
-	 * t4: T1, ring_buffer_attach(rb1)
-	 * t5: T1, poll_wait(event->waitq)
-	 *
-	 * To avoid this problem, we grab mmap_mutex in perf_poll()
-	 * thereby ensuring that the assignment of the new ring buffer
-	 * and the detachment of the old buffer appear atomic to perf_poll()
+	 * Pin the event->rb by taking event->mmap_mutex; otherwise
+	 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
 	 */
 	mutex_lock(&event->mmap_mutex);
-
-	rcu_read_lock();
-	rb = rcu_dereference(event->rb);
-	if (rb) {
-		ring_buffer_attach(event, rb);
+	rb = event->rb;
+	if (rb)
 		events = atomic_xchg(&rb->poll, 0);
-	}
-	rcu_read_unlock();
-
 	mutex_unlock(&event->mmap_mutex);
 
 	poll_wait(file, &event->waitq, wait);
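/*
 * Annotation, not part of the patch: the perf_poll() hunk above replaces an
 * RCU dereference plus re-attach dance with simply holding event->mmap_mutex
 * around the event->rb load. A small userspace sketch of that "pin the
 * pointer with the writer's lock" idea; struct rbuf, set_output() and
 * poll_flags() are invented names, not kernel API.
 */
#include <pthread.h>
#include <stdatomic.h>

struct rbuf {
	atomic_uint poll;		/* pending poll flags, like rb->poll */
};

static pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct rbuf *shared_rb;		/* set by set_output(), read by poll_flags() */

/* Writer side: the buffer is only ever swapped while holding mmap_mutex. */
static void set_output(struct rbuf *new_rb)
{
	pthread_mutex_lock(&mmap_mutex);
	shared_rb = new_rb;
	pthread_mutex_unlock(&mmap_mutex);
}

/*
 * Reader side: because the same mutex is held, the buffer cannot be swizzled
 * between the NULL check and consuming its flags, so no wakeup is lost.
 */
static unsigned int poll_flags(void)
{
	unsigned int events = 0;
	struct rbuf *rb;

	pthread_mutex_lock(&mmap_mutex);
	rb = shared_rb;
	if (rb)
		events = atomic_exchange(&rb->poll, 0);
	pthread_mutex_unlock(&mmap_mutex);

	return events;
}

int main(void)
{
	static struct rbuf rb;

	atomic_init(&rb.poll, 0);
	set_output(&rb);
	atomic_fetch_or(&rb.poll, 0x1);	/* pretend data arrived */
	return poll_flags() == 0x1 ? 0 : 1;
}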
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
 		return;
 
 	spin_lock_irqsave(&rb->event_lock, flags);
-	if (!list_empty(&event->rb_entry))
-		goto unlock;
-
-	list_add(&event->rb_entry, &rb->event_list);
-unlock:
+	if (list_empty(&event->rb_entry))
+		list_add(&event->rb_entry, &rb->event_list);
 	spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-			       struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
 	unsigned long flags;
 
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
 	rcu_read_lock();
 	rb = rcu_dereference(event->rb);
-	if (!rb)
-		goto unlock;
-
-	list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-		wake_up_all(&event->waitq);
-
-unlock:
+	if (rb) {
+		list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+			wake_up_all(&event->waitq);
+	}
 	rcu_read_unlock();
 }
 
@@ -3582,52 +3571,115 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 	return rb;
 }
 
-static bool ring_buffer_put(struct ring_buffer *rb)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	struct perf_event *event, *n;
-	unsigned long flags;
-
 	if (!atomic_dec_and_test(&rb->refcount))
-		return false;
+		return;
 
-	spin_lock_irqsave(&rb->event_lock, flags);
-	list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-		list_del_init(&event->rb_entry);
-		wake_up_all(&event->waitq);
-	}
-	spin_unlock_irqrestore(&rb->event_lock, flags);
+	WARN_ON_ONCE(!list_empty(&rb->event_list));
 
 	call_rcu(&rb->rcu_head, rb_free_rcu);
-	return true;
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
 	atomic_inc(&event->mmap_count);
+	atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_event *event = vma->vm_file->private_data;
 
-	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		struct ring_buffer *rb = event->rb;
-		struct user_struct *mmap_user = rb->mmap_user;
-		int mmap_locked = rb->mmap_locked;
-		unsigned long size = perf_data_size(rb);
+	struct ring_buffer *rb = event->rb;
+	struct user_struct *mmap_user = rb->mmap_user;
+	int mmap_locked = rb->mmap_locked;
+	unsigned long size = perf_data_size(rb);
 
-		rcu_assign_pointer(event->rb, NULL);
-		ring_buffer_detach(event, rb);
-		mutex_unlock(&event->mmap_mutex);
+	atomic_dec(&rb->mmap_count);
+
+	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+		return;
+
+	/* Detach current event from the buffer. */
+	rcu_assign_pointer(event->rb, NULL);
+	ring_buffer_detach(event, rb);
+	mutex_unlock(&event->mmap_mutex);
+
+	/* If there's still other mmap()s of this buffer, we're done. */
+	if (atomic_read(&rb->mmap_count)) {
+		ring_buffer_put(rb); /* can't be last */
+		return;
+	}
 
-		if (ring_buffer_put(rb)) {
-			atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
-			vma->vm_mm->pinned_vm -= mmap_locked;
-			free_uid(mmap_user);
+	/*
+	 * No other mmap()s, detach from all other events that might redirect
+	 * into the now unreachable buffer. Somewhat complicated by the
+	 * fact that rb::event_lock otherwise nests inside mmap_mutex.
+	 */
+again:
+	rcu_read_lock();
+	list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+		if (!atomic_long_inc_not_zero(&event->refcount)) {
+			/*
+			 * This event is en-route to free_event() which will
+			 * detach it and remove it from the list.
+			 */
+			continue;
 		}
+		rcu_read_unlock();
+
+		mutex_lock(&event->mmap_mutex);
+		/*
+		 * Check we didn't race with perf_event_set_output() which can
+		 * swizzle the rb from under us while we were waiting to
+		 * acquire mmap_mutex.
+		 *
+		 * If we find a different rb; ignore this event, a next
+		 * iteration will no longer find it on the list. We have to
+		 * still restart the iteration to make sure we're not now
+		 * iterating the wrong list.
+		 */
+		if (event->rb == rb) {
+			rcu_assign_pointer(event->rb, NULL);
+			ring_buffer_detach(event, rb);
+			ring_buffer_put(rb); /* can't be last, we still have one */
+		}
+		mutex_unlock(&event->mmap_mutex);
+		put_event(event);
+
+		/*
+		 * Restart the iteration; either we're on the wrong list or
+		 * destroyed its integrity by doing a deletion.
+		 */
+		goto again;
 	}
+	rcu_read_unlock();
+
+	/*
+	 * It could be there's still a few 0-ref events on the list; they'll
+	 * get cleaned up by free_event() -- they'll also still have their
+	 * ref on the rb and will free it whenever they are done with it.
+	 *
+	 * Aside from that, this buffer is 'fully' detached and unmapped,
+	 * undo the VM accounting.
+	 */
+
+	atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+	vma->vm_mm->pinned_vm -= mmap_locked;
+	free_uid(mmap_user);
+
+	ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
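/*
 * Annotation, not part of the patch: the perf_mmap_close() hunk above walks
 * rb->event_list and uses atomic_long_inc_not_zero() to skip events that are
 * already headed into free_event(); the restart-from-"again:" pattern is what
 * keeps the list walk valid after each deletion. Below is a userspace sketch
 * of the "take a reference only if one still exists" primitive, built from a
 * C11 CAS loop; ref_inc_not_zero() is an invented name, not the kernel helper.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment *ref and return true, unless *ref has already dropped to zero. */
static bool ref_inc_not_zero(atomic_long *ref)
{
	long old = atomic_load(ref);

	do {
		if (old == 0)
			return false;	/* object is being torn down; skip it */
	} while (!atomic_compare_exchange_weak(ref, &old, old + 1));

	return true;
}

int main(void)
{
	atomic_long live, dying;

	atomic_init(&live, 1);
	atomic_init(&dying, 0);

	printf("live:  %d\n", ref_inc_not_zero(&live));		/* 1: ref taken */
	printf("dying: %d\n", ref_inc_not_zero(&dying));	/* 0: skipped */
	return 0;
}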
@@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
 	mutex_lock(&event->mmap_mutex);
 	if (event->rb) {
-		if (event->rb->nr_pages != nr_pages)
+		if (event->rb->nr_pages != nr_pages) {
 			ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+			/*
+			 * Raced against perf_mmap_close() through
+			 * perf_event_set_output(). Try again, hope for better
+			 * luck.
+			 */
+			mutex_unlock(&event->mmap_mutex);
+			goto again;
+		}
+
 		goto unlock;
 	}
 
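/*
 * Annotation, not part of the patch: the perf_mmap() hunk above re-takes
 * mmap_mutex from the top ("goto again") whenever atomic_inc_not_zero() on
 * rb->mmap_count fails, i.e. when it raced with the final perf_mmap_close().
 * A userspace sketch of that retry shape; struct buffer, current_rb and
 * grab_existing_buffer() are invented for illustration, and a real caller
 * would also clear or replace current_rb so the retry terminates.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct buffer {
	atomic_long mmap_count;
};

static pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct buffer *current_rb;	/* may be torn down by the last unmap */

static bool ref_inc_not_zero(atomic_long *ref)
{
	long old = atomic_load(ref);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak(ref, &old, old + 1));
	return true;
}

/* Attach to the existing buffer, or report that a fresh one is needed. */
static struct buffer *grab_existing_buffer(void)
{
again:
	pthread_mutex_lock(&mmap_mutex);
	if (current_rb) {
		if (!ref_inc_not_zero(&current_rb->mmap_count)) {
			/*
			 * Lost the race against the last unmap: the buffer is
			 * on its way out. Drop the lock and start over.
			 */
			pthread_mutex_unlock(&mmap_mutex);
			goto again;
		}
		pthread_mutex_unlock(&mmap_mutex);
		return current_rb;	/* mmap_count reference held */
	}
	pthread_mutex_unlock(&mmap_mutex);
	return NULL;			/* caller must allocate a new buffer */
}

int main(void)
{
	struct buffer b;

	atomic_init(&b.mmap_count, 1);
	current_rb = &b;
	return grab_existing_buffer() == &b ? 0 : 1;
}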
@@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
+	atomic_set(&rb->mmap_count, 1);
 	rb->mmap_locked = extra;
 	rb->mmap_user = get_current_user();
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->pinned_vm += extra;
 
+	ring_buffer_attach(event, rb);
 	rcu_assign_pointer(event->rb, rb);
 
 	perf_event_update_userpage(event);
@@ -3737,6 +3805,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	atomic_inc(&event->mmap_count);
 	mutex_unlock(&event->mmap_mutex);
 
+	/*
+	 * Since pinned accounting is per vm we cannot allow fork() to copy our
+	 * vma.
+	 */
 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_ops = &perf_mmap_vmops;
 
@@ -6415,23 +6487,37 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (atomic_read(&event->mmap_count))
 		goto unlock;
 
+	old_rb = event->rb;
+
 	if (output_event) {
 		/* get the rb we want to redirect to */
 		rb = ring_buffer_get(output_event);
 		if (!rb)
 			goto unlock;
 	}
 
-	old_rb = event->rb;
-	rcu_assign_pointer(event->rb, rb);
 	if (old_rb)
 		ring_buffer_detach(event, old_rb);
+
+	if (rb)
+		ring_buffer_attach(event, rb);
+
+	rcu_assign_pointer(event->rb, rb);
+
+	if (old_rb) {
+		ring_buffer_put(old_rb);
+		/*
+		 * Since we detached before setting the new rb, so that we
+		 * could attach the new rb, we could have missed a wakeup.
+		 * Provide it now.
+		 */
+		wake_up_all(&event->waitq);
+	}
+
 	ret = 0;
 unlock:
 	mutex_unlock(&event->mmap_mutex);
 
-	if (old_rb)
-		ring_buffer_put(old_rb);
 out:
 	return ret;
 }
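/*
 * Annotation, not part of the patch: the perf_event_set_output() hunk above
 * is all about ordering -- detach from the old buffer, attach to the new one,
 * publish the new pointer, and only then drop the old reference and re-issue
 * a wakeup in case one was missed while no buffer was attached. A userspace
 * sketch of that ordering with invented types (struct out_buf, struct
 * out_event); the condition variable stands in for event->waitq, and the
 * list attach/detach steps are only hinted at in comments.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct out_buf {
	atomic_int refcount;
};

struct out_event {
	pthread_mutex_t mmap_mutex;
	pthread_cond_t waitq;
	struct out_buf *rb;
};

static void out_buf_put(struct out_buf *b)
{
	if (b && atomic_fetch_sub(&b->refcount, 1) == 1)
		free(b);
}

/* Redirect 'e' to 'new_rb'; the caller passes in a reference for us to keep. */
static void event_set_output(struct out_event *e, struct out_buf *new_rb)
{
	struct out_buf *old_rb;

	pthread_mutex_lock(&e->mmap_mutex);

	old_rb = e->rb;
	/*
	 * Kernel ordering: ring_buffer_detach(old), ring_buffer_attach(new),
	 * then rcu_assign_pointer(event->rb, new). Here that collapses to a
	 * plain pointer store under the mutex.
	 */
	e->rb = new_rb;

	if (old_rb) {
		out_buf_put(old_rb);
		/*
		 * A waiter may have sampled the old buffer just before we
		 * unhooked it; broadcast so it re-checks, mirroring the
		 * wake_up_all() in the patch.
		 */
		pthread_cond_broadcast(&e->waitq);
	}

	pthread_mutex_unlock(&e->mmap_mutex);
}

int main(void)
{
	struct out_event e;
	struct out_buf *b = malloc(sizeof(*b));

	pthread_mutex_init(&e.mmap_mutex, NULL);
	pthread_cond_init(&e.waitq, NULL);
	e.rb = NULL;
	atomic_init(&b->refcount, 1);

	event_set_output(&e, b);	/* e now holds the only reference to b */
	out_buf_put(e.rb);		/* drop it; frees the buffer */
	return 0;
}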