@@ -24,6 +24,8 @@ int gc_first_tid;
 // Mutex/cond used to synchronize sleep/wakeup of GC threads
 uv_mutex_t gc_threads_lock;
 uv_cond_t gc_threads_cond;
+// Mutex used to coordinate entry of GC threads in the mark loop
+uv_mutex_t gc_queue_observer_lock;
 
 // Linked list of callback functions
 
@@ -2857,8 +2859,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
     jl_gc_markqueue_t *mq = &ptls->mark_queue;
     jl_gc_markqueue_t *mq_master = NULL;
     int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    if (master_tid == -1) {
+        return;
+    }
+    mq_master = &gc_all_tls_states[master_tid]->mark_queue;
     void *new_obj;
     jl_gc_chunk_t c;
 pop : {
@@ -2933,28 +2937,108 @@ void gc_mark_and_steal(jl_ptls_t ptls)
     }
 }
 
+size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // assume each chunk is worth 256 units of work and each pointer
+    // is worth 1 unit of work
+    size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
+                         jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top));
+    work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
+             jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top));
+    return work;
+}
+
+/**
+ * Correctness argument for the mark-loop termination protocol.
+ *
+ * Safety properties:
+ * - No work items shall be in any thread's queues when `gc_mark_loop_barrier`
+ * observes that `gc_n_threads_marking` is zero.
+ *
+ * - No work item shall be stolen from the master thread (i.e. mutator thread
+ * which started GC and which helped the `jl_n_gcthreads` - 1 threads to mark)
+ * after `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero.
+ * This property is necessary because we call `gc_mark_loop_serial` after
+ * marking the finalizer list in `_jl_gc_collect`, and want to ensure that we
+ * have the serial mark-loop semantics there, and that no work is stolen from
+ * us at that point.
+ *
+ * Proof:
+ * - Suppose the master thread observes that `gc_n_threads_marking` is zero in
+ * `gc_mark_loop_barrier` and there is a work item left in one thread's queue
+ * at that point. Since threads try to steal from all threads' queues, this
+ * implies that all threads must have tried to steal from the queue which
+ * still has a work item left, but failed to do so, which violates the
+ * semantics of Chase-Lev's work-stealing queue.
+ *
+ * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be
+ * the event "master thread observes that `gc_n_threads_marking` is zero".
+ * Since we're using sequentially consistent atomics, E1 => E2. Now suppose
+ * one thread which is spinning in `gc_should_mark` tries to enter the
+ * mark-loop after E2. In order to do so, it must increment
+ * `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid`
+ * in an event E4. Since we're using sequentially consistent atomics,
+ * E3 => E4. Since we observed `gc_n_threads_marking` as zero in E2, then
+ * E2 => E3, and we conclude E1 => E4, so that the thread which is spinning
+ * in `gc_should_mark` must observe that `gc_master_tid` is -1 and therefore
+ * won't enter the mark-loop.
+ */
+
+int gc_should_mark(jl_ptls_t ptls)
+{
+    int should_mark = 0;
+    int n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+    // fast path
+    if (n_threads_marking == 0) {
+        return 0;
+    }
+    uv_mutex_lock(&gc_queue_observer_lock);
+    while (1) {
+        int tid = jl_atomic_load(&gc_master_tid);
+        // fast path
+        if (tid == -1) {
+            break;
+        }
+        n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+        // fast path
+        if (n_threads_marking == 0) {
+            break;
+        }
+        size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]);
+        for (tid = gc_first_tid; tid < gc_first_tid + jl_n_gcthreads; tid++) {
+            work += gc_count_work_in_queue(gc_all_tls_states[tid]);
+        }
+        // if there is a lot of work left, enter the mark loop
+        if (work >= 16 * n_threads_marking) {
+            jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+            should_mark = 1;
+            break;
+        }
+        jl_cpu_pause();
+    }
+    uv_mutex_unlock(&gc_queue_observer_lock);
+    return should_mark;
+}
+
+void gc_wake_all_for_marking(jl_ptls_t ptls)
+{
+    jl_atomic_store(&gc_master_tid, ptls->tid);
+    uv_mutex_lock(&gc_threads_lock);
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+}
+
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
 {
-    int backoff = GC_BACKOFF_MIN;
     if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
+        gc_wake_all_for_marking(ptls);
         gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
     }
-    while (jl_atomic_load(&gc_n_threads_marking) > 0) {
-        // Try to become a thief while other threads are marking
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
-            gc_mark_and_steal(ptls);
+    while (1) {
+        int should_mark = gc_should_mark(ptls);
+        if (!should_mark) {
+            break;
         }
+        gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
-        // Failed to steal
-        gc_backoff(&backoff);
     }
 }
 
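gc_count_work_in_queue above turns the Chase-Lev deque indices into a rough work estimate: bottom - top approximates the number of in-flight entries, with a chunk weighted 256x heavier than a single pointer, and gc_should_mark only admits another marker once the total reaches 16 units per thread already marking. A minimal standalone sketch of that arithmetic follows; queue_snapshot_t and count_work are hypothetical stand-ins for the real atomically-loaded queue indices:

    #include <stddef.h>
    #include <stdio.h>

    // Hypothetical snapshot of one thread's mark queue; the real code reads
    // the Chase-Lev `bottom`/`top` indices with relaxed atomic loads.
    typedef struct {
        size_t chunk_bottom, chunk_top; // chunk queue indices
        size_t ptr_bottom, ptr_top;     // pointer queue indices
    } queue_snapshot_t;

    // Same weighting as gc_count_work_in_queue: a chunk is assumed to be
    // worth 256 units of work, a pointer 1 unit.
    static size_t count_work(queue_snapshot_t q)
    {
        return 256 * (q.chunk_bottom - q.chunk_top) + (q.ptr_bottom - q.ptr_top);
    }

    int main(void)
    {
        queue_snapshot_t q = {.chunk_bottom = 3, .chunk_top = 1,
                              .ptr_bottom = 40, .ptr_top = 10};
        printf("%zu\n", count_work(q)); // 2 chunks * 256 + 30 pointers = 542
        // gc_should_mark would enter the mark loop only if the total across
        // all queues is at least 16 * gc_n_threads_marking.
        return 0;
    }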
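The correctness comment above argues termination from a sequentially consistent ordering of four events. Below is a standalone model of that handshake, assuming C11 <stdatomic.h> and <threads.h>; the names mirror gc_master_tid and gc_n_threads_marking, but the queues, the condvar wakeup, the observer lock, and the work heuristic are all elided, so this is a sketch of the protocol rather than the Julia implementation:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <threads.h>

    static atomic_int master_tid = -1;       // models gc_master_tid
    static atomic_int n_threads_marking = 0; // models gc_n_threads_marking

    static int worker(void *arg)
    {
        (void)arg;
        for (;;) {
            // E3: announce intent to mark *before* reading master_tid.
            atomic_fetch_add(&n_threads_marking, 1);
            // E4: if the master already retracted itself (E1), back out.
            if (atomic_load(&master_tid) == -1) {
                atomic_fetch_add(&n_threads_marking, -1);
                return 0; // marking is over; never enter again
            }
            // ... stealing and marking would happen here ...
            atomic_fetch_add(&n_threads_marking, -1);
        }
    }

    int main(void)
    {
        atomic_store(&master_tid, 0);          // publish the master
        atomic_fetch_add(&n_threads_marking, 1);
        thrd_t t;
        thrd_create(&t, worker, NULL);
        // ... master marks and steals ...
        atomic_fetch_add(&n_threads_marking, -1);
        atomic_store(&master_tid, -1);         // E1: retract the master
        while (atomic_load(&n_threads_marking) != 0)
            ;                                  // E2: barrier-style wait
        // After E2, any worker increment (E3) not visible at E2 must follow
        // it in the seq_cst order, so its read (E4) sees -1 and it backs
        // out: this is the E1 => E4 argument from the comment above.
        thrd_join(t, NULL);
        puts("terminated with no late entrants");
        return 0;
    }

The real gc_should_mark additionally serializes would-be entrants behind gc_queue_observer_lock and gates entry on the work estimate, but the safety argument rests only on the seq_cst ordering shown here.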
@@ -3728,6 +3812,7 @@ void jl_gc_init(void)
     uv_mutex_init(&gc_perm_lock);
     uv_mutex_init(&gc_threads_lock);
     uv_cond_init(&gc_threads_cond);
+    uv_mutex_init(&gc_queue_observer_lock);
 
     jl_gc_init_page();
     jl_gc_debug_init();