
Commit 00b3a0b

GC scheduler refinements (JuliaLang#52294)
1 parent 005e280 commit 00b3a0b

File tree

2 files changed (+102, -30 lines)


src/gc.c

Lines changed: 101 additions & 16 deletions
@@ -24,6 +24,8 @@ int gc_first_tid;
 // Mutex/cond used to synchronize sleep/wakeup of GC threads
 uv_mutex_t gc_threads_lock;
 uv_cond_t gc_threads_cond;
+// Mutex used to coordinate entry of GC threads in the mark loop
+uv_mutex_t gc_queue_observer_lock;
 
 // Linked list of callback functions

@@ -2857,8 +2859,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
     jl_gc_markqueue_t *mq = &ptls->mark_queue;
     jl_gc_markqueue_t *mq_master = NULL;
     int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    if (master_tid == -1) {
+        return;
+    }
+    mq_master = &gc_all_tls_states[master_tid]->mark_queue;
     void *new_obj;
     jl_gc_chunk_t c;
     pop : {
@@ -2933,28 +2937,108 @@ void gc_mark_and_steal(jl_ptls_t ptls)
     }
 }
 
+size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // assume each chunk is worth 256 units of work and each pointer
+    // is worth 1 unit of work
+    size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
+                         jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top));
+    work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
+             jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top));
+    return work;
+}
+
+/**
+ * Correctness argument for the mark-loop termination protocol.
+ *
+ * Safety properties:
+ * - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes
+ *   that `gc_n_threads_marking` is zero.
+ *
+ * - No work item shall be stolen from the master thread (i.e. mutator thread which started
+ *   GC and which helped the `jl_n_gcthreads` - 1 threads to mark) after
+ *   `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is
+ *   necessary because we call `gc_mark_loop_serial` after marking the finalizer list in
+ *   `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there,
+ *   and that no work is stolen from us at that point.
+ *
+ * Proof:
+ * - Suppose the master thread observes that `gc_n_threads_marking` is zero in
+ *   `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point.
+ *   Since threads try to steal from all threads' queues, this implies that all threads must
+ *   have tried to steal from the queue which still has a work item left, but failed to do so,
+ *   which violates the semantics of Chase-Lev's work-stealing queue.
+ *
+ * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the event
+ *   "master thread observes that `gc_n_threads_marking` is zero". Since we're using
+ *   sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in
+ *   `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must
+ *   increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an
+ *   event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed
+ *   `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that
+ *   the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1
+ *   and therefore won't enter the mark-loop.
+ */
+
+int gc_should_mark(jl_ptls_t ptls)
+{
+    int should_mark = 0;
+    int n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+    // fast path
+    if (n_threads_marking == 0) {
+        return 0;
+    }
+    uv_mutex_lock(&gc_queue_observer_lock);
+    while (1) {
+        int tid = jl_atomic_load(&gc_master_tid);
+        // fast path
+        if (tid == -1) {
+            break;
+        }
+        n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+        // fast path
+        if (n_threads_marking == 0) {
+            break;
+        }
+        size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]);
+        for (tid = gc_first_tid; tid < gc_first_tid + jl_n_gcthreads; tid++) {
+            work += gc_count_work_in_queue(gc_all_tls_states[tid]);
+        }
+        // if there is a lot of work left, enter the mark loop
+        if (work >= 16 * n_threads_marking) {
+            jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+            should_mark = 1;
+            break;
+        }
+        jl_cpu_pause();
+    }
+    uv_mutex_unlock(&gc_queue_observer_lock);
+    return should_mark;
+}
+
+void gc_wake_all_for_marking(jl_ptls_t ptls)
+{
+    jl_atomic_store(&gc_master_tid, ptls->tid);
+    uv_mutex_lock(&gc_threads_lock);
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+}
+
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
 {
-    int backoff = GC_BACKOFF_MIN;
     if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
+        gc_wake_all_for_marking(ptls);
         gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
     }
-    while (jl_atomic_load(&gc_n_threads_marking) > 0) {
-        // Try to become a thief while other threads are marking
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
-            gc_mark_and_steal(ptls);
+    while (1) {
+        int should_mark = gc_should_mark(ptls);
+        if (!should_mark) {
+            break;
         }
+        gc_mark_and_steal(ptls);
         jl_atomic_fetch_add(&gc_n_threads_marking, -1);
-        // Failed to steal
-        gc_backoff(&backoff);
     }
 }
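The entry heuristic in this hunk boils down to a simple arithmetic test: sum the estimated work over all mark queues and join only when it amounts to at least 16 units per thread already marking. Below is a minimal standalone sketch of that calculation (a toy model, not part of the commit; queue_model, count_work and should_join are hypothetical names).

/* Toy model of the mark-loop entry heuristic (not the commit's code; the names
 * queue_model, count_work and should_join are hypothetical). */
#include <stddef.h>
#include <stdio.h>

typedef struct {
    size_t chunks;   /* entries in the chunk queue, ~256 work units each */
    size_t pointers; /* entries in the pointer queue, 1 work unit each   */
} queue_model;

/* Mirrors gc_count_work_in_queue: 256 units per queued chunk plus 1 per pointer. */
static size_t count_work(const queue_model *q)
{
    return 256 * q->chunks + q->pointers;
}

/* Mirrors the decision in gc_should_mark: join the mark loop only when the
 * global backlog is at least 16 units per thread that is already marking. */
static int should_join(const queue_model *queues, int nqueues, int n_threads_marking)
{
    size_t work = 0;
    for (int i = 0; i < nqueues; i++)
        work += count_work(&queues[i]);
    return work >= (size_t)(16 * n_threads_marking);
}

int main(void)
{
    /* 2 chunks + 40 pointers on the master queue, an empty GC-thread queue,
     * 3 threads marking: work = 2*256 + 40 = 552 >= 16*3 = 48, so join. */
    queue_model queues[2] = { {2, 40}, {0, 0} };
    printf("should join the mark loop? %d\n", should_join(queues, 2, 3));
    return 0;
}

A spinning thread therefore only re-enters the mark loop when the backlog is large relative to the number of threads already marking; otherwise it calls jl_cpu_pause() and re-checks, which is what lets the commit drop the exponential backoff helper from gc.h.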

@@ -3728,6 +3812,7 @@ void jl_gc_init(void)
     uv_mutex_init(&gc_perm_lock);
     uv_mutex_init(&gc_threads_lock);
     uv_cond_init(&gc_threads_cond);
+    uv_mutex_init(&gc_queue_observer_lock);
 
     jl_gc_init_page();
     jl_gc_debug_init();
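Closing out the gc.c changes: the correctness comment added above argues purely in terms of four sequentially consistent events (E1: the master writes -1 to gc_master_tid, E2: the master observes gc_n_threads_marking == 0, E3: a spinning thread increments gc_n_threads_marking, E4: that thread reads gc_master_tid). The following standalone C11 sketch illustrates only that argument, not the commit's code; the two function names are invented.

/* Sketch of the E1..E4 handshake from the correctness comment (illustration
 * only; master_observes_termination and marker_tries_to_enter are made up). */
#include <stdatomic.h>
#include <stdbool.h>

/* Stand-ins for gc_master_tid and gc_n_threads_marking; seq_cst by default. */
static _Atomic int master_tid = -1;
static _Atomic int n_threads_marking = 0;

/* Master side: E1 (clear the tid) then E2 (check for quiescence). */
static bool master_observes_termination(void)
{
    atomic_store(&master_tid, -1);               /* E1 */
    return atomic_load(&n_threads_marking) == 0; /* E2 */
}

/* Would-be marker: E3 (announce itself) then E4 (re-check the master tid).
 * If E2 happened before E3, the seq_cst total order forces E4 to see -1, so
 * the thread backs out and never steals after the master observed termination. */
static bool marker_tries_to_enter(void)
{
    atomic_fetch_add(&n_threads_marking, 1);     /* E3 */
    if (atomic_load(&master_tid) == -1) {        /* E4 */
        atomic_fetch_sub(&n_threads_marking, 1);
        return false;
    }
    return true;
}

int main(void)
{
    /* Single-threaded demo of the ordering: after E1/E2, entry is refused. */
    bool quiescent = master_observes_termination();
    bool entered = marker_tries_to_enter();
    return (quiescent && !entered) ? 0 : 1;
}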

src/gc.h

Lines changed: 1 addition & 14 deletions
@@ -189,25 +189,12 @@ extern jl_gc_page_stack_t global_page_pool_lazily_freed;
 extern jl_gc_page_stack_t global_page_pool_clean;
 extern jl_gc_page_stack_t global_page_pool_freed;
 
-#define GC_BACKOFF_MIN 4
-#define GC_BACKOFF_MAX 12
-
-STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
-{
-    if (*i < GC_BACKOFF_MAX) {
-        (*i)++;
-    }
-    for (int j = 0; j < (1 << *i); j++) {
-        jl_cpu_pause();
-    }
-}
-
 // Lock-free stack implementation taken
 // from Herlihy's "The Art of Multiprocessor Programming"
 // XXX: this is not a general-purpose lock-free stack. We can
 // get away with just using a CAS and not implementing some ABA
 // prevention mechanism since once a node is popped from the
-// `jl_gc_global_page_pool_t`, it may only be pushed back to them
+// `jl_gc_page_stack_t`, it may only be pushed back to them
 // in the sweeping phase, which also doesn't push a node into the
 // same stack after it's popped
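The comment corrected above (it now points at jl_gc_page_stack_t instead of the old jl_gc_global_page_pool_t name) describes a Treiber-style stack whose push and pop are single CAS loops. Here is a generic C11 illustration of that pattern, not the actual jl_gc_page_stack_t code; node_t, lf_stack_t and the lf_* names are invented.

/* Generic CAS-only (Treiber-style) stack sketch; illustration only. */
#include <stdatomic.h>
#include <stddef.h>

typedef struct node {
    struct node *next;
} node_t;

typedef struct {
    _Atomic(node_t *) top;
} lf_stack_t;

static void lf_push(lf_stack_t *s, node_t *n)
{
    node_t *old_top = atomic_load_explicit(&s->top, memory_order_relaxed);
    do {
        n->next = old_top;
        /* retry until `top` is swung from old_top to n */
    } while (!atomic_compare_exchange_weak_explicit(&s->top, &old_top, n,
                 memory_order_release, memory_order_relaxed));
}

static node_t *lf_pop(lf_stack_t *s)
{
    node_t *old_top = atomic_load_explicit(&s->top, memory_order_acquire);
    while (old_top != NULL &&
           !atomic_compare_exchange_weak_explicit(&s->top, &old_top, old_top->next,
                memory_order_acquire, memory_order_acquire)) {
        /* old_top is refreshed by the failed CAS; retry */
    }
    return old_top;
}

int main(void)
{
    lf_stack_t stack = { NULL };
    node_t a = {0}, b = {0};
    lf_push(&stack, &a);
    lf_push(&stack, &b);
    return (lf_pop(&stack) == &b && lf_pop(&stack) == &a) ? 0 : 1;
}

Dereferencing old_top->next in lf_pop is exactly where an ABA or reclamation bug would normally bite; as the comment explains, the GC can get away with a bare CAS only because a popped page node is pushed back solely during sweeping, which never re-pushes a node onto the same stack after popping it.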
