diff --git a/base/locks-mt.jl b/base/locks-mt.jl index 5d355b9ed200c..5c5db758787ef 100644 --- a/base/locks-mt.jl +++ b/base/locks-mt.jl @@ -39,8 +39,10 @@ end Base.assert_havelock(l::SpinLock) = islocked(l) ? nothing : Base.concurrency_violation() function lock(l::SpinLock) + # t0 = time_ns() while true if @inline trylock(l) + # ccall(:jl_record_lock_spin_time, Cvoid, (Culonglong,), time_ns() - t0) return end ccall(:jl_cpu_suspend, Cvoid, ()) diff --git a/base/timing.jl b/base/timing.jl index bdbb32936b56f..fba02e1a6ef0d 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -114,6 +114,32 @@ function jit_total_bytes() return ccall(:jl_jit_total_bytes, Csize_t, ()) end +thread_up_time() = ccall(:jl_thread_up_time, UInt64, ()) + +_unsafe_thread_user_time(tid::Integer) = ccall(:jl_thread_user_time, UInt64, (Cint,), Cint(tid)) + +function thread_user_time(tid::Integer) + max_tid = Threads.threadpoolsize(:interactive) + Threads.threadpoolsize(:default) + 0 < tid ≤ max_tid || throw(ArgumentError("invalid thread id `$(tid)`")) + return _unsafe_thread_user_time(tid) +end + +function thread_user_time(pool::Symbol=:all) + total = UInt64(0) + pool in (:all, :default, :interactive) || throw(ArgumentError("invalid threadpool `$(repr(pool))`")) + if pool === :all || pool === :default + for tid in Threads.threadpooltids(:default) + total += _unsafe_thread_user_time(tid) + end + end + if pool === :all || pool === :interactive + for tid in Threads.threadpooltids(:interactive) + total += _unsafe_thread_user_time(tid) + end + end + return total +end + # print elapsed time, return expression value const _mem_units = ["byte", "KiB", "MiB", "GiB", "TiB", "PiB"] const _cnt_units = ["", " k", " M", " G", " T", " P"] diff --git a/src/gc.c b/src/gc.c index 4cb48ba72dfe1..c0dccc8758bf7 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3741,7 +3741,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state); 
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING); // `jl_safepoint_start_gc()` makes sure only one thread can run the GC. - uint64_t t0 = jl_hrtime(); + uint64_t t0 = jl_hrtime(); // time we entered GC if (!jl_safepoint_start_gc()) { // either another thread is running GC, or the GC got disabled just now. jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); @@ -3770,7 +3770,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); JL_PROBE_GC_STOP_THE_WORLD(); - uint64_t t1 = jl_hrtime(); + uint64_t t1 = jl_hrtime(); // time user code stopped running uint64_t duration = t1 - t0; if (duration > gc_num.max_time_to_safepoint) gc_num.max_time_to_safepoint = duration; @@ -3798,6 +3798,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_safepoint_end_gc(); jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); JL_PROBE_GC_END(); + // Time how long GC took. + ptls->timing_tls.gc_time += jl_hrtime() - t1; // Only disable finalizers on current thread // Doing this on all threads is racy (it's impossible to check diff --git a/src/julia_threads.h b/src/julia_threads.h index 3d06d380d10c2..63f0e457b71c7 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -124,6 +124,14 @@ typedef struct { struct _jl_bt_element_t; +typedef struct { + uint64_t start_time; + uint64_t sleep_time; + uint64_t scheduler_time; + /* uint64_t lock_spin_time; */ + uint64_t gc_time; +} jl_timing_tls_states_t; + // This includes all the thread local states we care about for a thread. // Changes to TLS field types must be reflected in codegen. 
#define JL_MAX_BT_SIZE 80000 @@ -155,6 +163,7 @@ typedef struct _jl_tls_states_t { // Counter to disable finalizer **on the current thread** int finalizers_inhibited; jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen + jl_timing_tls_states_t timing_tls; volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; diff --git a/src/partr.c b/src/partr.c index 33631dc83c05a..b4b0b520c3f86 100644 --- a/src/partr.c +++ b/src/partr.c @@ -389,12 +389,14 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, { jl_task_t *ct = jl_current_task; uint64_t start_cycles = 0; - + uint64_t t0 = jl_hrtime(); while (1) { + jl_ptls_t ptls = ct->ptls; jl_task_t *task = get_next_task(trypoptask, q); - if (task) + if (task) { + ptls->timing_tls.scheduler_time += jl_hrtime() - t0; return task; - + } // quick, race-y check to see if there seems to be any stuff in there jl_cpu_pause(); if (!check_empty(checkempty)) { @@ -403,7 +405,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_cpu_pause(); - jl_ptls_t ptls = ct->ptls; if (sleep_check_after_threshold(&start_cycles) || (ptls->tid == 0 && (!jl_atomic_load_relaxed(&_threadedregion) || wait_empty))) { // acquire sleep-check lock jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping); @@ -425,6 +426,8 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls); } - if (task) - return task; + if (task) { + ptls->timing_tls.scheduler_time += jl_hrtime() - t0; + return task; + } continue; } @@ -433,6 +435,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls); } + ptls->timing_tls.scheduler_time += jl_hrtime() - t0; return task; } @@ -507,6 +510,7 @@
JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, // the other threads will just wait for an individual wake signal to resume JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() ); + uint64_t tsleep0 = jl_hrtime(); int8_t gc_state = jl_gc_safe_enter(ptls); uv_mutex_lock(&ptls->sleep_lock); while (may_sleep(ptls)) { @@ -523,6 +527,7 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping); uv_mutex_unlock(&ptls->sleep_lock); JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() ); + ptls->timing_tls.sleep_time += jl_hrtime() - tsleep0; jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint start_cycles = 0; if (task) { diff --git a/src/safepoint.c b/src/safepoint.c index c6f9a42059d1a..17fdd8c5263c1 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -161,7 +161,10 @@ void jl_safepoint_wait_gc(void) jl_task_t *ct = jl_current_task; (void)ct; JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct); // The thread should have set this is already - assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0); + jl_ptls_t ptls = ct->ptls; + assert(jl_atomic_load_relaxed(&ptls->gc_state) != 0); + // Time how long this thread is stopped while GC is running. + uint64_t t0 = jl_hrtime(); // Use normal volatile load in the loop for speed until GC finishes. // Then use an acquire load to make sure the GC result is visible on this thread. 
while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) { @@ -173,6 +176,7 @@ void jl_safepoint_wait_gc(void) uv_cond_wait(&safepoint_cond, &safepoint_lock); uv_mutex_unlock(&safepoint_lock); } + ptls->timing_tls.gc_time += jl_hrtime() - t0; } void jl_safepoint_enable_sigint(void) diff --git a/src/threading.c b/src/threading.c index 8f350d41f64b1..ee8c753b23a2b 100644 --- a/src/threading.c +++ b/src/threading.c @@ -49,6 +49,24 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0; JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0; JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0; +static uint64_t jl_thread_start_time; void jl_set_thread_start_time(void) { + jl_thread_start_time = jl_hrtime(); +} + +// TODO: not just current thread +JL_DLLEXPORT uint64_t jl_thread_up_time(void) +{ + return jl_hrtime() - jl_thread_start_time; +} +JL_DLLEXPORT uint64_t jl_thread_user_time(int tid) +{ + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid - 1]; + jl_timing_tls_states_t *timing = &ptls->timing_tls; + return jl_thread_up_time() - timing->gc_time - timing->scheduler_time; +} + JL_DLLEXPORT void *jl_get_ptls_states(void) { // mostly deprecated: use current_task instead @@ -765,6 +783,7 @@ void jl_start_threads(void) } uv_thread_detach(&uvtid); } + jl_set_thread_start_time(); } _Atomic(unsigned) _threadedregion; // HACK: keep track of whether to prioritize IO or threading @@ -794,6 +813,13 @@ JL_DLLEXPORT void jl_exit_threaded_region(void) // Profiling stubs +/* JL_DLLEXPORT void jl_record_lock_spin_time(uint64_t time) JL_NOTSAFEPOINT */ /* { */ /* jl_task_t *ct = jl_current_task; */ /* jl_ptls_t ptls = ct->ptls; */ /* ptls->timing_tls.lock_spin_time += time; */ /* } */ + void _jl_mutex_init(jl_mutex_t *lock, const char *name) JL_NOTSAFEPOINT { jl_atomic_store_relaxed(&lock->owner, (jl_task_t*)NULL); @@ -816,10 +842,12 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int
safepoint) return; } JL_TIMING(LOCK_SPIN, LOCK_SPIN); + /* uint64_t t0 = jl_hrtime(); */ while (1) { if (owner == NULL && jl_atomic_cmpswap(&lock->owner, &owner, self)) { lock->count = 1; jl_profile_lock_acquired(lock); + /* jl_record_lock_spin_time(jl_hrtime() - t0); */ return; } if (safepoint) { @@ -919,7 +947,6 @@ void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock) } } - // Make gc alignment available for threading // see threads.jl alignment JL_DLLEXPORT int jl_alignment(size_t sz) diff --git a/src/threading.h b/src/threading.h index cb26537699713..b6dfe95e08933 100644 --- a/src/threading.h +++ b/src/threading.h @@ -30,6 +30,7 @@ void jl_init_threadinginfra(void); void jl_parallel_gc_threadfun(void *arg); void jl_concurrent_gc_threadfun(void *arg); void jl_threadfun(void *arg); +void jl_set_thread_start_time(void); #ifdef __cplusplus }