diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 279686c387e1b..a3ffdf1d051a9 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -295,12 +295,12 @@ jl_code_instance_t *jl_ci_cache_lookup(const jl_cgparams_t &cgparams, jl_method_ jl_value_t *ci = cgparams.lookup(mi, world, world); JL_GC_PROMISE_ROOTED(ci); jl_code_instance_t *codeinst = NULL; - JL_GC_PUSH1(&codeinst); if (ci != jl_nothing && jl_atomic_load_relaxed(&((jl_code_instance_t *)ci)->inferred) != jl_nothing) { codeinst = (jl_code_instance_t*)ci; } else { if (cgparams.lookup != jl_rettype_inferred_addr) { + // XXX: This will corrupt and leak a lot of memory which may be very bad jl_error("Refusing to automatically run type inference with custom cache lookup."); } else { @@ -309,15 +309,129 @@ jl_code_instance_t *jl_ci_cache_lookup(const jl_cgparams_t &cgparams, jl_method_ * it into the cache here, since it was explicitly requested and is * otherwise not reachable from anywhere in the system image. */ - if (!jl_mi_cache_has_ci(mi, codeinst)) + if (codeinst && !jl_mi_cache_has_ci(mi, codeinst)) { + JL_GC_PUSH1(&codeinst); jl_mi_cache_insert(mi, codeinst); + JL_GC_POP(); + } } } - JL_GC_POP(); return codeinst; } -arraylist_t new_invokes; +typedef DenseMap<jl_code_instance_t*, std::pair<orc::ThreadSafeModule, jl_llvm_functions_t>> jl_compiled_functions_t; +static void compile_workqueue(jl_codegen_params_t &params, CompilationPolicy policy, jl_compiled_functions_t &compiled_functions) +{ + decltype(params.workqueue) workqueue; + std::swap(params.workqueue, workqueue); + jl_code_info_t *src = NULL; + jl_code_instance_t *codeinst = NULL; + JL_GC_PUSH2(&src, &codeinst); + assert(!params.cache); + while (!workqueue.empty()) { + auto it = workqueue.pop_back_val(); + codeinst = it.first; + auto &proto = it.second; + // try to emit code for this item from the workqueue + StringRef invokeName = ""; + StringRef preal_decl = ""; + bool preal_specsig = false; + { + auto it = compiled_functions.find(codeinst); + if (it == compiled_functions.end()) { + // Reinfer the function. The JIT came along and removed the inferred + // method body.
See #34993 + if ((policy != CompilationPolicy::Default || params.params->trim) && + jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { + // XXX: SOURCE_MODE_FORCE_SOURCE is wrong here (neither sufficient nor necessary) + codeinst = jl_type_infer(codeinst->def, jl_atomic_load_relaxed(&codeinst->max_world), SOURCE_MODE_FORCE_SOURCE); + } + if (codeinst) { + orc::ThreadSafeModule result_m = + jl_create_ts_module(name_from_method_instance(codeinst->def), + params.tsctx, params.DL, params.TargetTriple); + auto decls = jl_emit_codeinst(result_m, codeinst, NULL, params); + if (result_m) + it = compiled_functions.insert(std::make_pair(codeinst, std::make_pair(std::move(result_m), std::move(decls)))).first; + } + } + if (it != compiled_functions.end()) { + auto &decls = it->second.second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + } + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + } + } + } + // patch up the prototype we emitted earlier + Module *mod = proto.decl->getParent(); + assert(proto.decl->isDeclaration()); + Function *pinvoke = nullptr; + if (preal_decl.empty()) { + if (invokeName.empty() && params.params->trim) { + errs() << "Bailed out to invoke when compiling:"; + jl_(codeinst->def); + abort(); + } + pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); + if (!proto.specsig) + proto.decl->replaceAllUsesWith(pinvoke); + } + if (proto.specsig && !preal_specsig) { + // get or build an fptr1 that can invoke codeinst + if (pinvoke == nullptr) + pinvoke = get_or_emit_fptr1(preal_decl, mod); + // emit specsig-to-(jl)invoke conversion + proto.decl->setLinkage(GlobalVariable::InternalLinkage); + //protodecl->setAlwaysInline(); + jl_init_function(proto.decl, params.TargetTriple); + jl_method_instance_t *mi = codeinst->def; + size_t nrealargs = jl_nparams(mi->specTypes); // number of actual arguments being passed + bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; + // TODO: maybe this can be cached in codeinst->specfptr? 
+ emit_specsig_to_fptr1(proto.decl, proto.cc, proto.return_roots, mi->specTypes, codeinst->rettype, is_opaque_closure, nrealargs, params, pinvoke, 0, 0); + preal_decl = ""; // no need to fixup the name + } + if (!preal_decl.empty()) { + // merge and/or rename this prototype to the real function + if (Value *specfun = mod->getNamedValue(preal_decl)) { + if (proto.decl != specfun) + proto.decl->replaceAllUsesWith(specfun); + } + else { + proto.decl->setName(preal_decl); + } + } + if (proto.oc) { // additionally, if we are dealing with an oc, then we might also need to fix up the fptr1 reference too + assert(proto.specsig); + StringRef ocinvokeDecl = invokeName; + // if OC expected a specialized specsig dispatch, but we don't have it, use the inner trampoline here too + // XXX: this invoke translation logic is supposed to exactly match new_opaque_closure + if (!preal_specsig || ocinvokeDecl == "jl_f_opaque_closure_call" || ocinvokeDecl == "jl_fptr_interpret_call" || ocinvokeDecl == "jl_fptr_const_return") + ocinvokeDecl = pinvoke->getName(); + assert(!ocinvokeDecl.empty()); + assert(ocinvokeDecl != "jl_fptr_args"); + assert(ocinvokeDecl != "jl_fptr_sparam"); + // merge and/or rename this prototype to the real function + if (Value *specfun = mod->getNamedValue(ocinvokeDecl)) { + if (proto.oc != specfun) + proto.oc->replaceAllUsesWith(specfun); + } + else { + proto.oc->setName(ocinvokeDecl); + } + } + workqueue.append(params.workqueue); + params.workqueue.clear(); + } + JL_GC_POP(); +} + + // takes the running content that has collected in the shadow module and dumps it to disk // this builds the object file portion of the sysimage files for fast startup, and can // also be used by external consumers like GPUCompiler.jl to obtain a module containing @@ -346,7 +460,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm orc::ThreadSafeContext ctx; orc::ThreadSafeModule backing; if (!llvmmod) { - ctx = jl_ExecutionEngine->acquireContext(); + ctx = jl_ExecutionEngine->makeContext(); backing = jl_create_ts_module("text", ctx); } orc::ThreadSafeModule &clone = llvmmod ?
*unwrap(llvmmod) : backing; @@ -367,11 +481,11 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm params.imaging_mode = imaging; params.debug_level = cgparams->debug_info_level; params.external_linkage = _external_linkage; - arraylist_new(&new_invokes, 0); size_t compile_for[] = { jl_typeinf_world, _world }; int worlds = 0; if (jl_options.trim != JL_TRIM_NO) worlds = 1; + jl_compiled_functions_t compiled_functions; for (; worlds < 2; worlds++) { JL_TIMING(NATIVE_AOT, NATIVE_Codegen); size_t this_world = compile_for[worlds]; @@ -391,7 +505,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm continue; } mi = (jl_method_instance_t*)item; -compile_mi: src = NULL; // if this method is generally visible to the current compilation world, // and this is either the primary world, or not applicable in the primary world @@ -406,7 +519,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm jl_(mi); abort(); } - if (codeinst && !params.compiled_functions.count(codeinst) && !data->jl_fvar_map.count(codeinst)) { + if (codeinst && !compiled_functions.count(codeinst) && !data->jl_fvar_map.count(codeinst)) { // now add it to our compilation results // Const returns do not do codegen, but juliac inspects codegen results so make a dummy fvar entry to represent it if (jl_options.trim != JL_TRIM_NO && jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr) { @@ -418,7 +531,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm Triple(clone.getModuleUnlocked()->getTargetTriple())); jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, NULL, params); if (result_m) - params.compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; + compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; else if (jl_options.trim != JL_TRIM_NO) { // if we're building a small image, we need to compile everything // to ensure that we have all the information we need. @@ -428,26 +541,19 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm } } } - } else if (this_world != jl_typeinf_world) { + } + else if (this_world != jl_typeinf_world) { /* jl_safe_printf("Codegen could not find requested codeinstance to be compiled\n"); jl_(mi); abort(); */ } - // TODO: is goto the best way to do this? 
- jl_compile_workqueue(params, policy); - mi = (jl_method_instance_t*)arraylist_pop(&new_invokes); - if (mi != NULL) { - goto compile_mi; - } } - - // finally, make sure all referenced methods also get compiled or fixed up - jl_compile_workqueue(params, policy); } JL_GC_POP(); - arraylist_free(&new_invokes); + // finally, make sure all referenced methods also get compiled or fixed up + compile_workqueue(params, policy, compiled_functions); // process the globals array, before jl_merge_module destroys them SmallVector<std::string, 0> gvars(params.global_targets.size()); @@ -464,7 +570,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm data->jl_value_to_llvm[idx] = global.first; idx++; } - CreateNativeMethods += params.compiled_functions.size(); + CreateNativeMethods += compiled_functions.size(); size_t offset = gvars.size(); data->jl_external_to_llvm.resize(params.external_fns.size()); @@ -489,7 +595,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm { JL_TIMING(NATIVE_AOT, NATIVE_Merge); Linker L(*clone.getModuleUnlocked()); - for (auto &def : params.compiled_functions) { + for (auto &def : compiled_functions) { jl_merge_module(clone, std::move(std::get<0>(def.second))); jl_code_instance_t *this_code = def.first; jl_llvm_functions_t decls = std::get<1>(def.second); @@ -573,9 +679,6 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm } ct->reentrant_timing &= ~1ull; } - if (ctx.getContext()) { - jl_ExecutionEngine->releaseContext(std::move(ctx)); - } return (void*)data; } @@ -1975,11 +2078,6 @@ void jl_dump_native_impl(void *native_code, } } -void addTargetPasses(legacy::PassManagerBase *PM, const Triple &triple, TargetIRAnalysis analysis) -{ - PM->add(new TargetLibraryInfoWrapperPass(triple)); - PM->add(createTargetTransformInfoWrapperPass(std::move(analysis))); -} // sometimes in GDB you want to find out what code would be created from a mi extern "C" JL_DLLEXPORT_CODEGEN jl_code_info_t *jl_gdbdumpcode(jl_method_instance_t *mi) @@ -2037,8 +2135,8 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t* dump, jl_method_instance_t *mi, jl_ dump->F = nullptr; dump->TSM = nullptr; if (src && jl_is_code_info(src)) { - auto ctx = jl_ExecutionEngine->getContext(); - orc::ThreadSafeModule m = jl_create_ts_module(name_from_method_instance(mi), *ctx); + auto ctx = jl_ExecutionEngine->makeContext(); + orc::ThreadSafeModule m = jl_create_ts_module(name_from_method_instance(mi), ctx); uint64_t compiler_start_time = 0; uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); if (measure_compile_time_enabled) @@ -2046,7 +2144,7 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t* dump, jl_method_instance_t *mi, jl_ auto target_info = m.withModuleDo([&](Module &M) { return std::make_pair(M.getDataLayout(), Triple(M.getTargetTriple())); }); - jl_codegen_params_t output(*ctx, std::move(target_info.first), std::move(target_info.second)); + jl_codegen_params_t output(ctx, std::move(target_info.first), std::move(target_info.second)); output.params = &params; output.imaging_mode = imaging_default(); // This would be nice, but currently it causes some assembly regressions that make printed output diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index 8557698a4e513..c257d2a2e3331 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -32,14 +32,14 @@ namespace { -static size_t get_block_size(size_t size) +static size_t get_block_size(size_t size) JL_NOTSAFEPOINT { return (size > jl_page_size * 256 ?
LLT_ALIGN(size, jl_page_size) : jl_page_size * 256); } // Wrapper function to mmap/munmap/mprotect pages... -static void *map_anon_page(size_t size) +static void *map_anon_page(size_t size) JL_NOTSAFEPOINT { #ifdef _OS_WINDOWS_ char *mem = (char*)VirtualAlloc(NULL, size + jl_page_size, @@ -54,7 +54,7 @@ static void *map_anon_page(size_t size) return mem; } -static void unmap_page(void *ptr, size_t size) +static void unmap_page(void *ptr, size_t size) JL_NOTSAFEPOINT { #ifdef _OS_WINDOWS_ VirtualFree(ptr, size, MEM_DECOMMIT); @@ -71,7 +71,7 @@ enum class Prot : int { NO = PAGE_NOACCESS }; -static void protect_page(void *ptr, size_t size, Prot flags) +static void protect_page(void *ptr, size_t size, Prot flags) JL_NOTSAFEPOINT { DWORD old_prot; if (!VirtualProtect(ptr, size, (DWORD)flags, &old_prot)) { @@ -89,7 +89,7 @@ enum class Prot : int { NO = PROT_NONE }; -static void protect_page(void *ptr, size_t size, Prot flags) +static void protect_page(void *ptr, size_t size, Prot flags) JL_NOTSAFEPOINT { int ret = mprotect(ptr, size, (int)flags); if (ret != 0) { @@ -98,7 +98,7 @@ static void protect_page(void *ptr, size_t size, Prot flags) } } -static bool check_fd_or_close(int fd) +static bool check_fd_or_close(int fd) JL_NOTSAFEPOINT { if (fd == -1) return false; @@ -129,7 +129,7 @@ static intptr_t anon_hdl = -1; // Also, creating big file mapping and then map pieces of it seems to // consume too much global resources. Therefore, we use each file mapping // as a block on windows -static void *create_shared_map(size_t size, size_t id) +static void *create_shared_map(size_t size, size_t id) JL_NOTSAFEPOINT { void *addr = MapViewOfFile((HANDLE)id, FILE_MAP_ALL_ACCESS, 0, 0, size); @@ -137,13 +137,13 @@ static void *create_shared_map(size_t size, size_t id) return addr; } -static intptr_t init_shared_map() +static intptr_t init_shared_map() JL_NOTSAFEPOINT { anon_hdl = 0; return 0; } -static void *alloc_shared_page(size_t size, size_t *id, bool exec) +static void *alloc_shared_page(size_t size, size_t *id, bool exec) JL_NOTSAFEPOINT { assert(size % jl_page_size == 0); DWORD file_mode = exec ? 
PAGE_EXECUTE_READWRITE : PAGE_READWRITE; @@ -162,7 +162,7 @@ static void *alloc_shared_page(size_t size, size_t *id, bool exec) } #else // _OS_WINDOWS_ // For shared mapped region -static intptr_t get_anon_hdl(void) +static intptr_t get_anon_hdl(void) JL_NOTSAFEPOINT { int fd = -1; @@ -228,7 +228,7 @@ static struct _make_shared_map_lock { }; } shared_map_lock; -static size_t get_map_size_inc() +static size_t get_map_size_inc() JL_NOTSAFEPOINT { rlimit rl; if (getrlimit(RLIMIT_FSIZE, &rl) != -1) { @@ -242,7 +242,7 @@ static size_t get_map_size_inc() return map_size_inc_default; } -static void *create_shared_map(size_t size, size_t id) +static void *create_shared_map(size_t size, size_t id) JL_NOTSAFEPOINT { void *addr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, anon_hdl, id); @@ -250,7 +250,7 @@ static void *create_shared_map(size_t size, size_t id) return addr; } -static intptr_t init_shared_map() +static intptr_t init_shared_map() JL_NOTSAFEPOINT { anon_hdl = get_anon_hdl(); if (anon_hdl == -1) @@ -265,7 +265,7 @@ static intptr_t init_shared_map() return anon_hdl; } -static void *alloc_shared_page(size_t size, size_t *id, bool exec) +static void *alloc_shared_page(size_t size, size_t *id, bool exec) JL_NOTSAFEPOINT { assert(size % jl_page_size == 0); size_t off = jl_atomic_fetch_add(&map_offset, size); @@ -292,7 +292,7 @@ static void *alloc_shared_page(size_t size, size_t *id, bool exec) #ifdef _OS_LINUX_ // Using `/proc/self/mem`, A.K.A. Keno's remote memory manager. -ssize_t pwrite_addr(int fd, const void *buf, size_t nbyte, uintptr_t addr) +ssize_t pwrite_addr(int fd, const void *buf, size_t nbyte, uintptr_t addr) JL_NOTSAFEPOINT { static_assert(sizeof(off_t) >= 8, "off_t is smaller than 64bits"); #ifdef _P64 @@ -319,7 +319,7 @@ ssize_t pwrite_addr(int fd, const void *buf, size_t nbyte, uintptr_t addr) // Do not call this directly. // Use `get_self_mem_fd` which has a guard to call this only once. 
-static int _init_self_mem() +static int _init_self_mem() JL_NOTSAFEPOINT { struct utsname kernel; uname(&kernel); @@ -359,13 +359,13 @@ static int _init_self_mem() return fd; } -static int get_self_mem_fd() +static int get_self_mem_fd() JL_NOTSAFEPOINT { static int fd = _init_self_mem(); return fd; } -static void write_self_mem(void *dest, void *ptr, size_t size) +static void write_self_mem(void *dest, void *ptr, size_t size) JL_NOTSAFEPOINT { while (size > 0) { ssize_t ret = pwrite_addr(get_self_mem_fd(), ptr, size, (uintptr_t)dest); @@ -424,7 +424,7 @@ struct Block { Block(const Block&) = delete; Block &operator=(const Block&) = delete; - Block(Block &&other) + Block(Block &&other) JL_NOTSAFEPOINT : ptr(other.ptr), total(other.total), avail(other.avail) @@ -433,9 +433,9 @@ struct Block { other.total = other.avail = 0; } - Block() = default; + Block() JL_NOTSAFEPOINT = default; - void *alloc(size_t size, size_t align) + void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t aligned_avail = avail & (-align); if (aligned_avail < size) @@ -444,7 +444,7 @@ struct Block { avail = aligned_avail - size; return p; } - void reset(void *addr, size_t size) + void reset(void *addr, size_t size) JL_NOTSAFEPOINT { if (avail >= jl_page_size) { uintptr_t end = uintptr_t(ptr) + total; @@ -462,7 +462,8 @@ class RWAllocator { static constexpr int nblocks = 8; Block blocks[nblocks]{}; public: - void *alloc(size_t size, size_t align) + RWAllocator() JL_NOTSAFEPOINT = default; + void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; @@ -498,9 +499,9 @@ struct SplitPtrBlock : public Block { uintptr_t wr_ptr{0}; uint32_t state{0}; - SplitPtrBlock() = default; + SplitPtrBlock() JL_NOTSAFEPOINT = default; - void swap(SplitPtrBlock &other) + void swap(SplitPtrBlock &other) JL_NOTSAFEPOINT { std::swap(ptr, other.ptr); std::swap(total, other.total); @@ -509,7 +510,7 @@ struct SplitPtrBlock : public Block { std::swap(state, other.state); } - SplitPtrBlock(SplitPtrBlock &&other) + SplitPtrBlock(SplitPtrBlock &&other) JL_NOTSAFEPOINT : SplitPtrBlock() { swap(other); @@ -534,11 +535,12 @@ class ROAllocator { // but might not have all the permissions set or data copied yet. SmallVector completed; virtual void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, - size_t size, size_t align) = 0; - virtual SplitPtrBlock alloc_block(size_t size) = 0; + size_t size, size_t align) JL_NOTSAFEPOINT = 0; + virtual SplitPtrBlock alloc_block(size_t size) JL_NOTSAFEPOINT = 0; public: - virtual ~ROAllocator() {} - virtual void finalize() + ROAllocator() JL_NOTSAFEPOINT = default; + virtual ~ROAllocator() JL_NOTSAFEPOINT {} + virtual void finalize() JL_NOTSAFEPOINT { for (auto &alloc: allocations) { // ensure the mapped pages are consistent @@ -552,7 +554,7 @@ class ROAllocator { } // Allocations that have not been finalized yet. 
SmallVector allocations; - void *alloc(size_t size, size_t align) + void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; @@ -603,7 +605,7 @@ class ROAllocator { template<bool exec> class DualMapAllocator : public ROAllocator<exec> { protected: - void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override + void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override JL_NOTSAFEPOINT { assert((char*)rt_ptr >= block.ptr && (char*)rt_ptr < (block.ptr + block.total)); @@ -618,7 +620,7 @@ class DualMapAllocator : public ROAllocator<exec> { } return (char*)rt_ptr + (block.wr_ptr - uintptr_t(block.ptr)); } - SplitPtrBlock alloc_block(size_t size) override + SplitPtrBlock alloc_block(size_t size) override JL_NOTSAFEPOINT { SplitPtrBlock new_block; // use `wr_ptr` to record the id initially @@ -626,7 +628,7 @@ class DualMapAllocator : public ROAllocator<exec> { new_block.reset(ptr, size); return new_block; } - void finalize_block(SplitPtrBlock &block, bool reset) + void finalize_block(SplitPtrBlock &block, bool reset) JL_NOTSAFEPOINT { // This function handles setting the block to the right mode // and free'ing maps that are not needed anymore. @@ -662,11 +664,11 @@ class DualMapAllocator : public ROAllocator<exec> { } } public: - DualMapAllocator() + DualMapAllocator() JL_NOTSAFEPOINT { assert(anon_hdl != -1); } - void finalize() override + void finalize() override JL_NOTSAFEPOINT { for (auto &block : this->blocks) { finalize_block(block, false); @@ -685,7 +687,7 @@ class SelfMemAllocator : public ROAllocator<false> { SmallVector<Block, 16> temp_buff; protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, - size_t size, size_t align) override + size_t size, size_t align) override JL_NOTSAFEPOINT { assert(!(block.state & SplitPtrBlock::InitAlloc)); for (auto &wr_block: temp_buff) { @@ -699,13 +701,13 @@ class SelfMemAllocator : public ROAllocator<false> { new_block.reset(map_anon_page(block_size), block_size); return new_block.alloc(size, align); } - SplitPtrBlock alloc_block(size_t size) override + SplitPtrBlock alloc_block(size_t size) override JL_NOTSAFEPOINT { SplitPtrBlock new_block; new_block.reset(map_anon_page(size), size); return new_block; } - void finalize_block(SplitPtrBlock &block, bool reset) + void finalize_block(SplitPtrBlock &block, bool reset) JL_NOTSAFEPOINT { if (!(block.state & SplitPtrBlock::Alloc)) return; @@ -718,13 +720,13 @@ class SelfMemAllocator : public ROAllocator<false> { } } public: - SelfMemAllocator() + SelfMemAllocator() JL_NOTSAFEPOINT : ROAllocator<false>(), temp_buff() { assert(get_self_mem_fd() != -1); } - void finalize() override + void finalize() override JL_NOTSAFEPOINT { for (auto &block : this->blocks) { finalize_block(block, false); @@ -770,17 +772,15 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { RWAllocator rw_alloc; std::unique_ptr<ROAllocator<false>> ro_alloc; std::unique_ptr<ROAllocator<true>> exe_alloc; - bool code_allocated; size_t total_allocated; public: - RTDyldMemoryManagerJL() + RTDyldMemoryManagerJL() JL_NOTSAFEPOINT : SectionMemoryManager(), pending_eh(), rw_alloc(), ro_alloc(), exe_alloc(), - code_allocated(false), total_allocated(0) { #ifdef _OS_LINUX_ @@ -794,12 +794,12 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { exe_alloc.reset(new DualMapAllocator<true>()); } } - ~RTDyldMemoryManagerJL() override + ~RTDyldMemoryManagerJL() override JL_NOTSAFEPOINT { } - size_t getTotalBytes() { return total_allocated; } + size_t getTotalBytes() JL_NOTSAFEPOINT { return total_allocated; } void registerEHFrames(uint8_t *Addr, uint64_t
LoadAddr, - size_t Size) override; + size_t Size) override JL_NOTSAFEPOINT; #if 0 // Disable for now since we are not actually using this. void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, @@ -807,16 +807,16 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { #endif uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, - StringRef SectionName) override; + StringRef SectionName) override JL_NOTSAFEPOINT; uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName, - bool isReadOnly) override; + bool isReadOnly) override JL_NOTSAFEPOINT; using SectionMemoryManager::notifyObjectLoaded; void notifyObjectLoaded(RuntimeDyld &Dyld, - const object::ObjectFile &Obj) override; - bool finalizeMemory(std::string *ErrMsg = nullptr) override; + const object::ObjectFile &Obj) override JL_NOTSAFEPOINT; + bool finalizeMemory(std::string *ErrMsg = nullptr) override JL_NOTSAFEPOINT; template <typename DL, typename Alloc> - void mapAddresses(DL &Dyld, Alloc &&allocator) + void mapAddresses(DL &Dyld, Alloc &&allocator) JL_NOTSAFEPOINT { for (auto &alloc: allocator->allocations) { if (alloc.rt_addr == alloc.wr_addr || alloc.relocated) @@ -826,7 +826,7 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { } } template <typename DL> - void mapAddresses(DL &Dyld) + void mapAddresses(DL &Dyld) JL_NOTSAFEPOINT { if (!ro_alloc) return; @@ -838,14 +838,9 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, - StringRef SectionName) + StringRef SectionName) JL_NOTSAFEPOINT { // allocating more than one code section can confuse libunwind. -#if !defined(_COMPILER_MSAN_ENABLED_) && !defined(_COMPILER_ASAN_ENABLED_) - // TODO: Figure out why msan and now asan too need this.
- assert(!code_allocated); - code_allocated = true; -#endif total_allocated += Size; jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size); @@ -859,7 +854,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName, - bool isReadOnly) + bool isReadOnly) JL_NOTSAFEPOINT { total_allocated += Size; jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); @@ -873,7 +868,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, } void RTDyldMemoryManagerJL::notifyObjectLoaded(RuntimeDyld &Dyld, - const object::ObjectFile &Obj) + const object::ObjectFile &Obj) JL_NOTSAFEPOINT { if (!ro_alloc) { assert(!exe_alloc); @@ -884,9 +879,8 @@ void RTDyldMemoryManagerJL::notifyObjectLoaded(RuntimeDyld &Dyld, mapAddresses(Dyld); } -bool RTDyldMemoryManagerJL::finalizeMemory(std::string *ErrMsg) +bool RTDyldMemoryManagerJL::finalizeMemory(std::string *ErrMsg) JL_NOTSAFEPOINT { - code_allocated = false; if (ro_alloc) { ro_alloc->finalize(); assert(exe_alloc); @@ -904,7 +898,7 @@ bool RTDyldMemoryManagerJL::finalizeMemory(std::string *ErrMsg) void RTDyldMemoryManagerJL::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) + size_t Size) JL_NOTSAFEPOINT { if (uintptr_t(Addr) == LoadAddr) { register_eh_frames(Addr, Size); @@ -917,7 +911,7 @@ void RTDyldMemoryManagerJL::registerEHFrames(uint8_t *Addr, #if 0 void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) + size_t Size) JL_NOTSAFEPOINT { deregister_eh_frames((uint8_t*)LoadAddr, Size); } @@ -925,12 +919,12 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, } -RTDyldMemoryManager* createRTDyldMemoryManager() +RTDyldMemoryManager* createRTDyldMemoryManager() JL_NOTSAFEPOINT { return new RTDyldMemoryManagerJL(); } -size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) +size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT { return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes(); } diff --git a/src/clangsa/GCChecker.cpp b/src/clangsa/GCChecker.cpp index 31631eb70a4ad..4892ebdabd110 100644 --- a/src/clangsa/GCChecker.cpp +++ b/src/clangsa/GCChecker.cpp @@ -31,7 +31,7 @@ namespace { using namespace clang; using namespace ento; -#define PDP std::shared_ptr<PathDiagnosticPiece> +typedef std::shared_ptr<PathDiagnosticPiece> PDP; #define MakePDP make_unique<PathDiagnosticEventPiece> static const Stmt *getStmtForDiagnostics(const ExplodedNode *N) @@ -394,13 +394,18 @@ PDP GCChecker::SafepointBugVisitor::VisitNode(const ExplodedNode *N, } else { PathDiagnosticLocation Pos = PathDiagnosticLocation::createDeclBegin( N->getLocationContext(), BRC.getSourceManager()); - return MakePDP(Pos, "Tracking JL_NOT_SAFEPOINT annotation here."); + if (Pos.isValid()) + return MakePDP(Pos, "Tracking JL_NOT_SAFEPOINT annotation here."); + //N->getLocation().dump(); } } else if (NewSafepointDisabled == (unsigned)-1) { PathDiagnosticLocation Pos = PathDiagnosticLocation::createDeclBegin( N->getLocationContext(), BRC.getSourceManager()); - return MakePDP(Pos, "Safepoints re-enabled here"); + if (Pos.isValid()) + return MakePDP(Pos, "Safepoints re-enabled here"); + //N->getLocation().dump(); } + // n.b.
there may be no position here to report if they were disabled by julia_notsafepoint_enter/leave } return nullptr; } diff --git a/src/codegen.cpp b/src/codegen.cpp index 3f69f4789493a..0ab26a65fcaaa 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -233,7 +233,6 @@ STATISTIC(EmittedSpecfunCalls, "Number of specialized calls emitted"); STATISTIC(EmittedInvokes, "Number of invokes emitted"); STATISTIC(EmittedCalls, "Number of calls emitted"); STATISTIC(EmittedUndefVarErrors, "Number of undef var errors emitted"); -STATISTIC(EmittedOpaqueClosureFunctions, "Number of opaque closures emitted"); STATISTIC(EmittedToJLInvokes, "Number of tojlinvoke calls emitted"); STATISTIC(EmittedCFuncInvalidates, "Number of C function invalidates emitted"); STATISTIC(GeneratedCFuncWrappers, "Number of C function wrappers generated"); @@ -1009,6 +1008,11 @@ static const auto jlinvoke_func = new JuliaFunction<>{ {AttributeSet(), Attributes(C, {Attribute::ReadOnly, Attribute::NoCapture})}); }, }; +static const auto jlopaque_closure_call_func = new JuliaFunction<>{ + XSTR(jl_f_opaque_closure_call), + get_func_sig, + get_func_attrs, +}; static const auto jlmethod_func = new JuliaFunction<>{ XSTR(jl_method_def), [](LLVMContext &C) { @@ -1606,7 +1610,7 @@ static const auto jltuple_func = new JuliaFunction<>{XSTR(jl_f_tuple), get_func_ static const auto jlintrinsic_func = new JuliaFunction<>{XSTR(jl_f_intrinsic_call), get_func3_sig, get_func_attrs}; static const auto &builtin_func_map() { - static std::map<jl_fptr_args_t, JuliaFunction<>*> builtins = { + static auto builtins = new DenseMap<jl_fptr_args_t, JuliaFunction<>*> { { jl_f_is_addr, new JuliaFunction<>{XSTR(jl_f_is), get_func_sig, get_func_attrs} }, { jl_f_typeof_addr, new JuliaFunction<>{XSTR(jl_f_typeof), get_func_sig, get_func_attrs} }, { jl_f_sizeof_addr, new JuliaFunction<>{XSTR(jl_f_sizeof), get_func_sig, get_func_attrs} }, @@ -1649,18 +1653,18 @@ static const auto &builtin_func_map() { { jl_f__svec_ref_addr, new JuliaFunction<>{XSTR(jl_f__svec_ref), get_func_sig, get_func_attrs} }, { jl_f_current_scope_addr, new JuliaFunction<>{XSTR(jl_f_current_scope), get_func_sig, get_func_attrs} }, }; - return builtins; + return *builtins; } static const auto &may_dispatch_builtins() { - static std::unordered_set<jl_fptr_args_t> builtins( + static auto builtins = new DenseSet<jl_fptr_args_t>( {jl_f__apply_iterate_addr, jl_f__apply_pure_addr, jl_f__call_in_world_addr, jl_f__call_in_world_total_addr, jl_f__call_latest_addr, }); - return builtins; + return *builtins; } static const auto jl_new_opaque_closure_jlcall_func = new JuliaFunction<>{XSTR(jl_new_opaque_closure_jlcall), get_func_sig, get_func_attrs}; @@ -2976,7 +2980,7 @@ static void jl_name_jlfuncparams_args(jl_codegen_params_t &params, Function *F) F->getArg(3)->setName("sparams::Any"); } -static void jl_init_function(Function *F, const Triple &TT) +void jl_init_function(Function *F, const Triple &TT) { // set any attributes that *must* be set on all functions AttrBuilder attr(F->getContext()); @@ -3023,6 +3027,7 @@ static bool uses_specsig(jl_value_t *sig, bool needsparams, jl_value_t *rettype, if (jl_vararg_kind(jl_tparam(sig, jl_nparams(sig) - 1)) == JL_VARARG_UNBOUND) return false; // not invalid, consider if specialized signature is worthwhile + // n.b.
sig is sometimes wrong for OC (tparam0 might be the captures type of the specialization, even though what gets passed in that slot is an OC object), so prefer_specsig is always set (instead of recomputing tparam0 using get_oc_type) if (prefer_specsig) return true; if (!deserves_retbox(rettype) && !jl_is_datatype_singleton((jl_datatype_t*)rettype) && rettype != (jl_value_t*)jl_bool_type) @@ -5236,7 +5241,15 @@ static CallInst *emit_jlcall(jl_codectx_t &ctx, FunctionCallee theFptr, Value *t if (theF) theArgs.push_back(theF); for (size_t i = 0; i < nargs; i++) { - Value *arg = boxed(ctx, argv[i]); + Value *arg; + if (i == 0 && trampoline == julia_call3) { + const jl_cgval_t &f = argv[i]; + arg = f.inline_roots.empty() && f.ispointer() ? data_pointer(ctx, f) : value_to_pointer(ctx, f).V; + arg = decay_derived(ctx, arg); + } + else { + arg = boxed(ctx, argv[i]); + } theArgs.push_back(arg); } CallInst *result = ctx.builder.CreateCall(TheTrampoline, theArgs); @@ -5283,13 +5296,13 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos idx++; } for (size_t i = 0; i < nargs; i++) { - jl_value_t *jt = jl_nth_slot_type(specTypes, i); // n.b.: specTypes is required to be a datatype by construction for specsig if (is_opaque_closure && i == 0) { // Special implementation for opaque closures: their jt and thus - // julia_type_to_llvm values are likely wrong, so override the - // behavior here to directly pass the expected pointer based instead - // just on passing arg as a pointer + // julia_type_to_llvm values are likely wrong (based on captures instead of the OC), so override the + // behavior here to pass the expected pointer directly instead of + // computing it from the available information + // jl_value_t *oc_type = (jl_value_t*)jl_any_type; // more accurately: get_oc_type(specTypes, jlretty) jl_cgval_t arg = argv[i]; if (arg.isghost) { argvals[idx] = Constant::getNullValue(ctx.builder.getPtrTy(AddressSpace::Derived)); @@ -5302,6 +5315,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos idx++; continue; } + jl_value_t *jt = jl_nth_slot_type(specTypes, i); jl_cgval_t arg = update_julia_type(ctx, argv[i], jt); if (arg.typ == jl_bottom_type) return jl_cgval_t(); @@ -5519,6 +5533,7 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR // Check if we already queued this up auto it = ctx.call_targets.find(codeinst); if (need_to_emit && it != ctx.call_targets.end()) { + assert(it->second.specsig == specsig); protoname = it->second.decl->getName(); need_to_emit = cache_valid = false; } @@ -5559,7 +5574,7 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR handled = true; if (need_to_emit) { Function *trampoline_decl = cast<Function>(jl_Module->getNamedValue(protoname)); - ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, specsig}; + ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, nullptr, specsig}; if (trim_may_error(ctx.params->trim)) push_frames(ctx, ctx.linfo, mi); } @@ -5570,9 +5585,9 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR if (!handled) { if (trim_may_error(ctx.params->trim)) { if (lival.constant) { - arraylist_push(&new_invokes, lival.constant); push_frames(ctx, ctx.linfo, (jl_method_instance_t*)lival.constant); - } else { + } + else { errs() << "Dynamic call to unknown function"; errs() << "In " << ctx.builder.getCurrentDebugLocation()->getFilename() << ":" <<
ctx.builder.getCurrentDebugLocation()->getLine() << "\n"; @@ -5728,20 +5743,16 @@ static jl_cgval_t emit_call(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt, bo } } FunctionCallee fptr; - Value *F; JuliaFunction<> *cc; if (f.typ == (jl_value_t*)jl_intrinsic_type) { fptr = prepare_call(jlintrinsic_func); - F = f.inline_roots.empty() && f.ispointer() ? data_pointer(ctx, f) : value_to_pointer(ctx, f).V; - F = decay_derived(ctx, F); cc = julia_call3; } else { fptr = FunctionCallee(get_func_sig(ctx.builder.getContext()), ctx.builder.CreateCall(prepare_call(jlgetbuiltinfptr_func), {emit_typeof(ctx, f)})); - F = boxed(ctx, f); cc = julia_call; } - Value *ret = emit_jlcall(ctx, fptr, F, ArrayRef(argv).drop_front(), nargs - 1, cc); + Value *ret = emit_jlcall(ctx, fptr, nullptr, argv, nargs, cc); setName(ctx.emission_context, ret, "Builtin_ret"); return mark_julia_type(ctx, ret, true, rt); } @@ -5758,52 +5769,12 @@ static jl_cgval_t emit_call(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt, bo JL_GC_POP(); return r; } + // TODO: else emit_oc_call } } int failed_dispatch = !argv[0].constant; if (ctx.params->trim != JL_TRIM_NO) { - size_t min_valid = 1; - size_t max_valid = ~(size_t)0; - size_t latest_world = jl_get_world_counter(); // TODO: marshal the world age of the compilation here. - - // Find all methods matching the call signature - jl_array_t *matches = NULL; - jl_value_t *tup = NULL; - JL_GC_PUSH2(&tup, &matches); - if (!failed_dispatch) { - SmallVector<jl_value_t*> argtypes; - for (auto& arg: argv) - argtypes.push_back(arg.typ); - tup = jl_apply_tuple_type_v(argtypes.data(), argtypes.size()); - matches = (jl_array_t*)jl_matching_methods((jl_tupletype_t*)tup, jl_nothing, 10 /*TODO: make global*/, 1, - latest_world, &min_valid, &max_valid, NULL); - if ((jl_value_t*)matches == jl_nothing) - failed_dispatch = 1; - } - - // Expand each matching method to its unique specialization, if it has exactly one - if (!failed_dispatch) { - size_t k; - size_t len = new_invokes.len; - for (k = 0; k < jl_array_nrows(matches); k++) { - jl_method_match_t *match = (jl_method_match_t *)jl_array_ptr_ref(matches, k); - jl_method_instance_t *mi = jl_method_match_to_mi(match, latest_world, min_valid, max_valid, 0); - if (!mi) { - if (jl_array_nrows(matches) == 1) { - // if the method match is not compileable, but there is only one, fall back to - // unspecialized implementation - mi = jl_get_unspecialized(match->method); - } - else { - new_invokes.len = len; - failed_dispatch = 1; - break; - } - } - arraylist_push(&new_invokes, mi); - } - } - JL_GC_POP(); + abort(); // this code path is unsound, unsafe, and probably bad } if (failed_dispatch && trim_may_error(ctx.params->trim)) { @@ -6634,66 +6605,73 @@ static std::pair<Function*, Function*> get_oc_function(jl_codectx_t &ctx, jl_met assert(jl_is_method_instance(mi)); ci = jl_atomic_load_relaxed(&mi->cache); } - - if (ci == NULL || (jl_value_t*)ci == jl_nothing) { - JL_GC_POP(); - return std::make_pair((Function*)NULL, (Function*)NULL); - } - auto inferred = jl_atomic_load_relaxed(&ci->inferred); - if (!inferred || inferred == jl_nothing) { + if (ci == NULL || (jl_value_t*)ci == jl_nothing || ci->rettype != rettype || !jl_egal(sigtype, mi->specTypes)) { // TODO: correctly handle the ABI conversion if rettype != ci->rettype JL_GC_POP(); return std::make_pair((Function*)NULL, (Function*)NULL); } - auto it = ctx.emission_context.compiled_functions.find(ci); - - if (it == ctx.emission_context.compiled_functions.end()) { - ++EmittedOpaqueClosureFunctions; - jl_code_info_t *ir =
jl_uncompress_ir(closure_method, ci, (jl_value_t*)inferred); - JL_GC_PUSH1(&ir); - // TODO: Emit this inline and outline it late using LLVM's coroutine support. - orc::ThreadSafeModule closure_m = jl_create_ts_module( - name_from_method_instance(mi), ctx.emission_context.tsctx, - jl_Module->getDataLayout(), Triple(jl_Module->getTargetTriple())); - jl_llvm_functions_t closure_decls = emit_function(closure_m, mi, ir, rettype, ctx.emission_context); - JL_GC_POP(); - it = ctx.emission_context.compiled_functions.insert(std::make_pair(ci, std::make_pair(std::move(closure_m), std::move(closure_decls)))).first; + // method lookup code (similar to emit_invoke, and the inverse of emit_specsig_oc_call) + bool specsig = uses_specsig(sigtype, false, rettype, true); + std::string name; + std::string oc; + StringRef protoname; + StringRef proto_oc; + + // Check if we already queued this up + auto it = ctx.call_targets.find(ci); + bool need_to_emit = it == ctx.call_targets.end(); + if (!need_to_emit) { + assert(specsig == it->second.specsig); + if (specsig) { + protoname = it->second.decl->getName(); + proto_oc = it->second.oc->getName(); + } + else { + proto_oc = it->second.decl->getName(); + } + need_to_emit = false; + } + else { + if (specsig) { + raw_string_ostream(name) << "j_" << name_from_method_instance(mi) << "_" << jl_atomic_fetch_add_relaxed(&globalUniqueGeneratedNames, 1); + protoname = StringRef(name); + } + raw_string_ostream(oc) << "j1_" << name_from_method_instance(mi) << "_" << jl_atomic_fetch_add_relaxed(&globalUniqueGeneratedNames, 1); + proto_oc = StringRef(oc); } - auto &closure_m = it->second.first; - auto &closure_decls = it->second.second; - - assert(closure_decls.functionObject != "jl_fptr_sparam"); - bool isspecsig = closure_decls.functionObject != "jl_fptr_args"; - - Function *F = NULL; - std::string fname = isspecsig ? 
- closure_decls.functionObject : - closure_decls.specFunctionObject; - if (GlobalValue *V = jl_Module->getNamedValue(fname)) { + // Get the fptr1 OC + Function *F = nullptr; + if (GlobalValue *V = jl_Module->getNamedValue(proto_oc)) { F = cast<Function>(V); } else { F = Function::Create(get_func_sig(ctx.builder.getContext()), Function::ExternalLinkage, - fname, jl_Module); + proto_oc, jl_Module); jl_init_function(F, ctx.emission_context.TargetTriple); jl_name_jlfunc_args(ctx.emission_context, F); F->setAttributes(AttributeList::get(ctx.builder.getContext(), {get_func_attrs(ctx.builder.getContext()), F->getAttributes()})); } - Function *specF = NULL; - if (!isspecsig) { - specF = F; + + // Get the specsig (if applicable) + Function *specF = nullptr; + jl_returninfo_t::CallingConv cc = jl_returninfo_t::CallingConv::Boxed; + unsigned return_roots = 0; + bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; + assert(is_opaque_closure); + if (specsig) { + bool gcstack_arg = JL_FEAT_TEST(ctx, gcstack_arg); + jl_returninfo_t returninfo = get_specsig_function(ctx, jl_Module, nullptr, protoname, mi->specTypes, rettype, is_opaque_closure, gcstack_arg); + cc = returninfo.cc; + return_roots = returninfo.return_roots; + specF = cast<Function>(returninfo.decl.getCallee()); } - else { - //emission context holds context lock so can get module - specF = closure_m.getModuleUnlocked()->getFunction(closure_decls.specFunctionObject); - if (specF) { - jl_returninfo_t returninfo = get_specsig_function(ctx, jl_Module, NULL, - closure_decls.specFunctionObject, sigtype, rettype, true, JL_FEAT_TEST(ctx,gcstack_arg)); - specF = cast<Function>(returninfo.decl.getCallee()); - } + + if (need_to_emit) { + ctx.call_targets[ci] = {cc, return_roots, specsig ? specF : F, specsig ? F : nullptr, specsig}; } + JL_GC_POP(); return std::make_pair(F, specF); } @@ -7173,7 +7151,12 @@ static Value *get_scope_field(jl_codectx_t &ctx) return emit_ptrgep(ctx, ct, offsetof(jl_task_t, scope), "current_scope"); } -static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptrName, Module *M, jl_codegen_params_t &params) +Function *get_or_emit_fptr1(StringRef preal_decl, Module *M) +{ + return cast<Function>(M->getOrInsertFunction(preal_decl, get_func_sig(M->getContext()), get_func_attrs(M->getContext())).getCallee()); +} + +Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptrName, Module *M, jl_codegen_params_t &params) JL_NOTSAFEPOINT { ++EmittedToJLInvokes; jl_codectx_t ctx(M->getContext(), params, codeinst); @@ -7184,7 +7167,6 @@ static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptr name, M); jl_init_function(f, params.TargetTriple); if (trim_may_error(params.params->trim)) { - arraylist_push(&new_invokes, codeinst->def); // Try t compile this invoke // TODO: Debuginfo!
push_frames(ctx, ctx.linfo, codeinst->def, 1); } @@ -7213,7 +7195,17 @@ static Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptr return f; } -static void emit_cfunc_invalidate( +static jl_value_t *get_oc_type(jl_value_t *calltype, jl_value_t *rettype) JL_ALWAYS_LEAFTYPE +{ + jl_value_t *argtype = jl_argtype_without_function((jl_value_t*)calltype); + JL_GC_PUSH1(&argtype); + jl_value_t *oc_type JL_ALWAYS_LEAFTYPE = jl_apply_type2((jl_value_t*)jl_opaque_closure_type, argtype, rettype); + JL_GC_PROMISE_ROOTED(oc_type); + JL_GC_POP(); + return oc_type; +} + +void emit_specsig_to_fptr1( Function *gf_thunk, jl_returninfo_t::CallingConv cc, unsigned return_roots, jl_value_t *calltype, jl_value_t *rettype, bool is_for_opaque_closure, size_t nargs, @@ -7240,14 +7232,18 @@ ++AI; // gcstack_arg } for (size_t i = 0; i < nargs; i++) { - // n.b. calltype is required to be a datatype by construction for specsig - jl_value_t *jt = jl_nth_slot_type(calltype, i); if (i == 0 && is_for_opaque_closure) { + // `jt` would be wrong here (it is the captures type), so is not used for + // the ABI decisions, but the argument actually will require boxing as its real type + // which can be exactly recomputed from the specialization, as that defined the ABI + jl_value_t *oc_type = get_oc_type(calltype, rettype); Value *arg_v = &*AI; ++AI; - myargs[i] = mark_julia_slot(arg_v, jt, NULL, ctx.tbaa().tbaa_const); + myargs[i] = mark_julia_slot(arg_v, (jl_value_t*)oc_type, NULL, ctx.tbaa().tbaa_const); continue; } + // n.b. calltype is required to be a datatype by construction for specsig + jl_value_t *jt = jl_nth_slot_type(calltype, i); bool isboxed = false; Type *et; if (deserves_argbox(jt)) { @@ -7335,16 +7331,6 @@ } } -static void emit_cfunc_invalidate( - Function *gf_thunk, jl_returninfo_t::CallingConv cc, unsigned return_roots, - jl_value_t *calltype, jl_value_t *rettype, bool is_for_opaque_closure, - size_t nargs, jl_codegen_params_t &params, - size_t min_world, size_t max_world) -{ - emit_cfunc_invalidate(gf_thunk, cc, return_roots, calltype, rettype, is_for_opaque_closure, nargs, params, - prepare_call_in(gf_thunk->getParent(), jlapplygeneric_func), min_world, max_world); -} - static Function* gen_cfun_wrapper( Module *into, jl_codegen_params_t &params, const function_sig_t &sig, jl_value_t *ff, const char *aliasname, @@ -7712,11 +7698,11 @@ static Function* gen_cfun_wrapper( GlobalVariable::InternalLinkage, funcName, M); jl_init_function(gf_thunk, ctx.emission_context.TargetTriple); gf_thunk->setAttributes(AttributeList::get(M->getContext(), {returninfo.attrs, gf_thunk->getAttributes()})); - // build a specsig -> jl_apply_generic converter thunk + // build a specsig -> jl_apply_generic converter thunk // this builds a method that calls jl_apply_generic (as a closure over a singleton function pointer), // but which has the signature of a specsig - emit_cfunc_invalidate(gf_thunk, returninfo.cc, returninfo.return_roots, lam->specTypes, codeinst->rettype, is_opaque_closure, nargs + 1, ctx.emission_context, - min_world, max_world); + emit_specsig_to_fptr1(gf_thunk, returninfo.cc, returninfo.return_roots, lam->specTypes, codeinst->rettype, is_opaque_closure, nargs + 1, ctx.emission_context, + prepare_call_in(gf_thunk->getParent(), jlapplygeneric_func), min_world, max_world); returninfo.decl = FunctionCallee(returninfo.decl.getFunctionType(), ctx.builder.CreateSelect(age_ok, returninfo.decl.getCallee(), gf_thunk)); } retval =
emit_call_specfun_other(ctx, is_opaque_closure, lam->specTypes, codeinst->rettype, returninfo, nullptr, inputargs, nargs + 1); @@ -8026,7 +8012,8 @@ const char *jl_generate_ccallable(LLVMOrcThreadSafeModuleRef llvmmod, void *sysi } // generate a julia-callable function that calls f (AKA lam) -static void gen_invoke_wrapper(jl_method_instance_t *lam, jl_value_t *jlretty, jl_returninfo_t &f, unsigned nargs, int retarg, StringRef funcName, +// if is_opaque_closure, then generate the OC invoke, rather than a real invoke +static void gen_invoke_wrapper(jl_method_instance_t *lam, jl_value_t *jlretty, jl_returninfo_t &f, unsigned nargs, int retarg, bool is_opaque_closure, StringRef funcName, Module *M, jl_codegen_params_t &params) { ++GeneratedInvokeWrappers; @@ -8055,11 +8042,14 @@ static void gen_invoke_wrapper(jl_method_instance_t *lam, jl_value_t *jlretty, j allocate_gc_frame(ctx, b0); SmallVector<jl_cgval_t> argv(nargs); - bool is_opaque_closure = jl_is_method(lam->def.value) && lam->def.method->is_for_opaque_closure; jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); for (size_t i = 0; i < nargs; ++i) { - jl_value_t *ty = ((i == 0) && is_opaque_closure) ? (jl_value_t*)jl_any_type : - jl_nth_slot_type(lam->specTypes, i); + if (i == 0 && is_opaque_closure) { + jl_value_t *oc_type = (jl_value_t*)jl_any_type; // more accurately: get_oc_type(lam->specTypes, jlretty) + argv[i] = mark_julia_slot(funcArg, oc_type, NULL, ctx.tbaa().tbaa_const); + continue; + } + jl_value_t *ty = jl_nth_slot_type(lam->specTypes, i); Value *theArg; if (i == 0) { theArg = funcArg; @@ -8455,6 +8445,7 @@ static jl_llvm_functions_t // specTypes is required to be a datatype by construction for specsig, but maybe not otherwise // OpaqueClosure implicitly loads the env if (i == 0 && ctx.is_opaque_closure) { + // n.b. this is not really needed, because ty was already supposed to be correct if (jl_is_array(src->slottypes)) { ty = jl_array_ptr_ref((jl_array_t*)src->slottypes, i); } @@ -8554,7 +8545,7 @@ static jl_llvm_functions_t raw_string_ostream(wrapName) << "jfptr_" << ctx.name << "_" << jl_atomic_fetch_add_relaxed(&globalUniqueGeneratedNames, 1); declarations.functionObject = wrapName; size_t nparams = jl_nparams(lam->specTypes); - gen_invoke_wrapper(lam, jlrettype, returninfo, nparams, retarg, declarations.functionObject, M, ctx.emission_context); + gen_invoke_wrapper(lam, jlrettype, returninfo, nparams, retarg, ctx.is_opaque_closure, declarations.functionObject, M, ctx.emission_context); // TODO: add attributes: maybe_mark_argument_dereferenceable(Arg, argType) // TODO: add attributes: dereferenceable // TODO: (if needsparams) add attributes: dereferenceable, readonly, nocapture @@ -8564,11 +8555,10 @@ static jl_llvm_functions_t GlobalVariable::ExternalLinkage, declarations.specFunctionObject, M); jl_init_function(f, ctx.emission_context.TargetTriple); - if (needsparams) { + if (needsparams) jl_name_jlfuncparams_args(ctx.emission_context, f); - } else { + else jl_name_jlfunc_args(ctx.emission_context, f); - } f->setAttributes(AttributeList::get(ctx.builder.getContext(), {get_func_attrs(ctx.builder.getContext()), f->getAttributes()})); returninfo.decl = f; declarations.functionObject = needsparams ? "jl_fptr_sparam" : "jl_fptr_args"; @@ -8940,76 +8930,73 @@ static jl_llvm_functions_t } for (i = 0; i < nreq && i < vinfoslen; i++) { jl_sym_t *s = slot_symbol(ctx, i); - jl_value_t *argType = jl_nth_slot_type(lam->specTypes, i); - // TODO: jl_nth_slot_type should call jl_rewrap_unionall?
- // specTypes is required to be a datatype by construction for specsig, but maybe not otherwise - bool isboxed = deserves_argbox(argType); - Type *llvmArgType = NULL; - if (i == 0 && ctx.is_opaque_closure) { - isboxed = false; - llvmArgType = ctx.builder.getPtrTy(AddressSpace::Derived); - argType = (jl_value_t*)jl_any_type; - } - else { - llvmArgType = isboxed ? ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, argType); - } jl_varinfo_t &vi = ctx.slots[i]; - if (s == jl_unused_sym || vi.value.constant) { - assert(vi.boxroot == NULL); - if (specsig && !type_is_ghost(llvmArgType) && !is_uniquerep_Type(argType)) { - ++AI; - auto tracked = CountTrackedPointers(llvmArgType); - if (tracked.count && !tracked.all) - ++AI; - } - continue; - } jl_cgval_t theArg; - // If this is an opaque closure, implicitly load the env and switch - // the world age. if (i == 0 && ctx.is_opaque_closure) { + // If this is an opaque closure, implicitly load the env and switch + // the world age. The specTypes value is wrong for this field, so + // this needs to be handled first. + // jl_value_t *oc_type = get_oc_type(calltype, rettype); + Value *oc_this = decay_derived(ctx, &*AI); + ++AI; // both specsig (derived) and fptr1 (box) pass this argument as a distinct argument // Load closure world - Value *oc_this = decay_derived(ctx, &*AI++); - Value *argaddr = oc_this; - Value *worldaddr = emit_ptrgep(ctx, argaddr, offsetof(jl_opaque_closure_t, world)); - + Value *worldaddr = emit_ptrgep(ctx, oc_this, offsetof(jl_opaque_closure_t, world)); jl_cgval_t closure_world = typed_load(ctx, worldaddr, NULL, (jl_value_t*)jl_long_type, nullptr, nullptr, false, AtomicOrdering::NotAtomic, false, ctx.types().alignof_ptr.value()); ctx.world_age_at_entry = closure_world.V; // The tls world in a OC is the world of the closure emit_unbox_store(ctx, closure_world, world_age_field, ctx.tbaa().tbaa_gcframe, ctx.types().alignof_ptr); - // Load closure env - Value *envaddr = emit_ptrgep(ctx, argaddr, offsetof(jl_opaque_closure_t, captures)); + if (s == jl_unused_sym || vi.value.constant) + continue; - jl_cgval_t closure_env = typed_load(ctx, envaddr, NULL, (jl_value_t*)jl_any_type, - nullptr, nullptr, true, AtomicOrdering::NotAtomic, false, sizeof(void*)); - theArg = update_julia_type(ctx, closure_env, vi.value.typ); - } - else if (specsig) { - theArg = get_specsig_arg(argType, llvmArgType, isboxed); + // Load closure env, which is always a boxed value (usually some Tuple) currently + Value *envaddr = emit_ptrgep(ctx, oc_this, offsetof(jl_opaque_closure_t, captures)); + theArg = typed_load(ctx, envaddr, NULL, (jl_value_t*)vi.value.typ, + nullptr, nullptr, /*isboxed*/true, AtomicOrdering::NotAtomic, false, sizeof(void*)); } else { - if (i == 0) { - // first (function) arg is separate in jlcall - theArg = mark_julia_type(ctx, fArg, true, vi.value.typ); + jl_value_t *argType = jl_nth_slot_type(lam->specTypes, i); + // TODO: jl_nth_slot_type should call jl_rewrap_unionall? + // specTypes is required to be a datatype by construction for specsig, but maybe not otherwise + bool isboxed = deserves_argbox(argType); + Type *llvmArgType = NULL; + llvmArgType = isboxed ? 
ctx.types().T_prjlvalue : julia_type_to_llvm(ctx, argType); + if (s == jl_unused_sym || vi.value.constant) { + assert(vi.boxroot == NULL); + if (specsig && !type_is_ghost(llvmArgType) && !is_uniquerep_Type(argType)) { + ++AI; + auto tracked = CountTrackedPointers(llvmArgType); + if (tracked.count && !tracked.all) + ++AI; + } + continue; + } + if (specsig) { + theArg = get_specsig_arg(argType, llvmArgType, isboxed); } else { - if (i == 0) { - // first (function) arg is separate in jlcall - theArg = mark_julia_type(ctx, fArg, true, vi.value.typ); + if (i == 0) { + // first (function) arg is separate in jlcall + theArg = mark_julia_type(ctx, fArg, true, vi.value.typ); + } + else { + Value *argPtr = emit_ptrgep(ctx, argArray, (i - 1) * ctx.types().sizeof_ptr); + jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); + Value *load = ai.decorateInst(maybe_mark_load_dereferenceable( + ctx.builder.CreateAlignedLoad(ctx.types().T_prjlvalue, argPtr, Align(sizeof(void*))), + false, vi.value.typ)); + theArg = mark_julia_type(ctx, load, true, vi.value.typ); + if (debug_enabled && vi.dinfo && !vi.boxroot) { + SmallVector<uint64_t, 8> addr; addr.push_back(llvm::dwarf::DW_OP_deref); - addr.push_back(llvm::dwarf::DW_OP_plus_uconst); - addr.push_back((i - 1) * sizeof(void*)); - if ((Metadata*)vi.dinfo->getType() != debugcache.jl_pvalue_dillvmt) + addr.push_back(llvm::dwarf::DW_OP_plus_uconst); + addr.push_back((i - 1) * sizeof(void*)); + if ((Metadata*)vi.dinfo->getType() != debugcache.jl_pvalue_dillvmt) + addr.push_back(llvm::dwarf::DW_OP_deref); + dbuilder.insertDeclare(pargArray, vi.dinfo, dbuilder.createExpression(addr), + topdebugloc, + ctx.builder.GetInsertBlock()); + } } } } @@ -9996,7 +9983,6 @@ jl_llvm_functions_t jl_emit_code( { JL_TIMING(CODEGEN, CODEGEN_LLVM); jl_timing_show_func_sig((jl_value_t *)li->specTypes, JL_TIMING_DEFAULT_BLOCK); - // caller must hold codegen_lock jl_llvm_functions_t decls = {}; assert((params.params == &jl_default_cgparams /* fast path */ || !params.cache || compare_cgparams(params.params, &jl_default_cgparams)) && @@ -10031,33 +10017,38 @@ jl_llvm_functions_t jl_emit_code( return decls; } +static int effects_foldable(uint32_t effects) +{ + // N.B.: This needs to be kept in sync with Core.Compiler.is_foldable(effects, true) + return ((effects & 0x7) == 0) && // is_consistent(effects) + (((effects >> 10) & 0x03) == 0) && // is_noub(effects) + (((effects >> 3) & 0x03) == 0) && // is_effect_free(effects) + ((effects >> 6) & 0x01); // is_terminates(effects) +} + static jl_llvm_functions_t jl_emit_oc_wrapper(orc::ThreadSafeModule &m, jl_codegen_params_t &params, jl_method_instance_t *mi, jl_value_t *rettype) { - Module *M = m.getModuleUnlocked(); - jl_codectx_t ctx(M->getContext(), params, 0, 0); - ctx.name = M->getModuleIdentifier().data(); - std::string funcName = get_function_name(true, false, ctx.name, ctx.emission_context.TargetTriple); jl_llvm_functions_t declarations; declarations.functionObject = "jl_f_opaque_closure_call"; if (uses_specsig(mi->specTypes, false,
rettype, true)) { + // context lock is held by params + Module *M = m.getModuleUnlocked(); + jl_codectx_t ctx(M->getContext(), params, 0, 0); + ctx.name = M->getModuleIdentifier().data(); + std::string funcName = get_function_name(true, false, ctx.name, ctx.emission_context.TargetTriple); jl_returninfo_t returninfo = get_specsig_function(ctx, M, NULL, funcName, mi->specTypes, rettype, true, JL_FEAT_TEST(ctx,gcstack_arg)); Function *gf_thunk = cast<Function>(returninfo.decl.getCallee()); jl_init_function(gf_thunk, ctx.emission_context.TargetTriple); size_t nrealargs = jl_nparams(mi->specTypes); - emit_cfunc_invalidate(gf_thunk, returninfo.cc, returninfo.return_roots, mi->specTypes, rettype, true, nrealargs, ctx.emission_context, ctx.min_world, ctx.max_world); + emit_specsig_to_fptr1(gf_thunk, returninfo.cc, returninfo.return_roots, + mi->specTypes, rettype, true, nrealargs, ctx.emission_context, + prepare_call_in(gf_thunk->getParent(), jlopaque_closure_call_func), // TODO: this could call emit_oc_call directly + ctx.min_world, ctx.max_world); declarations.specFunctionObject = funcName; } return declarations; } -static int effects_foldable(uint32_t effects) -{ - // N.B.: This needs to be kept in sync with Core.Compiler.is_foldable(effects, true) - return ((effects & 0x7) == 0) && // is_consistent(effects) - (((effects >> 10) & 0x03) == 0) && // is_noub(effects) - (((effects >> 3) & 0x03) == 0) && // is_effect_free(effects) - ((effects >> 6) & 0x01); // is_terminates(effects) -} jl_llvm_functions_t jl_emit_codeinst( orc::ThreadSafeModule &m, @@ -10070,12 +10061,14 @@ jl_llvm_functions_t jl_emit_codeinst( JL_GC_PUSH1(&src); if (!src) { src = (jl_code_info_t*)jl_atomic_load_relaxed(&codeinst->inferred); - jl_method_t *def = codeinst->def->def.method; + jl_method_instance_t *mi = codeinst->def; + jl_method_t *def = mi->def.method; // Check if this is the generic method for opaque closure wrappers - - // if so, generate the specsig -> invoke converter. + // if so, this must compile specptr such that it holds the specptr -> invoke wrapper + // to satisfy the dispatching implementation requirements of jl_f_opaque_closure_call if (def == jl_opaque_closure_method) { JL_GC_POP(); - return jl_emit_oc_wrapper(m, params, codeinst->def, codeinst->rettype); + return jl_emit_oc_wrapper(m, params, mi, codeinst->rettype); } if (src && (jl_value_t*)src != jl_nothing && jl_is_method(def)) src = jl_uncompress_ir(def, codeinst, (jl_value_t*)src); @@ -10149,135 +10142,15 @@ jl_llvm_functions_t jl_emit_codeinst( return decls; } - -void jl_compile_workqueue( - jl_codegen_params_t &params, - CompilationPolicy policy) -{ - JL_TIMING(CODEGEN, CODEGEN_Workqueue); - jl_code_info_t *src = NULL; - JL_GC_PUSH1(&src); - while (!params.workqueue.empty()) { - jl_code_instance_t *codeinst; - auto it = params.workqueue.back(); - codeinst = it.first; - auto proto = it.second; - params.workqueue.pop_back(); - // try to emit code for this item from the workqueue - StringRef preal_decl = ""; - bool preal_specsig = false; - jl_callptr_t invoke = NULL; - if (params.cache) { - // WARNING: this correctness is protected by an outer lock - uint8_t specsigflags; - void *fptr; - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - //if (specsig ?
specsigflags & 0b1 : invoke == jl_fptr_args_addr) - if (invoke == jl_fptr_args_addr) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - } - else if (specsigflags & 0b1) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - preal_specsig = true; - } - } - if (preal_decl.empty()) { - auto it = params.compiled_functions.find(codeinst); - if (it == params.compiled_functions.end()) { - // Reinfer the function. The JIT came along and removed the inferred - // method body. See #34993 - if ((policy != CompilationPolicy::Default || params.params->trim) && - jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { - // XXX: SOURCE_MODE_FORCE_SOURCE is wrong here (neither sufficient nor necessary) - codeinst = jl_type_infer(codeinst->def, jl_atomic_load_relaxed(&codeinst->max_world), SOURCE_MODE_FORCE_SOURCE); - } - if (codeinst) { - orc::ThreadSafeModule result_m = - jl_create_ts_module(name_from_method_instance(codeinst->def), - params.tsctx, params.DL, params.TargetTriple); - auto decls = jl_emit_codeinst(result_m, codeinst, NULL, params); - if (result_m) - it = params.compiled_functions.insert(std::make_pair(codeinst, std::make_pair(std::move(result_m), std::move(decls)))).first; - } - } - if (it != params.compiled_functions.end()) { - auto &decls = it->second.second; - if (decls.functionObject == "jl_fptr_args") { - preal_decl = decls.specFunctionObject; - } - else if (decls.functionObject != "jl_fptr_sparam") { - preal_decl = decls.specFunctionObject; - preal_specsig = true; - } - } - } - // patch up the prototype we emitted earlier - Module *mod = proto.decl->getParent(); - assert(proto.decl->isDeclaration()); - if (proto.specsig) { - // expected specsig - if (!preal_specsig) { - if (params.params->trim) { - auto it = params.compiled_functions.find(codeinst); //TODO: What to do about this - errs() << "Bailed out to invoke when compiling:"; - jl_(codeinst->def); - if (it != params.compiled_functions.end()) { - errs() << it->second.second.functionObject << "\n"; - errs() << it->second.second.specFunctionObject << "\n"; - } else - errs() << "codeinst not in compile_functions\n"; - } - // emit specsig-to-(jl)invoke conversion - StringRef invokeName; - if (invoke != NULL) - invokeName = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); - Function *preal = emit_tojlinvoke(codeinst, invokeName, mod, params); - proto.decl->setLinkage(GlobalVariable::InternalLinkage); - //protodecl->setAlwaysInline(); - jl_init_function(proto.decl, params.TargetTriple); - size_t nrealargs = jl_nparams(codeinst->def->specTypes); // number of actual arguments being passed - // TODO: maybe this can be cached in codeinst->specfptr? 
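
Editor's note: the effects_foldable helper, moved above jl_emit_oc_wrapper in this diff, packs its test into raw bit masks that must stay in sync with Core.Compiler.is_foldable(effects, true). A minimal, self-contained sketch of the same checks follows; the named helpers are illustrative stand-ins for Core.Compiler's accessors and are not part of the Julia C API.

    // Decodes the packed effects bits exactly as effects_foldable does in the diff.
    #include <cassert>
    #include <cstdint>

    static bool is_consistent(uint32_t effects)  { return (effects & 0x7) == 0; }
    static bool is_noub(uint32_t effects)        { return ((effects >> 10) & 0x03) == 0; }
    static bool is_effect_free(uint32_t effects) { return ((effects >> 3) & 0x03) == 0; }
    static bool is_terminates(uint32_t effects)  { return ((effects >> 6) & 0x01) != 0; }

    static bool effects_foldable(uint32_t effects)
    {
        // fold only if consistent, UB-free, effect-free, and terminating
        return is_consistent(effects) && is_noub(effects) &&
               is_effect_free(effects) && is_terminates(effects);
    }

    int main()
    {
        uint32_t e = 1u << 6;               // only the terminates bit set
        assert(effects_foldable(e));
        assert(!effects_foldable(e | 0x1)); // any consistency bit clears foldability
        return 0;
    }
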
- emit_cfunc_invalidate(proto.decl, proto.cc, proto.return_roots, codeinst->def->specTypes, codeinst->rettype, false, nrealargs, params, preal, 0, 0); - preal_decl = ""; // no need to fixup the name - } - else { - assert(!preal_decl.empty()); - } - } - else { - // expected non-specsig - if (preal_decl.empty() || preal_specsig) { - // emit jlcall1-to-(jl)invoke conversion - StringRef invokeName; - if (invoke != NULL) - invokeName = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); - preal_decl = emit_tojlinvoke(codeinst, invokeName, mod, params)->getName(); - } - } - if (!preal_decl.empty()) { - // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(preal_decl)) { - if (proto.decl != specfun) - proto.decl->replaceAllUsesWith(specfun); - } - else { - proto.decl->setName(preal_decl); - } - } - } - JL_GC_POP(); -} - - // --- initialization --- -SmallVector, 0> gv_for_global; +static auto gv_for_global = new SmallVector, 0>(); static void global_jlvalue_to_llvm(JuliaVariable *var, jl_value_t **addr) { - gv_for_global.push_back(std::make_pair(addr, var)); + gv_for_global->push_back(std::make_pair(addr, var)); } static JuliaVariable *julia_const_gv(jl_value_t *val) { - for (auto &kv : gv_for_global) { + for (auto &kv : *gv_for_global) { if (*kv.first == val) return kv.second; } @@ -10286,6 +10159,9 @@ static JuliaVariable *julia_const_gv(jl_value_t *val) static void init_jit_functions(void) { + add_named_global("jl_fptr_args", jl_fptr_args_addr); + add_named_global("jl_fptr_sparam", jl_fptr_sparam_addr); + add_named_global("jl_f_opaque_closure_call", &jl_f_opaque_closure_call); add_named_global(jl_small_typeof_var, &jl_small_typeof); add_named_global(jlstack_chk_guard_var, &__stack_chk_guard); add_named_global(jlRTLD_DEFAULT_var, &jl_RTLD_DEFAULT_handle); diff --git a/src/debug-registry.h b/src/debug-registry.h index 4c9e13d8cd72d..4d0b7a44f19e5 100644 --- a/src/debug-registry.h +++ b/src/debug-registry.h @@ -32,7 +32,7 @@ class JITDebugInfoRegistry std::unique_lock lock; CResourceT &resource; - Lock(std::mutex &mutex, CResourceT &resource) JL_NOTSAFEPOINT : lock(mutex), resource(resource) {} + Lock(std::mutex &mutex, CResourceT &resource) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER : lock(mutex), resource(resource) {} Lock(Lock &&) JL_NOTSAFEPOINT = default; Lock &operator=(Lock &&) JL_NOTSAFEPOINT = default; @@ -56,7 +56,7 @@ class JITDebugInfoRegistry return resource; } - ~Lock() JL_NOTSAFEPOINT = default; + ~Lock() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE = default; }; private: @@ -68,15 +68,15 @@ class JITDebugInfoRegistry Locked(ResourceT resource = ResourceT()) JL_NOTSAFEPOINT : mutex(), resource(std::move(resource)) {} - LockT operator*() JL_NOTSAFEPOINT { + LockT operator*() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER { return LockT(mutex, resource); } - ConstLockT operator*() const JL_NOTSAFEPOINT { + ConstLockT operator*() const JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER { return ConstLockT(mutex, resource); } - ~Locked() JL_NOTSAFEPOINT = default; + ~Locked() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE = default; }; struct image_info_t { @@ -105,6 +105,7 @@ class JITDebugInfoRegistry std::unique_ptr object; std::unique_ptr context; LazyObjectInfo() = delete; + ~LazyObjectInfo() JL_NOTSAFEPOINT = default; }; struct SectionInfo { @@ -113,6 +114,7 @@ class JITDebugInfoRegistry ptrdiff_t slide; uint64_t SectionIndex; SectionInfo() = delete; + ~SectionInfo() JL_NOTSAFEPOINT = default; }; template @@ -145,7 +147,7 @@ class 
JITDebugInfoRegistry void add_code_in_flight(llvm::StringRef name, jl_code_instance_t *codeinst, const llvm::DataLayout &DL) JL_NOTSAFEPOINT; jl_method_instance_t *lookupLinfo(size_t pointer) JL_NOTSAFEPOINT; void registerJITObject(const llvm::object::ObjectFile &Object, - std::function getLoadAddress); + std::function getLoadAddress) JL_NOTSAFEPOINT; objectmap_t& getObjectMap() JL_NOTSAFEPOINT; void add_image_info(image_info_t info) JL_NOTSAFEPOINT; bool get_image_info(uint64_t base, image_info_t *info) const JL_NOTSAFEPOINT; diff --git a/src/debuginfo.cpp b/src/debuginfo.cpp index f6fca47e9a889..31f1ba8281a89 100644 --- a/src/debuginfo.cpp +++ b/src/debuginfo.cpp @@ -58,7 +58,7 @@ extern "C" void __register_frame(void*) JL_NOTSAFEPOINT; extern "C" void __deregister_frame(void*) JL_NOTSAFEPOINT; template -static void processFDEs(const char *EHFrameAddr, size_t EHFrameSize, callback f) +static void processFDEs(const char *EHFrameAddr, size_t EHFrameSize, callback f) JL_NOTSAFEPOINT { const char *P = EHFrameAddr; const char *End = P + EHFrameSize; @@ -164,6 +164,12 @@ static void jl_profile_atomic(T f) JL_NOTSAFEPOINT // --- storing and accessing source location metadata --- void jl_add_code_in_flight(StringRef name, jl_code_instance_t *codeinst, const DataLayout &DL) { + // Non-opaque-closure MethodInstances are considered globally rooted + // through their methods, but for OC, we need to create a global root + // here. + jl_method_instance_t *mi = codeinst->def; + if (jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure) + jl_as_global_root((jl_value_t*)mi, 1); getJITDebugRegistry().add_code_in_flight(name, codeinst, DL); } @@ -369,11 +375,6 @@ void JITDebugInfoRegistry::registerJITObject(const object::ObjectFile &Object, if (codeinst) { JL_GC_PROMISE_ROOTED(codeinst); mi = codeinst->def; - // Non-opaque-closure MethodInstances are considered globally rooted - // through their methods, but for OC, we need to create a global root - // here. - if (jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure) - mi = (jl_method_instance_t*)jl_as_global_root((jl_value_t*)mi, 1); } jl_profile_atomic([&]() JL_NOTSAFEPOINT { if (mi) @@ -1281,14 +1282,14 @@ void register_eh_frames(uint8_t *Addr, size_t Size) { // On OS X OS X __register_frame takes a single FDE as an argument. // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html - processFDEs((char*)Addr, Size, [](const char *Entry) { + processFDEs((char*)Addr, Size, [](const char *Entry) JL_NOTSAFEPOINT { getJITDebugRegistry().libc_frames.libc_register_frame(Entry); }); } void deregister_eh_frames(uint8_t *Addr, size_t Size) { - processFDEs((char*)Addr, Size, [](const char *Entry) { + processFDEs((char*)Addr, Size, [](const char *Entry) JL_NOTSAFEPOINT { getJITDebugRegistry().libc_frames.libc_deregister_frame(Entry); }); } @@ -1300,7 +1301,7 @@ void deregister_eh_frames(uint8_t *Addr, size_t Size) // Skip over an arbitrary long LEB128 encoding. // Return the pointer to the first unprocessed byte. -static const uint8_t *consume_leb128(const uint8_t *Addr, const uint8_t *End) +static const uint8_t *consume_leb128(const uint8_t *Addr, const uint8_t *End) JL_NOTSAFEPOINT { const uint8_t *P = Addr; while ((*P >> 7) != 0 && P < End) @@ -1312,7 +1313,7 @@ static const uint8_t *consume_leb128(const uint8_t *Addr, const uint8_t *End) // bytes than what there are more bytes than what the type can store. // Adjust the pointer to the first unprocessed byte. 
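
Editor's note: consume_leb128 and parse_leb128 below walk DWARF's variable-length LEB128 encoding (parse_leb128 additionally handles signed types via std::make_unsigned). For reference, a self-contained sketch of the unsigned decoding; it assumes well-formed input, whereas the real parsers also bounds-check against End.

    #include <cassert>
    #include <cstdint>

    // ULEB128: little-endian groups of 7 payload bits; the high bit of each
    // byte flags whether another byte follows.
    static uint64_t decode_uleb128(const uint8_t *&p)
    {
        uint64_t value = 0;
        unsigned shift = 0;
        uint8_t byte;
        do {
            byte = *p++;
            value |= uint64_t(byte & 0x7f) << shift;
            shift += 7;
        } while (byte & 0x80);
        return value;
    }

    int main()
    {
        const uint8_t buf[] = {0xe5, 0x8e, 0x26}; // encodes 624485
        const uint8_t *p = buf;
        assert(decode_uleb128(p) == 624485);
        assert(p == buf + 3); // pointer is left at the first unprocessed byte
        return 0;
    }
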
template static T parse_leb128(const uint8_t *&Addr, - const uint8_t *End) + const uint8_t *End) JL_NOTSAFEPOINT { typedef typename std::make_unsigned::type uT; uT v = 0; @@ -1335,7 +1336,7 @@ template static T parse_leb128(const uint8_t *&Addr, } template -static U safe_trunc(T t) +static U safe_trunc(T t) JL_NOTSAFEPOINT { assert((t >= static_cast(std::numeric_limits::min())) && (t <= static_cast(std::numeric_limits::max()))); @@ -1375,7 +1376,7 @@ enum DW_EH_PE : uint8_t { }; // Parse the CIE and return the type of encoding used by FDE -static DW_EH_PE parseCIE(const uint8_t *Addr, const uint8_t *End) +static DW_EH_PE parseCIE(const uint8_t *Addr, const uint8_t *End) JL_NOTSAFEPOINT { // https://www.airs.com/blog/archives/460 // Length (4 bytes) @@ -1481,7 +1482,7 @@ void register_eh_frames(uint8_t *Addr, size_t Size) // Now first count the number of FDEs size_t nentries = 0; - processFDEs((char*)Addr, Size, [&](const char*){ nentries++; }); + processFDEs((char*)Addr, Size, [&](const char*) JL_NOTSAFEPOINT { nentries++; }); if (nentries == 0) return; @@ -1510,7 +1511,7 @@ void register_eh_frames(uint8_t *Addr, size_t Size) // CIE's (may not happen) without parsing it every time. const uint8_t *cur_cie = nullptr; DW_EH_PE encoding = DW_EH_PE_omit; - processFDEs((char*)Addr, Size, [&](const char *Entry) { + processFDEs((char*)Addr, Size, [&](const char *Entry) JL_NOTSAFEPOINT { // Skip Length (4bytes) and CIE offset (4bytes) uint32_t fde_size = *(const uint32_t*)Entry; uint32_t cie_id = ((const uint32_t*)Entry)[1]; @@ -1631,7 +1632,7 @@ void deregister_eh_frames(uint8_t *Addr, size_t Size) #endif extern "C" JL_DLLEXPORT_CODEGEN -uint64_t jl_getUnwindInfo_impl(uint64_t dwAddr) +uint64_t jl_getUnwindInfo_impl(uint64_t dwAddr) JL_NOTSAFEPOINT { // Might be called from unmanaged thread jl_lock_profile(); diff --git a/src/engine.cpp b/src/engine.cpp index 6db4dce44e48e..2b68de731c4dd 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -45,8 +45,8 @@ template<> struct llvm::DenseMapInfo { } }; -static std::mutex engine_lock; -static std::condition_variable engine_wait; +static std::mutex engine_lock; // n.b. 
this lock is only ever held briefly
+static std::condition_variable engine_wait; // but it may be waiting a while in this state
 // map from MethodInstance to threadid that owns it currently for inference
 static DenseMap<InferKey, ReservationInfo> Reservations;
 // vector of which threads are blocked and which lease they need
@@ -63,55 +63,51 @@ jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner
     ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph
     jl_code_instance_t *ci = jl_new_codeinst_uninit(m, owner); // allocate a placeholder
     JL_GC_PUSH1(&ci);
-    int8_t gc_state = jl_gc_safe_enter(ct->ptls);
-    InferKey key = {m, owner};
-    std::unique_lock<std::mutex> lock(engine_lock);
     auto tid = jl_atomic_load_relaxed(&ct->tid);
-    if ((signed)Awaiting.size() < tid + 1)
-        Awaiting.resize(tid + 1);
-    while (1) {
-        auto record = Reservations.find(key);
-        if (record == Reservations.end()) {
-            Reservations[key] = ReservationInfo{tid, ci};
-            lock.unlock();
-            jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
-            JL_GC_POP();
-            return ci;
-        }
-        // before waiting, need to run deadlock/cycle detection
-        // there is a cycle if the thread holding our lease is blocked
-        // and waiting for (transitively) any lease that is held by this thread
-        auto wait_tid = record->second.tid;
-        while (1) {
-            if (wait_tid == tid) {
-                lock.unlock();
-                jl_gc_safe_leave(ct->ptls, gc_state); // contains jl_gc_safepoint
-                JL_GC_POP();
-                ct->ptls->engine_nqueued--;
-                return ci; // break the cycle
+    if (([tid, m, owner, ci] () -> bool { // necessary scope block / lambda for unique_lock
+        jl_unique_gcsafe_lock lock(engine_lock);
+        InferKey key{m, owner};
+        if ((signed)Awaiting.size() < tid + 1)
+            Awaiting.resize(tid + 1);
+        while (1) {
+            auto record = Reservations.find(key);
+            if (record == Reservations.end()) {
+                Reservations[key] = ReservationInfo{tid, ci};
+                return false;
+            }
+            // before waiting, need to run deadlock/cycle detection
+            // there is a cycle if the thread holding our lease is blocked
+            // and waiting for (transitively) any lease that is held by this thread
+            auto wait_tid = record->second.tid;
+            while (1) {
+                if (wait_tid == tid)
+                    return true;
+                if ((signed)Awaiting.size() <= wait_tid)
+                    break; // no cycle, since it is running (and this should be unreachable)
+                auto key2 = Awaiting[wait_tid];
+                if (key2.mi == nullptr)
+                    break; // no cycle, since it is running
+                auto record2 = Reservations.find(key2);
+                if (record2 == Reservations.end())
+                    break; // no cycle, since it is about to resume
+                assert(wait_tid != record2->second.tid);
+                wait_tid = record2->second.tid;
+            }
+            Awaiting[tid] = key;
+            lock.wait(engine_wait);
+            Awaiting[tid] = InferKey{};
             }
-            if ((signed)Awaiting.size() <= wait_tid)
-                break; // no cycle, since it is running (and this should be unreachable)
-            auto key2 = Awaiting[wait_tid];
-            if (key2.mi == nullptr)
-                break; // no cycle, since it is running
-            auto record2 = Reservations.find(key2);
-            if (record2 == Reservations.end())
-                break; // no cycle, since it is about to resume
-            assert(wait_tid != record2->second.tid);
-            wait_tid = record2->second.tid;
-        }
-        Awaiting[tid] = key;
-        engine_wait.wait(lock);
-        Awaiting[tid] = InferKey{};
-    }
+    })())
+        ct->ptls->engine_nqueued--;
+    JL_GC_POP();
+    return ci;
 }
 
 int jl_engine_hasreserved(jl_method_instance_t *m, jl_value_t *owner)
 {
     jl_task_t *ct = jl_current_task;
     InferKey key = {m, owner};
-    std::unique_lock<std::mutex> lock(engine_lock);
+    std::unique_lock lock(engine_lock);
     auto record = Reservations.find(key);
     return record != Reservations.end() && record->second.tid == jl_atomic_load_relaxed(&ct->tid);
 }
@@ -123,7 +119,7 @@ STATIC_INLINE int gc_marked(uintptr_t bits) JL_NOTSAFEPOINT
 
 void jl_engine_sweep(jl_ptls_t *gc_all_tls_states)
 {
-    std::unique_lock<std::mutex> lock(engine_lock);
+    std::unique_lock lock(engine_lock);
     bool any = false;
     for (auto I = Reservations.begin(); I != Reservations.end(); ++I) {
         jl_code_instance_t *ci = I->second.ci;
@@ -142,7 +138,7 @@ void jl_engine_sweep(jl_ptls_t *gc_all_tls_states)
 void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src)
 {
     jl_task_t *ct = jl_current_task;
-    std::unique_lock<std::mutex> lock(engine_lock);
+    std::unique_lock lock(engine_lock);
     auto record = Reservations.find(InferKey{ci->def, ci->owner});
     if (record == Reservations.end() || record->second.ci != ci)
         return;
@@ -152,7 +148,6 @@ void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src)
     engine_wait.notify_all();
 }
 
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gf.c b/src/gf.c
index fc2e62ebff96b..e77c950c38ae4 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -3222,6 +3222,23 @@ jl_value_t *jl_argtype_with_function_type(jl_value_t *ft JL_MAYBE_UNROOTED, jl_v
     return tt;
 }
 
+// undo jl_argtype_with_function transform
+jl_value_t *jl_argtype_without_function(jl_value_t *ftypes)
+{
+    jl_value_t *types = jl_unwrap_unionall(ftypes);
+    size_t l = jl_nparams(types);
+    if (l == 1 && jl_is_vararg(jl_tparam0(types)))
+        return ftypes;
+    jl_value_t *tt = (jl_value_t*)jl_alloc_svec(l - 1);
+    JL_GC_PUSH1(&tt);
+    for (size_t i = 1; i < l; i++)
+        jl_svecset(tt, i - 1, jl_tparam(types, i));
+    tt = (jl_value_t*)jl_apply_tuple_type((jl_svec_t*)tt, 0);
+    tt = jl_rewrap_unionall_(tt, types);
+    JL_GC_POP();
+    return tt;
+}
+
 #ifdef JL_TRACE
 static int trace_en = 0;
 static int error_en = 1;
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 313449dda5557..8b8004af03616 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -64,9 +64,6 @@ using namespace llvm;
 #define DEBUG_TYPE "julia_jitlayers"
 
 STATISTIC(LinkedGlobals, "Number of globals linked");
-STATISTIC(CompiledCodeinsts, "Number of codeinsts compiled directly");
-STATISTIC(MaxWorkqueueSize, "Maximum number of elements in the workqueue");
-STATISTIC(IndirectCodeinsts, "Number of dependent codeinsts compiled");
 STATISTIC(SpecFPtrCount, "Number of specialized function pointers compiled");
 STATISTIC(UnspecFPtrCount, "Number of unspecialized function pointers compiled");
 STATISTIC(ModulesAdded, "Number of modules added to the JIT");
@@ -151,13 +148,6 @@ void jl_dump_llvm_opt_impl(void *s)
     **jl_ExecutionEngine->get_dump_llvm_opt_stream() = (ios_t*)s;
 }
 
-#ifndef JL_USE_JITLINK
-static int jl_add_to_ee(
-        orc::ThreadSafeModule &M,
-        const StringMap<orc::ThreadSafeModule*> &NewExports,
-        DenseMap<orc::ThreadSafeModule*, int> &Queued,
-        SmallVectorImpl<orc::ThreadSafeModule*> &Stack) JL_NOTSAFEPOINT;
-#endif
 static void jl_decorate_module(Module &M) JL_NOTSAFEPOINT;
 
 void jl_link_global(GlobalVariable *GV, void *addr) JL_NOTSAFEPOINT
@@ -187,214 +177,536 @@ void jl_jit_globals(std::map<void *, GlobalVariable*> &globals) JL_NOTSAFEPOINT
     }
 }
 
-// this generates llvm code for the lambda info
-// and adds the result to the jitlayers
-// (and the shadow module),
-// and generates code for it
-static jl_callptr_t _jl_compile_codeinst(
-        jl_code_instance_t *codeinst,
-        jl_code_info_t *src,
-        orc::ThreadSafeContext context)
+ // lock for places where only single threaded behavior is implemented, so we need GC support
+static jl_mutex_t jitlock;
+ // locks for adding external code to the JIT atomically
+static std::mutex extern_c_lock;
+ // locks and barriers for this state
+static std::mutex 
engine_lock; +static std::condition_variable engine_wait; +static int threads_in_compiler_phase; + // the TSM for each codeinst +static SmallVector sharedmodules; +static DenseMap emittedmodules; + // the invoke and specsig function names in the JIT +static DenseMap invokenames; + // everything that any thread wants to compile right now +static DenseSet compileready; + // everything that any thread has compiled recently +static DenseSet linkready; + // a map from a codeinst to the outgoing edges needed before linking it +static DenseMap> complete_graph; + // the state for each codeinst and the number of unresolved edges (we don't + // really need this once JITLink is available everywhere, since every module + // is automatically complete, and we can emit any required fixups later as a + // separate module) +static DenseMap> incompletemodules; + // the set of incoming unresolved edges resolved by a codeinstance +static DenseMap> incomplete_rgraph; + +// Lock hierarchy here: +// jitlock is outermost, can contain others and allows GC +// engine_lock is next +// ThreadSafeContext locks are next, they should not be nested (unless engine_lock is also held, but this may make TSAN sad anyways) +// extern_c_lock is next +// jl_ExecutionEngine internal locks are exclusive to this list, since OrcJIT promises to never hold a lock over a materialization unit: +// construct a query object from a query set and query handler +// lock the session +// lodge query against requested symbols, collect required materializers (if any) +// unlock the session +// dispatch materializers (if any) +// However, this guarantee relies on Julia releasing all TSC locks before causing any materialization units to be dispatched +// as materialization may need to acquire TSC locks. + + +static void finish_params(Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT { - // caller must hold codegen_lock - // and have disabled finalizers - uint64_t start_time = 0; - bool timed = !!*jl_ExecutionEngine->get_dump_compiles_stream(); - if (timed) - start_time = jl_hrtime(); + if (params._shared_module) { + sharedmodules.push_back(orc::ThreadSafeModule(std::move(params._shared_module), params.tsctx)); + } + + // In imaging mode, we can't inline global variable initializers in order to preserve + // the fiction that we don't know what loads from the global will return. Thus, we + // need to emit a separate module for the globals before any functions are compiled, + // to ensure that the globals are defined when they are compiled. 
+ if (params.imaging_mode) { + if (!params.global_targets.empty()) { + void **globalslots = new void*[params.global_targets.size()]; + void **slot = globalslots; + for (auto &global : params.global_targets) { + auto GV = global.second; + *slot = global.first; + jl_ExecutionEngine->addGlobalMapping(GV->getName(), (uintptr_t)slot); + slot++; + } +#ifdef __clang_analyzer__ + static void **leaker = globalslots; // for the purpose of the analyzer, we need to expressly leak this variable or it thinks we forgot to free it +#endif + } + } + else { + StringMap NewGlobals; + for (auto &global : params.global_targets) { + NewGlobals[global.second->getName()] = global.first; + } + for (auto &GV : M->globals()) { + auto InitValue = NewGlobals.find(GV.getName()); + if (InitValue != NewGlobals.end()) { + jl_link_global(&GV, InitValue->second); + } + } + } +} - assert(jl_is_code_instance(codeinst)); - JL_TIMING(CODEINST_COMPILE, CODEINST_COMPILE); - jl_callptr_t fptr = NULL; - // emit the code in LLVM IR form - jl_codegen_params_t params(std::move(context), jl_ExecutionEngine->getDataLayout(), jl_ExecutionEngine->getTargetTriple()); // Locks the context - params.cache = true; - params.imaging_mode = imaging_default(); - params.debug_level = jl_options.debug_level; - { - orc::ThreadSafeModule result_m = - jl_create_ts_module(name_from_method_instance(codeinst->def), params.tsctx, params.DL, params.TargetTriple); - jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, src, params); - if (result_m) - params.compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; - jl_compile_workqueue(params, CompilationPolicy::Default); - - if (params._shared_module) { - jl_ExecutionEngine->optimizeDLSyms(*params._shared_module); - jl_ExecutionEngine->addModule(orc::ThreadSafeModule(std::move(params._shared_module), params.tsctx)); +static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t ¶ms, bool forceall=false) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER +{ + jl_task_t *ct = jl_current_task; + decltype(params.workqueue) edges; + std::swap(params.workqueue, edges); + for (auto &it : edges) { + jl_code_instance_t *codeinst = it.first; + auto &proto = it.second; + // try to emit code for this item from the workqueue + StringRef invokeName = ""; + StringRef preal_decl = ""; + bool preal_specsig = false; + jl_callptr_t invoke = nullptr; + bool isedge = false; + assert(params.cache); + // Checking the cache here is merely an optimization and not strictly required + // But it must be consistent with the following invokenames lookup, which is protected by the engine_lock + uint8_t specsigflags; + void *fptr; + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) + if (invoke == jl_fptr_args_addr) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); } - - // In imaging mode, we can't inline global variable initializers in order to preserve - // the fiction that we don't know what loads from the global will return. Thus, we - // need to emit a separate module for the globals before any functions are compiled, - // to ensure that the globals are defined when they are compiled. 
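
Editor's note: the imaging-mode branch of finish_params above gives every global a slot in an intentionally leaked array and binds the JIT symbol to the slot's address, so generated code loads the value indirectly instead of baking it in. A toy model of that indirection follows; bind_globals and symbol_table are hypothetical stand-ins for finish_params and addGlobalMapping.

    #include <cstdio>
    #include <map>
    #include <string>

    static std::map<std::string, void**> symbol_table; // stand-in for the JIT symbol table

    static void bind_globals(const std::map<std::string, void*> &global_targets)
    {
        // intentionally leaked, like globalslots in the diff
        void **slots = new void*[global_targets.size()];
        size_t i = 0;
        for (auto &kv : global_targets) {
            slots[i] = kv.second;               // the current value lives in the slot
            symbol_table[kv.first] = &slots[i]; // generated code references the slot
            i++;
        }
    }

    int main()
    {
        int value = 42;
        bind_globals({{"my_global", &value}});
        // a load of "my_global" in generated code becomes a load through the slot
        void *loaded = *symbol_table["my_global"];
        std::printf("%d\n", *(int*)loaded); // prints 42
        return 0;
    }
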
- if (params.imaging_mode) { - // Won't contain any PLT/dlsym calls, so no need to optimize those - if (!params.global_targets.empty()) { - void **globalslots = new void*[params.global_targets.size()]; - void **slot = globalslots; - for (auto &global : params.global_targets) { - auto GV = global.second; - *slot = global.first; - jl_ExecutionEngine->addGlobalMapping(GV->getName(), (uintptr_t)slot); - slot++; + else if (specsigflags & 0b1) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + preal_specsig = true; + } + bool force = forceall || invoke != nullptr; + if (preal_decl.empty()) { + auto it = invokenames.find(codeinst); + if (it != invokenames.end()) { + auto &decls = it->second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + isedge = true; } -#ifdef __clang_analyzer__ - static void **leaker = globalslots; // for the purpose of the analyzer, we need to expressly leak this variable or it thinks we forgot to free it -#endif + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + isedge = true; + } + force = true; } } - else { - StringMap NewGlobals; - for (auto &global : params.global_targets) { - NewGlobals[global.second->getName()] = global.first; + if (!preal_decl.empty() || force) { + // if we have a prototype emitted, compare it to what we emitted earlier + Module *mod = proto.decl->getParent(); + assert(proto.decl->isDeclaration()); + Function *pinvoke = nullptr; + if (preal_decl.empty()) { + if (invoke != nullptr && invokeName.empty()) { + assert(invoke != jl_fptr_args_addr); + if (invoke == jl_fptr_sparam_addr) + invokeName = "jl_fptr_sparam"; + else if (invoke == jl_f_opaque_closure_call_addr) + invokeName = "jl_f_opaque_closure_call"; + else + invokeName = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + } + pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); + if (!proto.specsig) + proto.decl->replaceAllUsesWith(pinvoke); + isedge = false; } - for (auto &def : params.compiled_functions) { - auto M = std::get<0>(def.second).getModuleUnlocked(); - for (auto &GV : M->globals()) { - auto InitValue = NewGlobals.find(GV.getName()); - if (InitValue != NewGlobals.end()) { - jl_link_global(&GV, InitValue->second); - } + if (proto.specsig && !preal_specsig) { + // get or build an fptr1 that can invoke codeinst + if (pinvoke == nullptr) + pinvoke = get_or_emit_fptr1(preal_decl, mod); + // emit specsig-to-(jl)invoke conversion + proto.decl->setLinkage(GlobalVariable::InternalLinkage); + //protodecl->setAlwaysInline(); + jl_init_function(proto.decl, params.TargetTriple); + // TODO: maybe this can be cached in codeinst->specfptr? 
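
Editor's note: emit_specsig_to_fptr1, called just below, emits (as LLVM IR) an adapter from the native specsig calling convention to a generic boxed entry point. A hand-written C++ analogue of what such an adapter does; Value, fptr1_t, and the function names are illustrative, not Julia's actual types.

    #include <cassert>
    #include <cstddef>

    struct Value { long payload; };              // stand-in for a boxed jl_value_t
    typedef Value *(*fptr1_t)(Value *f, Value **args, size_t nargs);

    // generic boxed callee, analogous to an invoke/fptr1 entry point
    static Value *generic_add(Value *, Value **args, size_t nargs)
    {
        assert(nargs == 2);
        static Value result;                     // simplification: not thread-safe
        result.payload = args[0]->payload + args[1]->payload;
        return &result;
    }

    // the adapter: native arguments in, boxed call out, unboxed result back
    static long specsig_add(long a, long b)
    {
        Value boxed_a{a}, boxed_b{b};
        Value *argv[2] = {&boxed_a, &boxed_b};
        return generic_add(nullptr, argv, 2)->payload;
    }

    int main()
    {
        assert(specsig_add(2, 3) == 5);
        return 0;
    }
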
+ int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) + jl_method_instance_t *mi = codeinst->def; + size_t nrealargs = jl_nparams(mi->specTypes); // number of actual arguments being passed + bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; + emit_specsig_to_fptr1(proto.decl, proto.cc, proto.return_roots, mi->specTypes, codeinst->rettype, is_opaque_closure, nrealargs, params, pinvoke, 0, 0); + jl_gc_unsafe_leave(ct->ptls, gc_state); + preal_decl = ""; // no need to fixup the name + } + if (!preal_decl.empty()) { + // merge and/or rename this prototype to the real function + if (Value *specfun = mod->getNamedValue(preal_decl)) { + if (proto.decl != specfun) + proto.decl->replaceAllUsesWith(specfun); + } + else { + proto.decl->setName(preal_decl); } } - } - -#ifndef JL_USE_JITLINK - // Collect the exported functions from the params.compiled_functions modules, - // which form dependencies on which functions need to be - // compiled first. Cycles of functions are compiled together. - // (essentially we compile a DAG of SCCs in reverse topological order, - // if we treat declarations of external functions as edges from declaration - // to definition) - StringMap NewExports; - for (auto &def : params.compiled_functions) { - orc::ThreadSafeModule &TSM = std::get<0>(def.second); - //The underlying context object is still locked because params is not destroyed yet - auto M = TSM.getModuleUnlocked(); - jl_ExecutionEngine->optimizeDLSyms(*M); - for (auto &F : M->global_objects()) { - if (!F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { - NewExports[F.getName()] = &TSM; + if (proto.oc) { // additionally, if we are dealing with an OC constructor, then we might also need to fix up the fptr1 reference too + assert(proto.specsig); + StringRef ocinvokeDecl = invokeName; + if (invoke != nullptr && ocinvokeDecl.empty()) { + // check for some special tokens used by opaque_closure.c and convert those to their real functions + assert(invoke != jl_fptr_args_addr); + assert(invoke != jl_fptr_sparam_addr); + if (invoke == jl_fptr_interpret_call_addr) + ocinvokeDecl = "jl_fptr_interpret_call"; + else if (invoke == jl_fptr_const_return_addr) + ocinvokeDecl = "jl_fptr_const_return"; + else if (invoke == jl_f_opaque_closure_call_addr) + ocinvokeDecl = "jl_f_opaque_closure_call"; + //else if (invoke == jl_interpret_opaque_closure_addr) + else + ocinvokeDecl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + } + // if OC expected a specialized specsig dispatch, but we don't have it, use the inner trampoline here too + // XXX: this invoke translation logic is supposed to exactly match new_opaque_closure + if (!preal_specsig || ocinvokeDecl == "jl_f_opaque_closure_call" || ocinvokeDecl == "jl_fptr_interpret_call" || ocinvokeDecl == "jl_fptr_const_return") { + if (pinvoke == nullptr) + ocinvokeDecl = get_or_emit_fptr1(preal_decl, mod)->getName(); + else + ocinvokeDecl = pinvoke->getName(); + } + assert(!ocinvokeDecl.empty()); + assert(ocinvokeDecl != "jl_fptr_args"); + assert(ocinvokeDecl != "jl_fptr_sparam"); + // merge and/or rename this prototype to the real function + if (Value *specfun = mod->getNamedValue(ocinvokeDecl)) { + if (proto.oc != specfun) + proto.oc->replaceAllUsesWith(specfun); + } + else { + proto.oc->setName(ocinvokeDecl); } } } - DenseMap Queued; - SmallVector Stack; - for (auto &def : params.compiled_functions) { - // Add the results to the 
execution engine now - orc::ThreadSafeModule &M = std::get<0>(def.second); - jl_add_to_ee(M, NewExports, Queued, Stack); - assert(Queued.empty() && Stack.empty() && !M); + else { + isedge = true; + params.workqueue.push_back(it); + incomplete_rgraph[codeinst].push_back(callee); } -#else - for (auto &def : params.compiled_functions) { - // Add the results to the execution engine now - orc::ThreadSafeModule &M = std::get<0>(def.second); - if (M) - jl_ExecutionEngine->addModule(std::move(M)); + if (isedge) + complete_graph[callee].push_back(codeinst); + } + return params.workqueue.size(); +} + +// test whether codeinst->invoke is usable already without further compilation needed +static bool jl_is_compiled_codeinst(jl_code_instance_t *codeinst) JL_NOTSAFEPOINT +{ + auto invoke = jl_atomic_load_relaxed(&codeinst->invoke); + if (invoke == nullptr || invoke == jl_fptr_wait_for_compiled_addr) + return false; + return true; +} + +// move codeinst (and deps) from incompletemodules to emitted modules +// and populate compileready from complete_graph +static void prepare_compile(jl_code_instance_t *codeinst) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER +{ + SmallVector workqueue; + workqueue.push_back(codeinst); + while (!workqueue.empty()) { + codeinst = workqueue.pop_back_val(); + if (!invokenames.count(codeinst)) { + // this means it should be compiled already while the callee was in stasis + assert(jl_is_compiled_codeinst(codeinst)); + continue; } -#endif - ++CompiledCodeinsts; - MaxWorkqueueSize.updateMax(params.compiled_functions.size()); - IndirectCodeinsts += params.compiled_functions.size() - 1; - } - - // batch compile job for all new functions - SmallVector NewDefs; - for (auto &def : params.compiled_functions) { - jl_llvm_functions_t &decls = std::get<1>(def.second); - if (decls.functionObject != "jl_fptr_args" && - decls.functionObject != "jl_fptr_sparam" && - decls.functionObject != "jl_f_opaque_closure_call") - NewDefs.push_back(decls.functionObject); - if (!decls.specFunctionObject.empty()) - NewDefs.push_back(decls.specFunctionObject); - } - auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); - - size_t i = 0; - size_t nextaddr = 0; - for (auto &def : params.compiled_functions) { - jl_code_instance_t *this_code = def.first; - if (i < jl_timing_print_limit) - jl_timing_show_func_sig(this_code->def->specTypes, JL_TIMING_DEFAULT_BLOCK); - - jl_llvm_functions_t &decls = std::get<1>(def.second); - jl_callptr_t addr; - bool isspecsig = false; - if (decls.functionObject == "jl_fptr_args") { - addr = jl_fptr_args_addr; + // if this was incomplete, force completion now of it + auto it = incompletemodules.find(codeinst); + if (it != incompletemodules.end()) { + int waiting = 0; + auto &edges = complete_graph[codeinst]; + auto edges_end = std::remove_if(edges.begin(), edges.end(), [&waiting, codeinst] (jl_code_instance_t *edge) JL_NOTSAFEPOINT -> bool { + auto &redges = incomplete_rgraph[edge]; + // waiting += std::erase(redges, codeinst); + auto redges_end = std::remove(redges.begin(), redges.end(), codeinst); + if (redges_end != redges.end()) { + waiting += redges.end() - redges_end; + redges.erase(redges_end, redges.end()); + assert(!invokenames.count(edge)); + } + return !invokenames.count(edge); + }); + edges.erase(edges_end, edges.end()); + assert(waiting == std::get<1>(it->second)); + std::get<1>(it->second) = 0; + auto ¶ms = std::get<0>(it->second); + params.tsctx_lock = params.tsctx.getLock(); + waiting = jl_analyze_workqueue(codeinst, params, true); // may safepoint + 
assert(!waiting); (void)waiting; + Module *M = emittedmodules[codeinst].getModuleUnlocked(); + finish_params(M, params); + incompletemodules.erase(it); + } + // and then indicate this should be compiled now + if (!linkready.count(codeinst) && compileready.insert(codeinst).second) { + auto edges = complete_graph.find(codeinst); + if (edges != complete_graph.end()) { + workqueue.append(edges->second); + } + } + } +} + +// notify any other pending work that this edge now has code defined +static void complete_emit(jl_code_instance_t *edge) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER +{ + auto notify = incomplete_rgraph.find(edge); + if (notify == incomplete_rgraph.end()) + return; + auto redges = std::move(notify->second); + incomplete_rgraph.erase(notify); + for (size_t i = 0; i < redges.size(); i++) { + jl_code_instance_t *callee = redges[i]; + auto it = incompletemodules.find(callee); + assert(it != incompletemodules.end()); + if (--std::get<1>(it->second) == 0) { + auto ¶ms = std::get<0>(it->second); + params.tsctx_lock = params.tsctx.getLock(); + assert(callee == it->first); + int waiting = jl_analyze_workqueue(callee, params); // may safepoint + assert(!waiting); (void)waiting; + Module *M = emittedmodules[callee].getModuleUnlocked(); + finish_params(M, params); + incompletemodules.erase(it); } - else if (decls.functionObject == "jl_fptr_sparam") { - addr = jl_fptr_sparam_addr; + } +} + + +// set the invoke field for codeinst (and all deps, and assist with other pending work from other threads) now +static void jl_compile_codeinst_now(jl_code_instance_t *codeinst) +{ + jl_unique_gcsafe_lock lock(engine_lock); + if (!invokenames.count(codeinst)) + return; + threads_in_compiler_phase++; + prepare_compile(codeinst); // may safepoint + while (1) { + // TODO: split up this work by ThreadSafeContext, so two threads don't need to get the same locks and stall + if (!sharedmodules.empty()) { + auto TSM = sharedmodules.pop_back_val(); + lock.native.unlock(); + { + auto Lock = TSM.getContext().getLock(); + jl_ExecutionEngine->optimizeDLSyms(*TSM.getModuleUnlocked()); // may safepoint + } + jl_ExecutionEngine->addModule(std::move(TSM)); + lock.native.lock(); } - else if (decls.functionObject == "jl_f_opaque_closure_call") { - addr = jl_f_opaque_closure_call_addr; + else if (!compileready.empty()) { + // move a function from compileready to linkready then compile it + auto compilenext = compileready.begin(); + codeinst = *compilenext; + compileready.erase(compilenext); + auto TSMref = emittedmodules.find(codeinst); + assert(TSMref != emittedmodules.end()); + auto TSM = std::move(TSMref->second); + linkready.insert(codeinst); + emittedmodules.erase(TSMref); + lock.native.unlock(); + uint64_t start_time = jl_hrtime(); + { + auto Lock = TSM.getContext().getLock(); + jl_ExecutionEngine->optimizeDLSyms(*TSM.getModuleUnlocked()); // may safepoint + } + jl_ExecutionEngine->addModule(std::move(TSM)); // may safepoint + // If logging of the compilation stream is enabled, + // then dump the method-instance specialization type to the stream + jl_method_instance_t *mi = codeinst->def; + if (jl_is_method(mi->def.method)) { + auto stream = *jl_ExecutionEngine->get_dump_compiles_stream(); + if (stream) { + uint64_t end_time = jl_hrtime(); + ios_printf(stream, "%" PRIu64 "\t\"", end_time - start_time); + jl_static_show((JL_STREAM*)stream, mi->specTypes); + ios_printf(stream, "\"\n"); + } + } + lock.native.lock(); } else { - assert(NewDefs[nextaddr] == decls.functionObject); - addr = (jl_callptr_t)Addrs[nextaddr++]; 
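
Editor's note: complete_emit above is the notification half of the incompletemodules bookkeeping: every stalled caller counts its unresolved callees, and emitting a callee decrements the counters of each caller waiting on it, finalizing any that reach zero. A toy model of that reverse-edge accounting, with hypothetical names.

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    static std::map<std::string, std::vector<std::string>> reverse_edges; // callee -> waiting callers
    static std::map<std::string, int> unresolved;                         // caller -> edges still missing

    // returns the callers that became complete (ready for finish_params)
    static std::vector<std::string> mark_emitted(const std::string &callee)
    {
        std::vector<std::string> now_complete;
        for (auto &caller : reverse_edges[callee])
            if (--unresolved[caller] == 0)
                now_complete.push_back(caller);
        reverse_edges.erase(callee);
        return now_complete;
    }

    int main()
    {
        unresolved["f"] = 2; // f referenced g and h before they were emitted
        reverse_edges["g"] = {"f"};
        reverse_edges["h"] = {"f"};
        assert(mark_emitted("g").empty());                          // one edge still pending
        assert(mark_emitted("h") == std::vector<std::string>{"f"}); // now complete
        return 0;
    }
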
- assert(addr); - isspecsig = true; + break; } - if (!decls.specFunctionObject.empty()) { - void *prev_specptr = NULL; - assert(NewDefs[nextaddr] == decls.specFunctionObject); - void *spec = (void*)Addrs[nextaddr++]; - assert(spec); - if (jl_atomic_cmpswap_acqrel(&this_code->specptr.fptr, &prev_specptr, spec)) { - // only set specsig and invoke if we were the first to set specptr - jl_atomic_store_relaxed(&this_code->specsigflags, (uint8_t) isspecsig); - // we might overwrite invokeptr here; that's ok, anybody who relied on the identity of invokeptr - // either assumes that specptr was null, doesn't care about specptr, - // or will wait until specsigflags has 0b10 set before reloading invoke - jl_atomic_store_release(&this_code->invoke, addr); - jl_atomic_store_release(&this_code->specsigflags, (uint8_t) (0b10 | isspecsig)); - } else { - //someone else beat us, don't commit any results - while (!(jl_atomic_load_acquire(&this_code->specsigflags) & 0b10)) { - jl_cpu_pause(); + } + codeinst = nullptr; + // barrier until all threads have finished calling addModule + if (--threads_in_compiler_phase == 0) { + // the last thread out will finish linking everything + // then release all of the other threads + // move the function pointers out from invokenames to the codeinst + + // batch compile job for all new functions + SmallVector NewDefs; + for (auto &this_code : linkready) { + auto it = invokenames.find(this_code); + assert(it != invokenames.end()); + jl_llvm_functions_t &decls = it->second; + assert(!decls.functionObject.empty()); + if (decls.functionObject != "jl_fptr_args" && + decls.functionObject != "jl_fptr_sparam" && + decls.functionObject != "jl_f_opaque_closure_call") + NewDefs.push_back(decls.functionObject); + if (!decls.specFunctionObject.empty()) + NewDefs.push_back(decls.specFunctionObject); + } + auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); + + size_t nextaddr = 0; + for (auto &this_code : linkready) { + auto it = invokenames.find(this_code); + assert(it != invokenames.end()); + jl_llvm_functions_t &decls = it->second; + jl_callptr_t addr; + bool isspecsig = false; + if (decls.functionObject == "jl_fptr_args") { + addr = jl_fptr_args_addr; + } + else if (decls.functionObject == "jl_fptr_sparam") { + addr = jl_fptr_sparam_addr; + } + else if (decls.functionObject == "jl_f_opaque_closure_call") { + addr = jl_f_opaque_closure_call_addr; + } + else { + assert(NewDefs[nextaddr] == decls.functionObject); + addr = (jl_callptr_t)Addrs[nextaddr++]; + assert(addr); + isspecsig = true; + } + if (!decls.specFunctionObject.empty()) { + void *prev_specptr = nullptr; + assert(NewDefs[nextaddr] == decls.specFunctionObject); + void *spec = (void*)Addrs[nextaddr++]; + assert(spec); + if (jl_atomic_cmpswap_acqrel(&this_code->specptr.fptr, &prev_specptr, spec)) { + // only set specsig and invoke if we were the first to set specptr + jl_atomic_store_relaxed(&this_code->specsigflags, (uint8_t) isspecsig); + // we might overwrite invokeptr here; that's ok, anybody who relied on the identity of invokeptr + // either assumes that specptr was null, doesn't care about specptr, + // or will wait until specsigflags has 0b10 set before reloading invoke + jl_atomic_store_release(&this_code->invoke, addr); + jl_atomic_store_release(&this_code->specsigflags, (uint8_t) (0b10 | isspecsig)); + } + else { + //someone else beat us, don't commit any results + while (!(jl_atomic_load_acquire(&this_code->specsigflags) & 0b10)) { + jl_cpu_pause(); + } + addr = jl_atomic_load_relaxed(&this_code->invoke); } - 
addr = jl_atomic_load_relaxed(&this_code->invoke); } - } else { - jl_callptr_t prev_invoke = NULL; - // Allow replacing addr if it is either NULL or our special waiting placeholder. - if (!jl_atomic_cmpswap_acqrel(&this_code->invoke, &prev_invoke, addr)) { - if (prev_invoke == jl_fptr_wait_for_compiled_addr && !jl_atomic_cmpswap_acqrel(&this_code->invoke, &prev_invoke, addr)) { - addr = prev_invoke; - //TODO do we want to potentially promote invoke anyways? (e.g. invoke is jl_interpret_call or some other - //known lesser function) + else { + jl_callptr_t prev_invoke = nullptr; + // Allow replacing addr if it is either nullptr or our special waiting placeholder. + if (!jl_atomic_cmpswap_acqrel(&this_code->invoke, &prev_invoke, addr)) { + if (prev_invoke == jl_fptr_wait_for_compiled_addr && !jl_atomic_cmpswap_acqrel(&this_code->invoke, &prev_invoke, addr)) { + addr = prev_invoke; + //TODO do we want to potentially promote invoke anyways? (e.g. invoke is jl_interpret_call or some other + //known lesser function) + } } } + invokenames.erase(it); + complete_graph.erase(this_code); } - if (this_code == codeinst) - fptr = addr; - i++; + linkready.clear(); + engine_wait.notify_all(); + } + else while (threads_in_compiler_phase) { + lock.wait(engine_wait); } - if (i > jl_timing_print_limit) - jl_timing_printf(JL_TIMING_DEFAULT_BLOCK, "... <%d methods truncated>", i - 10); +} - uint64_t end_time = 0; - if (timed) - end_time = jl_hrtime(); - - // If logging of the compilation stream is enabled, - // then dump the method-instance specialization type to the stream - jl_method_instance_t *mi = codeinst->def; - if (jl_is_method(mi->def.method)) { - auto stream = *jl_ExecutionEngine->get_dump_compiles_stream(); - if (stream) { - ios_printf(stream, "%" PRIu64 "\t\"", end_time - start_time); - jl_static_show((JL_STREAM*)stream, mi->specTypes); - ios_printf(stream, "\"\n"); +static void jl_emit_codeinst_to_jit( + jl_code_instance_t *codeinst, + jl_code_info_t *src) +{ + { // lock scope + jl_unique_gcsafe_lock lock(engine_lock); + if (invokenames.count(codeinst) || jl_is_compiled_codeinst(codeinst)) + return; + } + JL_TIMING(CODEINST_COMPILE, CODEINST_COMPILE); + // emit the code in LLVM IR form to the new context + jl_codegen_params_t params(std::make_unique(), jl_ExecutionEngine->getDataLayout(), jl_ExecutionEngine->getTargetTriple()); // Locks the context + params.cache = true; + params.imaging_mode = imaging_default(); + params.debug_level = jl_options.debug_level; + orc::ThreadSafeModule result_m = + jl_create_ts_module(name_from_method_instance(codeinst->def), params.tsctx, params.DL, params.TargetTriple); + jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, src, params); // contains safepoints + if (!result_m) + return; + { // drop lock before acquiring engine_lock + auto release = std::move(params.tsctx_lock); + } + jl_unique_gcsafe_lock lock(engine_lock); + if (invokenames.count(codeinst) || jl_is_compiled_codeinst(codeinst)) + return; // destroy everything + invokenames[codeinst] = std::move(decls); + complete_emit(codeinst); + params.tsctx_lock = params.tsctx.getLock(); // re-acquire lock + int waiting = jl_analyze_workqueue(codeinst, params); + if (waiting) { + auto release = std::move(params.tsctx_lock); // unlock again before moving from it + incompletemodules.insert(std::pair(codeinst, std::tuple(std::move(params), waiting))); + } + else { + finish_params(result_m.getModuleUnlocked(), params); + } + emittedmodules[codeinst] = std::move(result_m); +} + +static void 
recursive_compile_graph( + jl_code_instance_t *codeinst, + jl_code_info_t *src) +{ + jl_emit_codeinst_to_jit(codeinst, src); + DenseSet Seen; + SmallVector workqueue; + workqueue.push_back(codeinst); + // if any edges were incomplete, try to complete them now + while (!workqueue.empty()) { + auto this_code = workqueue.pop_back_val(); + if (Seen.insert(this_code).second) { + if (this_code != codeinst) + jl_emit_codeinst_to_jit(this_code, nullptr); // contains safepoints + jl_unique_gcsafe_lock lock(engine_lock); + auto edges = complete_graph.find(this_code); + if (edges != complete_graph.end()) { + workqueue.append(edges->second); + } } } - return fptr; } +// this generates llvm code for the lambda info +// and adds the result to the jitlayers +// (and the shadow module), +// and generates code for it +static jl_callptr_t _jl_compile_codeinst( + jl_code_instance_t *codeinst, + jl_code_info_t *src) +{ + recursive_compile_graph(codeinst, src); + jl_compile_codeinst_now(codeinst); + return jl_atomic_load_acquire(&codeinst->invoke); +} + + const char *jl_generate_ccallable(LLVMOrcThreadSafeModuleRef llvmmod, void *sysimg_handle, jl_value_t *declrt, jl_value_t *sigt, jl_codegen_params_t ¶ms); // compile a C-callable alias @@ -415,42 +727,40 @@ int jl_compile_extern_c_impl(LLVMOrcThreadSafeModuleRef llvmmod, void *p, void * orc::ThreadSafeModule backing; if (into == NULL) { if (!pparams) { - ctx = jl_ExecutionEngine->acquireContext(); + ctx = jl_ExecutionEngine->makeContext(); } backing = jl_create_ts_module("cextern", pparams ? pparams->tsctx : ctx, pparams ? pparams->DL : jl_ExecutionEngine->getDataLayout(), pparams ? pparams->TargetTriple : jl_ExecutionEngine->getTargetTriple()); into = &backing; } - auto target_info = into->withModuleDo([&](Module &M) { - return std::make_pair(M.getDataLayout(), Triple(M.getTargetTriple())); - }); - jl_codegen_params_t params(into->getContext(), std::move(target_info.first), std::move(target_info.second)); - params.imaging_mode = imaging_default(); - params.debug_level = jl_options.debug_level; - if (pparams == NULL) - pparams = ¶ms; - assert(pparams->tsctx.getContext() == into->getContext().getContext()); - const char *name = jl_generate_ccallable(wrap(into), sysimg, declrt, sigt, *pparams); bool success = true; - if (!sysimg) { - JL_LOCK(&jl_ExecutionEngine->jitlock); - if (jl_ExecutionEngine->getGlobalValueAddress(name)) { - success = false; - } - if (success && p == NULL) { - jl_jit_globals(params.global_targets); - assert(params.workqueue.empty()); - if (params._shared_module) { - jl_ExecutionEngine->optimizeDLSyms(*params._shared_module); - jl_ExecutionEngine->addModule(orc::ThreadSafeModule(std::move(params._shared_module), params.tsctx)); + { + auto Lock = into->getContext().getLock(); + Module *M = into->getModuleUnlocked(); + jl_codegen_params_t params(into->getContext(), M->getDataLayout(), Triple(M->getTargetTriple())); + params.imaging_mode = imaging_default(); + params.debug_level = jl_options.debug_level; + if (pparams == NULL) + pparams = ¶ms; + assert(pparams->tsctx.getContext() == into->getContext().getContext()); + const char *name = jl_generate_ccallable(wrap(into), sysimg, declrt, sigt, *pparams); + if (!sysimg) { + jl_unique_gcsafe_lock lock(extern_c_lock); + if (jl_ExecutionEngine->getGlobalValueAddress(name)) { + success = false; + } + if (success && p == NULL) { + jl_jit_globals(params.global_targets); + assert(params.workqueue.empty()); + if (params._shared_module) { + jl_ExecutionEngine->optimizeDLSyms(*params._shared_module); // 
safepoint + jl_ExecutionEngine->addModule(orc::ThreadSafeModule(std::move(params._shared_module), params.tsctx)); + } + } + if (success && llvmmod == NULL) { + jl_ExecutionEngine->optimizeDLSyms(*M); // safepoint + jl_ExecutionEngine->addModule(std::move(*into)); } } - if (success && llvmmod == NULL) { - into->withModuleDo([&](Module &M) { - jl_ExecutionEngine->optimizeDLSyms(M); - }); - jl_ExecutionEngine->addModule(std::move(*into)); - } - JL_UNLOCK(&jl_ExecutionEngine->jitlock); // Might GC } if (timed) { if (measure_compile_time_enabled) { @@ -459,9 +769,6 @@ int jl_compile_extern_c_impl(LLVMOrcThreadSafeModuleRef llvmmod, void *p, void * } ct->reentrant_timing &= ~1ull; } - if (ctx.getContext()) { - jl_ExecutionEngine->releaseContext(std::move(ctx)); - } return success; } @@ -512,18 +819,13 @@ extern "C" JL_DLLEXPORT_CODEGEN int jl_compile_codeinst_impl(jl_code_instance_t *ci) { int newly_compiled = 0; - if (jl_atomic_load_relaxed(&ci->invoke) != NULL) { - return newly_compiled; - } - JL_LOCK(&jl_ExecutionEngine->jitlock); if (jl_atomic_load_relaxed(&ci->invoke) == NULL) { ++SpecFPtrCount; uint64_t start = jl_typeinf_timing_begin(); - _jl_compile_codeinst(ci, NULL, *jl_ExecutionEngine->getContext()); + _jl_compile_codeinst(ci, NULL); jl_typeinf_timing_end(start, 0); newly_compiled = 1; } - JL_UNLOCK(&jl_ExecutionEngine->jitlock); // Might GC return newly_compiled; } @@ -541,38 +843,39 @@ void jl_generate_fptr_for_unspecialized_impl(jl_code_instance_t *unspec) uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); if (measure_compile_time_enabled) compiler_start_time = jl_hrtime(); - JL_LOCK(&jl_ExecutionEngine->jitlock); - if (jl_atomic_load_relaxed(&unspec->invoke) == NULL) { - jl_code_info_t *src = NULL; - JL_GC_PUSH1(&src); - jl_method_t *def = unspec->def->def.method; - if (jl_is_method(def)) { - src = (jl_code_info_t*)def->source; - if (src && (jl_value_t*)src != jl_nothing) - src = jl_uncompress_ir(def, NULL, (jl_value_t*)src); - } - else { - jl_method_instance_t *mi = unspec->def; - jl_code_instance_t *uninferred = jl_cached_uninferred( - jl_atomic_load_relaxed(&mi->cache), 1); - assert(uninferred); - src = (jl_code_info_t*)jl_atomic_load_relaxed(&uninferred->inferred); - assert(src); - } - if (src) { + jl_code_info_t *src = NULL; + JL_GC_PUSH1(&src); + jl_method_t *def = unspec->def->def.method; + if (jl_is_method(def)) { + src = (jl_code_info_t*)def->source; + if (src && (jl_value_t*)src != jl_nothing) + src = jl_uncompress_ir(def, NULL, (jl_value_t*)src); + } + else { + jl_method_instance_t *mi = unspec->def; + jl_code_instance_t *uninferred = jl_cached_uninferred( + jl_atomic_load_relaxed(&mi->cache), 1); + assert(uninferred); + src = (jl_code_info_t*)jl_atomic_load_relaxed(&uninferred->inferred); + assert(src); + } + if (src) { + // TODO: first prepare recursive_compile_graph(unspec, src) before taking this lock to avoid recursion? + JL_LOCK(&jitlock); // TODO: use a better lock + if (jl_atomic_load_relaxed(&unspec->invoke) == NULL) { assert(jl_is_code_info(src)); ++UnspecFPtrCount; jl_debuginfo_t *debuginfo = src->debuginfo; jl_atomic_store_release(&unspec->debuginfo, debuginfo); // n.b. 
this assumes the field was previously NULL, which is not entirely true jl_gc_wb(unspec, debuginfo); - _jl_compile_codeinst(unspec, src, *jl_ExecutionEngine->getContext()); + _jl_compile_codeinst(unspec, src); } - jl_callptr_t null = nullptr; - // if we hit a codegen bug (or ran into a broken generated function or llvmcall), fall back to the interpreter as a last resort - jl_atomic_cmpswap(&unspec->invoke, &null, jl_fptr_interpret_call_addr); - JL_GC_POP(); + JL_UNLOCK(&jitlock); // Might GC } - JL_UNLOCK(&jl_ExecutionEngine->jitlock); // Might GC + JL_GC_POP(); + jl_callptr_t null = nullptr; + // if we hit a codegen bug (or ran into a broken generated function or llvmcall), fall back to the interpreter as a last resort + jl_atomic_cmpswap(&unspec->invoke, &null, jl_fptr_interpret_call_addr); if (timed) { if (measure_compile_time_enabled) { auto end = jl_hrtime(); @@ -634,8 +937,8 @@ static auto countBasicBlocks(const Function &F) JL_NOTSAFEPOINT static constexpr size_t N_optlevels = 4; -static Expected selectOptLevel(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) { - TSM.withModuleDo([](Module &M) { +static orc::ThreadSafeModule selectOptLevel(orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { + TSM.withModuleDo([](Module &M) JL_NOTSAFEPOINT { size_t opt_level = std::max(static_cast(jl_options.opt_level), 0); do { if (jl_generating_output()) { @@ -661,7 +964,10 @@ static Expected selectOptLevel(orc::ThreadSafeModule TSM, opt_level = std::min(opt_level, N_optlevels - 1); M.addModuleFlag(Module::Warning, "julia.optlevel", opt_level); }); - return std::move(TSM); + return TSM; +} +static orc::ThreadSafeModule selectOptLevel(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { + return selectOptLevel(std::move(TSM)); } void jl_register_jit_object(const object::ObjectFile &debugObj, @@ -699,8 +1005,8 @@ class JLDebuginfoPlugin : public ObjectLinkingLayer::Plugin { { std::lock_guard lock(PluginMutex); assert(PendingObjs.count(&MR) == 0); - PendingObjs[&MR] = std::unique_ptr( - new JITObjectInfo{std::move(NewBuffer), std::move(NewObj), {}}); + PendingObjs[&MR] = std::unique_ptr(new JITObjectInfo{ + std::move(NewBuffer), std::move(NewObj), {}}); } } @@ -870,7 +1176,7 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin { // TODO: Port our memory management optimisations to JITLink instead of using the // default InProcessMemoryManager. 
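
Editor's note: the jl_atomic_cmpswap calls above publish unspec->invoke so that only the first writer wins, and the interpreter fallback is installed only if nothing was published first. A minimal sketch of that pattern, with std::atomic standing in for the jl_atomic_* wrappers.

    #include <atomic>
    #include <cassert>

    typedef void (*callptr_t)();
    static void compiled_entry() {}
    static void interpreter_entry() {}

    static std::atomic<callptr_t> invoke{nullptr};

    static void publish(callptr_t addr)
    {
        callptr_t expected = nullptr;
        // replace only if still unset; a losing writer keeps the winner's value
        invoke.compare_exchange_strong(expected, addr,
                                       std::memory_order_acq_rel,
                                       std::memory_order_acquire);
    }

    int main()
    {
        publish(compiled_entry);
        publish(interpreter_entry); // too late: compiled_entry already won
        assert(invoke.load(std::memory_order_acquire) == compiled_entry);
        return 0;
    }
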
-std::unique_ptr createJITLinkMemoryManager() { +std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT { #if JL_LLVM_VERSION < 160000 return cantFail(orc::MapperJITLinkMemoryManager::CreateWithMapper()); #else @@ -900,7 +1206,7 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar { } }; -RTDyldMemoryManager* createRTDyldMemoryManager(void); +RTDyldMemoryManager *createRTDyldMemoryManager(void) JL_NOTSAFEPOINT; // A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { @@ -909,7 +1215,10 @@ class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { public: ForwardingMemoryManager(std::shared_ptr MemMgr) : MemMgr(MemMgr) {} - virtual ~ForwardingMemoryManager() = default; + ForwardingMemoryManager(ForwardingMemoryManager &) = delete; + virtual ~ForwardingMemoryManager() { + assert(!MemMgr); + } virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName) override { @@ -947,7 +1256,11 @@ class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { return MemMgr->deregisterEHFrames(); } virtual bool finalizeMemory(std::string *ErrMsg = nullptr) override { - return MemMgr->finalizeMemory(ErrMsg); + bool b = false; + if (MemMgr.use_count() == 2) + b = MemMgr->finalizeMemory(ErrMsg); + MemMgr.reset(); + return b; } virtual void notifyObjectLoaded(RuntimeDyld &RTDyld, const object::ObjectFile &Obj) override { @@ -955,10 +1268,10 @@ class ForwardingMemoryManager : public RuntimeDyld::MemoryManager { } }; - -void registerRTDyldJITObject(const object::ObjectFile &Object, - const RuntimeDyld::LoadedObjectInfo &L, - const std::shared_ptr &MemMgr) +#ifndef JL_USE_JITLINK +static void registerRTDyldJITObject(orc::MaterializationResponsibility &MR, + const object::ObjectFile &Object, + const RuntimeDyld::LoadedObjectInfo &L) { StringMap loadedSections; for (const object::SectionRef &lSection : Object.sections()) { @@ -980,6 +1293,8 @@ void registerRTDyldJITObject(const object::ObjectFile &Object, auto DebugObject = L.getObjectForDebug(Object); // ELF requires us to make a copy to mutate the header with the section load addresses. On other platforms this is a no-op. jl_register_jit_object(DebugObject.getBinary() ? 
*DebugObject.getBinary() : Object, getLoadAddress); } +#endif + namespace { static std::unique_ptr createTargetMachine() JL_NOTSAFEPOINT { TargetOptions options = TargetOptions(); @@ -1078,9 +1393,6 @@ namespace { fixupTM(*TM); return std::unique_ptr(TM); } -} // namespace - -namespace { typedef NewPM PassManager; @@ -1131,14 +1443,14 @@ namespace { }; template - struct OptimizerT { - OptimizerT(TargetMachine &TM, SmallVector, 0> &printers, std::mutex &llvm_printing_mutex) JL_NOTSAFEPOINT { + struct sizedOptimizerT { + sizedOptimizerT(TargetMachine &TM, SmallVector, 0> &printers, std::mutex &llvm_printing_mutex) JL_NOTSAFEPOINT { for (size_t i = 0; i < N; i++) { PMs[i] = std::make_unique>>(PMCreator(TM, i, printers, llvm_printing_mutex)); } } - OptimizerResultT operator()(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { + orc::ThreadSafeModule operator()(orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { auto PoolIdx = cast(cast(M.getModuleFlag("julia.optlevel"))->getValue())->getZExtValue(); assert(PoolIdx < N && "Invalid optimization pool index"); @@ -1243,12 +1555,23 @@ namespace { llvm_unreachable("optlevel is between 0 and 3!"); } }); - return Expected{std::move(TSM)}; + return TSM; } private: std::array>>, N> PMs; }; + // shim for converting a unique_ptr to a TransformFunction to a TransformFunction + template + struct IRTransformRef { + IRTransformRef(T &transform) : transform(transform) {} + OptimizerResultT operator()(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { + return transform(std::move(TSM), R); + } + private: + T &transform; + }; + template struct CompilerT : orc::IRCompileLayer::IRCompiler { @@ -1264,7 +1587,8 @@ namespace { size_t PoolIdx; if (auto opt_level = M.getModuleFlag("julia.optlevel")) { PoolIdx = cast(cast(opt_level)->getValue())->getZExtValue(); - } else { + } + else { PoolIdx = jl_options.opt_level; } assert(PoolIdx < N && "Invalid optimization level for compiler!"); @@ -1273,74 +1597,89 @@ namespace { std::array>>, N> TMs; }; +} - struct JITPointersT { - - JITPointersT(SharedBytesT &SharedBytes, std::mutex &Lock) JL_NOTSAFEPOINT - : SharedBytes(SharedBytes), Lock(Lock) {} +struct JuliaOJIT::OptimizerT { + OptimizerT(TargetMachine &TM, SmallVector, 0> &printers, std::mutex &llvm_printing_mutex) + : opt(TM, printers, llvm_printing_mutex) {} + orc::ThreadSafeModule operator()(orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { + return opt(std::move(TSM)); + } + OptimizerResultT operator()(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { + return opt(std::move(TSM)); + } +private: + struct sizedOptimizerT opt; +}; - Expected operator()(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { - TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { - std::lock_guard locked(Lock); - for (auto &GV : make_early_inc_range(M.globals())) { - if (auto *Shared = getSharedBytes(GV)) { - ++InternedGlobals; - GV.replaceAllUsesWith(Shared); - GV.eraseFromParent(); - } +struct JuliaOJIT::JITPointersT { + JITPointersT(SharedBytesT &SharedBytes, std::mutex &Lock) JL_NOTSAFEPOINT + : SharedBytes(SharedBytes), Lock(Lock) {} + + orc::ThreadSafeModule operator()(orc::ThreadSafeModule TSM) JL_NOTSAFEPOINT { + TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { + std::lock_guard locked(Lock); + for (auto &GV : make_early_inc_range(M.globals())) { + if (auto *Shared = getSharedBytes(GV)) { + ++InternedGlobals; + 
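selectOptLevel records its decision as the `julia.optlevel` module flag, and sizedOptimizerT/CompilerT above read it back to pick a pass-manager pool. A self-contained sketch of that round trip against the LLVM Module API:

#include <llvm/IR/Constants.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Metadata.h>
#include <llvm/IR/Module.h>
using namespace llvm;

int main() {
    LLVMContext Ctx;
    Module M("demo", Ctx);
    // Record the chosen level the way selectOptLevel does...
    M.addModuleFlag(Module::Warning, "julia.optlevel", 2);
    // ...and recover it the way the optimizer/compiler layers do,
    // falling back to a default when the flag is absent.
    size_t PoolIdx = 0;
    if (auto *flag = M.getModuleFlag("julia.optlevel"))
        PoolIdx = cast<ConstantInt>(cast<ConstantAsMetadata>(flag)->getValue())->getZExtValue();
    return (int)PoolIdx; // 2
}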
GV.replaceAllUsesWith(Shared); + GV.eraseFromParent(); } + } - // Windows needs some inline asm to help - // build unwind tables, if they have any functions to decorate - if (!M.functions().empty()) - jl_decorate_module(M); - }); - return std::move(TSM); - } + // Windows needs some inline asm to help + // build unwind tables, if they have any functions to decorate + if (!M.functions().empty()) + jl_decorate_module(M); + }); + return TSM; + } + Expected operator()(orc::ThreadSafeModule TSM, orc::MaterializationResponsibility &R) JL_NOTSAFEPOINT { + return operator()(std::move(TSM)); + } - private: - // optimize memory by turning long strings into memoized copies, instead of - // making a copy per object file of output. - // we memoize them using a StringSet with a custom-alignment allocator - // to ensure they are properly aligned - Constant *getSharedBytes(GlobalVariable &GV) JL_NOTSAFEPOINT { - // We could probably technically get away with - // interning even external linkage globals, - // as long as they have global unnamedaddr, - // but currently we shouldn't be emitting those - // except in imaging mode, and we don't want to - // do this optimization there. - if (GV.hasExternalLinkage() || !GV.hasGlobalUnnamedAddr()) { - return nullptr; - } - if (!GV.hasInitializer()) { - return nullptr; - } - if (!GV.isConstant()) { - return nullptr; - } - auto CDS = dyn_cast(GV.getInitializer()); - if (!CDS) { - return nullptr; - } - StringRef Data = CDS->getRawDataValues(); - if (Data.size() < 16) { - // Cutoff, since we don't want to intern small strings - return nullptr; - } - Align Required = GV.getAlign().valueOrOne(); - Align Preferred = MaxAlignedAlloc::alignment(Data.size()); - if (Required > Preferred) - return nullptr; - StringRef Interned = SharedBytes.insert(Data).first->getKey(); - assert(llvm::isAddrAligned(Preferred, Interned.data())); - return literal_static_pointer_val(Interned.data(), GV.getType()); +private: + // optimize memory by turning long strings into memoized copies, instead of + // making a copy per object file of output. + // we memoize them using a StringSet with a custom-alignment allocator + // to ensure they are properly aligned + Constant *getSharedBytes(GlobalVariable &GV) JL_NOTSAFEPOINT { + // We could probably technically get away with + // interning even external linkage globals, + // as long as they have global unnamedaddr, + // but currently we shouldn't be emitting those + // except in imaging mode, and we don't want to + // do this optimization there. 
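getSharedBytes (whose body continues below) interns constant byte data in a StringSet so each distinct string is stored once and every object file points at the shared copy. A sketch of the deduplication property it relies on; the real code additionally uses a custom-alignment allocator (MaxAlignedAlloc), omitted here:

#include <llvm/ADT/StringSet.h>
#include <cassert>
using namespace llvm;

int main() {
    StringSet<> SharedBytes;
    const char *data = "a long constant that would otherwise be copied per object file";
    // insert() deduplicates: the second call finds the existing entry,
    // and getKey() returns storage owned by the set, so the address is
    // stable for the set's lifetime.
    StringRef a = SharedBytes.insert(data).first->getKey();
    StringRef b = SharedBytes.insert(data).first->getKey();
    assert(a.data() == b.data());
    return 0;
}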
+ if (GV.hasExternalLinkage() || !GV.hasGlobalUnnamedAddr()) { + return nullptr; } + if (!GV.hasInitializer()) { + return nullptr; + } + if (!GV.isConstant()) { + return nullptr; + } + auto CDS = dyn_cast(GV.getInitializer()); + if (!CDS) { + return nullptr; + } + StringRef Data = CDS->getRawDataValues(); + if (Data.size() < 16) { + // Cutoff, since we don't want to intern small strings + return nullptr; + } + Align Required = GV.getAlign().valueOrOne(); + Align Preferred = MaxAlignedAlloc::alignment(Data.size()); + if (Required > Preferred) + return nullptr; + StringRef Interned = SharedBytes.insert(Data).first->getKey(); + assert(llvm::isAddrAligned(Preferred, Interned.data())); + return literal_static_pointer_val(Interned.data(), GV.getType()); + } - SharedBytesT &SharedBytes; - std::mutex &Lock; - }; -} + SharedBytesT &SharedBytes; + std::mutex &Lock; +}; struct JuliaOJIT::DLSymOptimizer { @@ -1362,20 +1701,24 @@ struct JuliaOJIT::DLSymOptimizer { #undef INIT_RUNTIME_LIBRARY } + ~DLSymOptimizer() JL_NOTSAFEPOINT = default; - void *lookup_symbol(void *libhandle, const char *fname) { + void *lookup_symbol(void *libhandle, const char *fname) JL_NOTSAFEPOINT { void *addr; jl_dlsym(libhandle, fname, &addr, 0); return addr; } - void *lookup(const char *libname, const char *fname) { + void *lookup(const char *libname, const char *fname) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER { StringRef lib(libname); StringRef f(fname); std::lock_guard lock(symbols_mutex); auto uit = user_symbols.find(lib); if (uit == user_symbols.end()) { + jl_task_t *ct = jl_current_task; + int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); void *handle = jl_get_library_(libname, 0); + jl_gc_unsafe_leave(ct->ptls, gc_state); if (!handle) return nullptr; uit = user_symbols.insert(std::make_pair(lib, std::make_pair(handle, StringMap()))).first; @@ -1390,7 +1733,7 @@ struct JuliaOJIT::DLSymOptimizer { return handle; } - void *lookup(uintptr_t libidx, const char *fname) { + void *lookup(uintptr_t libidx, const char *fname) JL_NOTSAFEPOINT { std::lock_guard lock(symbols_mutex); runtime_symbols.resize(std::max(runtime_symbols.size(), libidx + 1)); auto it = runtime_symbols[libidx].second.find(fname); @@ -1402,7 +1745,7 @@ struct JuliaOJIT::DLSymOptimizer { return handle; } - void operator()(Module &M) { + void operator()(Module &M) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER { for (auto &GV : M.globals()) { auto Name = GV.getName(); if (Name.starts_with("jlplt") && Name.ends_with("got")) { @@ -1518,7 +1861,7 @@ struct JuliaOJIT::DLSymOptimizer { bool named; }; -void optimizeDLSyms(Module &M) { +void optimizeDLSyms(Module &M) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER { JuliaOJIT::DLSymOptimizer(true)(M); } @@ -1552,10 +1895,6 @@ llvm::DataLayout jl_create_datalayout(TargetMachine &TM) { return jl_data_layout; } -#ifdef _COMPILER_ASAN_ENABLED_ -int64_t ___asan_globals_registered; -#endif - JuliaOJIT::JuliaOJIT() : TM(createTargetMachine()), DL(jl_create_datalayout(*TM)), @@ -1564,34 +1903,27 @@ JuliaOJIT::JuliaOJIT() JD(ES.createBareJITDylib("JuliaOJIT")), ExternalJD(ES.createBareJITDylib("JuliaExternal")), DLSymOpt(std::make_unique(false)), - ContextPool([](){ - auto ctx = std::make_unique(); - #if JL_LLVM_VERSION < 170000 - SetOpaquePointer(*ctx); - #endif - return orc::ThreadSafeContext(std::move(ctx)); - }), #ifdef JL_USE_JITLINK MemMgr(createJITLinkMemoryManager()), ObjectLayer(ES, *MemMgr), - CompileLayer(ES, ObjectLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), #else 
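The lookup() hunk above brackets jl_get_library_ with jl_gc_unsafe_enter/jl_gc_unsafe_leave by hand. A sketch of wrapping such a paired state transition in RAII so an early return cannot skip the leave; the enter/leave functions here are illustrative stand-ins, not Julia's:

#include <cstdio>

static int8_t gc_unsafe_enter(void) { std::puts("-> gc unsafe"); return 0; }
static void gc_unsafe_leave(int8_t state) { std::printf("<- restore %d\n", (int)state); }

// The destructor restores the saved state on every exit path.
struct gc_unsafe_region {
    int8_t state;
    gc_unsafe_region() : state(gc_unsafe_enter()) {}
    ~gc_unsafe_region() { gc_unsafe_leave(state); }
};

static void *get_library(bool exists) {
    gc_unsafe_region region;
    if (!exists)
        return nullptr;  // leave still runs here
    return (void *)1;    // dummy non-null handle
}

int main() {
    return get_library(false) == nullptr ? 0 : 1;
}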
MemMgr(createRTDyldMemoryManager()), - ObjectLayer( + UnlockedObjectLayer( ES, [this]() { std::unique_ptr result(new ForwardingMemoryManager(MemMgr)); return result; } ), - LockLayer(ObjectLayer), - CompileLayer(ES, LockLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), + ObjectLayer(UnlockedObjectLayer), #endif - JITPointersLayer(ES, CompileLayer, orc::IRTransformLayer::TransformFunction(JITPointersT(SharedBytes, RLST_mutex))), - OptimizeLayer(ES, JITPointersLayer, orc::IRTransformLayer::TransformFunction(OptimizerT(*TM, PrintLLVMTimers, llvm_printing_mutex))), - OptSelLayer(ES, OptimizeLayer, orc::IRTransformLayer::TransformFunction(selectOptLevel)) + CompileLayer(ES, ObjectLayer, std::make_unique>(orc::irManglingOptionsFromTargetOptions(TM->Options), *TM)), + JITPointers(std::make_unique(SharedBytes, RLST_mutex)), + JITPointersLayer(ES, CompileLayer, IRTransformRef(*JITPointers)), + Optimizers(std::make_unique(*TM, PrintLLVMTimers, llvm_printing_mutex)), + OptimizeLayer(ES, JITPointersLayer, IRTransformRef(*Optimizers)), + OptSelLayer(ES, OptimizeLayer, static_cast(selectOptLevel)) { - JL_MUTEX_INIT(&this->jitlock, "JuliaOJIT"); #ifdef JL_USE_JITLINK # if defined(LLVM_SHLIB) // When dynamically linking against LLVM, use our custom EH frame registration code @@ -1606,12 +1938,7 @@ JuliaOJIT::JuliaOJIT() ObjectLayer.addPlugin(std::make_unique()); ObjectLayer.addPlugin(std::make_unique(jit_bytes_size)); #else - ObjectLayer.setNotifyLoaded( - [this](orc::MaterializationResponsibility &MR, - const object::ObjectFile &Object, - const RuntimeDyld::LoadedObjectInfo &LO) { - registerRTDyldJITObject(Object, LO, MemMgr); - }); + UnlockedObjectLayer.setNotifyLoaded(registerRTDyldJITObject); #endif std::string ErrorStr; @@ -1741,19 +2068,34 @@ JuliaOJIT::JuliaOJIT() #endif cantFail(GlobalJD.define(orc::absoluteSymbols(msan_crt))); #endif +#if JL_LLVM_VERSION < 190000 #ifdef _COMPILER_ASAN_ENABLED_ + // this is a hack to work around a bad assertion: + // /workspace/srcdir/llvm-project/llvm/lib/ExecutionEngine/Orc/Core.cpp:3028: llvm::Error llvm::orc::ExecutionSession::OL_notifyResolved(llvm::orc::MaterializationResponsibility&, const SymbolMap&): Assertion `(KV.second.getFlags() & ~JITSymbolFlags::Common) == (I->second & ~JITSymbolFlags::Common) && "Resolving symbol with incorrect flags"' failed. 
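The constructor above passes IRTransformRef(*JITPointers) and IRTransformRef(*Optimizers) so the layers borrow the transforms while the JIT keeps ownership and addModule can also call them directly, outside the layer stack. A sketch of that reference-shim shape with a toy transform:

#include <cstdio>
#include <utility>

// The layer wants an owned, copyable callable; the shim is both, yet
// all copies share the one underlying transform.
template <typename T>
struct TransformRef {
    T &transform;
    template <typename Arg>
    auto operator()(Arg &&arg) { return transform(std::forward<Arg>(arg)); }
};

struct Optimizer {
    int runs = 0;
    int operator()(int x) { ++runs; return 2 * x; }
};

int main() {
    Optimizer opt;
    TransformRef<Optimizer> ref{opt};
    int result = ref(21);
    std::printf("%d after %d run(s)\n", result, opt.runs); // 42 after 1 run(s)
    return 0;
}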
+ // hopefully fixed upstream by e7698a13e319a9919af04d3d693a6f6ea7168a44 + static int64_t jl___asan_globals_registered; orc::SymbolMap asan_crt; #if JL_LLVM_VERSION >= 170000 - asan_crt[mangle("___asan_globals_registered")] = {ExecutorAddr::fromPtr(&___asan_globals_registered), JITSymbolFlags::Exported}; + asan_crt[mangle("___asan_globals_registered")] = {ExecutorAddr::fromPtr(&jl___asan_globals_registered), JITSymbolFlags::Common | JITSymbolFlags::Exported}; #else - asan_crt[mangle("___asan_globals_registered")] = JITEvaluatedSymbol::fromPointer(&___asan_globals_registered, JITSymbolFlags::Exported); + asan_crt[mangle("___asan_globals_registered")] = JITEvaluatedSymbol::fromPointer(&jl___asan_globals_registered, JITSymbolFlags::Common | JITSymbolFlags::Exported); #endif cantFail(JD.define(orc::absoluteSymbols(asan_crt))); #endif +#endif } JuliaOJIT::~JuliaOJIT() = default; +ThreadSafeContext JuliaOJIT::makeContext() +{ + auto ctx = std::make_unique(); + #if JL_LLVM_VERSION < 170000 + SetOpaquePointer(*ctx); + #endif + return orc::ThreadSafeContext(std::move(ctx)); +} + orc::SymbolStringPtr JuliaOJIT::mangle(StringRef Name) { std::string MangleName = getMangledName(Name); @@ -1773,40 +2115,32 @@ void JuliaOJIT::addModule(orc::ThreadSafeModule TSM) { JL_TIMING(LLVM_JIT, JIT_Total); ++ModulesAdded; -#ifndef JL_USE_JITLINK - orc::SymbolLookupSet NewExports; - TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT { - for (auto &F : M.global_values()) { - if (!F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { - auto Name = ES.intern(getMangledName(F.getName())); - NewExports.add(std::move(Name)); - } - } - assert(!verifyLLVMIR(M)); - }); -#endif - - auto Err = OptSelLayer.add(JD, std::move(TSM)); + TSM = selectOptLevel(std::move(TSM)); + TSM = (*Optimizers)(std::move(TSM)); + TSM = (*JITPointers)(std::move(TSM)); + auto Lock = TSM.getContext().getLock(); + Module &M = *TSM.getModuleUnlocked(); + // Treat this as if one of the passes might contain a safepoint + // even though that shouldn't be the case and might be unwise + Expected> Obj = CompileLayer.getCompiler()(M); + if (!Obj) { + ES.reportError(Obj.takeError()); + errs() << "Failed to add module to JIT!\n"; + errs() << "Dumping failing module\n" << M << "\n"; + return; + } + { auto release = std::move(Lock); } + auto Err = JuliaOJIT::addObjectFile(JD, std::move(*Obj)); if (Err) { ES.reportError(std::move(Err)); - errs() << "Failed to add module to JIT!\n"; + errs() << "Failed to add objectfile to JIT!\n"; abort(); } -#ifndef JL_USE_JITLINK - // force eager compilation (for now), due to memory management specifics - // (can't handle compilation recursion) - auto Lookups = ES.lookup({{&JD, orc::JITDylibLookupFlags::MatchExportedSymbolsOnly}}, NewExports); - if (!Lookups) { - ES.reportError(Lookups.takeError()); - errs() << "Failed to lookup symbols in module!\n"; - } -#endif } Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, bool ShouldOptimize) { - if (auto Err = TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT -> Error - { + if (auto Err = TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT -> Error { if (M.getDataLayout().isDefault()) M.setDataLayout(DL); if (M.getDataLayout() != DL) @@ -1815,24 +2149,29 @@ Error JuliaOJIT::addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, M.getDataLayout().getStringRepresentation() + " (module) vs " + DL.getStringRepresentation() + " (jit)", inconvertibleErrorCode()); - + // OrcJIT requires that all modules / files have unique names: + 
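addModule above drops its context lock partway through with `{ auto release = std::move(Lock); }`. A sketch of why that one-liner unlocks: moving a std::unique_lock into a temporary that dies at the end of the statement block releases the mutex immediately, without wrapping the rest of the function in a scope.

#include <cstdio>
#include <mutex>

int main() {
    std::mutex m;
    std::unique_lock<std::mutex> Lock(m);
    std::puts("lock held");
    { auto release = std::move(Lock); } // temporary destructs: unlocked here
    std::puts("lock released");
    return Lock.owns_lock() ? 1 : 0;    // 0: Lock is now disengaged
}

The unique-naming requirement noted just above is met with a relaxed atomic counter (the setModuleIdentifier call continues below). A sketch with illustrative names; uniqueness is all that matters, so relaxed ordering suffices, as with jl_atomic_fetch_add_relaxed(&jitcounter, 1):

#include <atomic>
#include <cstdio>
#include <string>

static std::atomic<size_t> jitcounter{0};

static std::string unique_name(const char *prefix) {
    return std::string(prefix) + "-" +
           std::to_string(jitcounter.fetch_add(1, std::memory_order_relaxed));
}

int main() {
    std::printf("%s %s\n", unique_name("jitted").c_str(),
                unique_name("jitted").c_str()); // jitted-0 jitted-1
    return 0;
}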
M.setModuleIdentifier((M.getModuleIdentifier() + Twine("-") + Twine(jl_atomic_fetch_add_relaxed(&jitcounter, 1))).str()); return Error::success(); - })) + })) return Err; + //if (ShouldOptimize) + // return OptimizeLayer.add(JD, std::move(TSM)); return CompileLayer.add(JD.getDefaultResourceTracker(), std::move(TSM)); } Error JuliaOJIT::addObjectFile(orc::JITDylib &JD, std::unique_ptr Obj) { assert(Obj && "Can not add null object"); -#ifdef JL_USE_JITLINK + // OrcJIT requires that all modules / files have unique names: + // https://llvm.org/doxygen/namespacellvm_1_1orc.html#a1f5a1bc60c220cdccbab0f26b2a425e1 + // so we have to force a copy here + std::string Name = ("jitted-" + Twine(jl_atomic_fetch_add_relaxed(&jitcounter, 1))).str(); + Obj = Obj->getMemBufferCopy(Obj->getBuffer(), Name); return ObjectLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); -#else - return LockLayer.add(JD.getDefaultResourceTracker(), std::move(Obj)); -#endif } SmallVector JuliaOJIT::findSymbols(ArrayRef Names) { + // assert(MemMgr.use_count() == 1); (true single-threaded, but slightly race-y to assert it with concurrent threads) DenseMap Unmangled; orc::SymbolLookupSet Exports; for (StringRef Name : Names) { @@ -1978,6 +2317,7 @@ void JuliaOJIT::enableJITDebuggingSupport() addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBAllocAction); auto registerJITLoaderGDBWrapper = addAbsoluteToMap(GDBFunctions,llvm_orc_registerJITLoaderGDBWrapper); cantFail(JD.define(orc::absoluteSymbols(GDBFunctions))); + (void)registerJITLoaderGDBWrapper; if (TM->getTargetTriple().isOSBinFormatMachO()) ObjectLayer.addPlugin(cantFail(orc::GDBJITDebugInfoRegistrationPlugin::Create(ES, JD, TM->getTargetTriple()))); #ifndef _COMPILER_ASAN_ENABLED_ // TODO: Fix duplicated sections spam #51794 @@ -2013,12 +2353,12 @@ void JuliaOJIT::enableOProfileJITEventListener() void JuliaOJIT::enablePerfJITEventListener() { #if JL_LLVM_VERSION >= 180000 - orc::SymbolMap PerfFunctions; - auto StartAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfStart); - auto EndAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfEnd); - auto ImplAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfImpl); - cantFail(JD.define(orc::absoluteSymbols(PerfFunctions))); if (TM->getTargetTriple().isOSBinFormatELF()) { + orc::SymbolMap PerfFunctions; + auto StartAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfStart); + auto EndAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfEnd); + auto ImplAddr = addAbsoluteToMap(PerfFunctions,llvm_orc_registerJITLoaderPerfImpl); + cantFail(JD.define(orc::absoluteSymbols(PerfFunctions))); ObjectLayer.addPlugin(cantFail(DebugInfoPreservationPlugin::Create())); //ObjectLayer.addPlugin(cantFail(PerfSupportPlugin::Create( // ES.getExecutorProcessControl(), *JD, true, true))); @@ -2032,7 +2372,7 @@ void JuliaOJIT::enablePerfJITEventListener() void JuliaOJIT::RegisterJITEventListener(JITEventListener *L) { if (L) - ObjectLayer.registerJITEventListener(*L); + UnlockedObjectLayer.registerJITEventListener(*L); } void JuliaOJIT::enableJITDebuggingSupport() { @@ -2071,7 +2411,7 @@ std::string JuliaOJIT::getMangledName(const GlobalValue *GV) size_t JuliaOJIT::getTotalBytes() const { - auto bytes = jit_bytes_size.load(std::memory_order_relaxed); + auto bytes = jl_atomic_load_relaxed(&jit_bytes_size); #ifndef JL_USE_JITLINK size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT; bytes += 
getRTDyldMemoryManagerTotalBytes(MemMgr.get()); @@ -2081,7 +2421,7 @@ size_t JuliaOJIT::getTotalBytes() const void JuliaOJIT::addBytes(size_t bytes) { - jit_bytes_size.fetch_add(bytes, std::memory_order_relaxed); + jl_atomic_fetch_add_relaxed(&jit_bytes_size, bytes); } void JuliaOJIT::printTimers() @@ -2326,74 +2666,6 @@ static void jl_decorate_module(Module &M) { #undef ASM_USES_ELF } -#ifndef JL_USE_JITLINK -// Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable -static int jl_add_to_ee( - orc::ThreadSafeModule &M, - const StringMap &NewExports, - DenseMap &Queued, - SmallVectorImpl &Stack) -{ - // First check if the TSM is empty (already compiled) - if (!M) - return 0; - // Next check and record if it is on the stack somewhere - { - auto &Id = Queued[&M]; - if (Id) - return Id; - Stack.push_back(&M); - Id = Stack.size(); - } - // Finally work out the SCC - int depth = Stack.size(); - int MergeUp = depth; - SmallVector Children; - M.withModuleDo([&](Module &m) JL_NOTSAFEPOINT { - for (auto &F : m.global_objects()) { - if (F.isDeclaration() && F.getLinkage() == GlobalValue::ExternalLinkage) { - auto Callee = NewExports.find(F.getName()); - if (Callee != NewExports.end()) { - auto *CM = Callee->second; - if (*CM && CM != &M) { - auto Down = Queued.find(CM); - if (Down != Queued.end()) - MergeUp = std::min(MergeUp, Down->second); - else - Children.push_back(CM); - } - } - } - } - }); - assert(MergeUp > 0); - for (auto *CM : Children) { - int Down = jl_add_to_ee(*CM, NewExports, Queued, Stack); - assert(Down <= (int)Stack.size()); - if (Down) - MergeUp = std::min(MergeUp, Down); - } - if (MergeUp < depth) - return MergeUp; - while (1) { - // Not in a cycle (or at the top of it) - // remove SCC state and merge every CM from the cycle into M - orc::ThreadSafeModule *CM = Stack.back(); - auto it = Queued.find(CM); - assert(it->second == (int)Stack.size()); - Queued.erase(it); - Stack.pop_back(); - if ((int)Stack.size() < depth) { - assert(&M == CM); - break; - } - jl_merge_module(M, std::move(*CM)); - } - jl_ExecutionEngine->addModule(std::move(M)); - return 0; -} -#endif - // helper function for adding a DLLImport (dlsym) address to the execution engine void add_named_global(StringRef name, void *addr) { diff --git a/src/jitlayers.h b/src/jitlayers.h index f4b9a6ea5395a..ba4ac3081795e 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -69,7 +69,6 @@ using namespace llvm; extern "C" jl_cgparams_t jl_default_cgparams; -extern arraylist_t new_invokes; DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeContext, LLVMOrcThreadSafeContextRef) DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeModule, LLVMOrcThreadSafeModuleRef) @@ -154,11 +153,11 @@ struct jl_locked_stream { std::unique_lock lck; ios_t *&stream; - lock(std::mutex &mutex, ios_t *&stream) JL_NOTSAFEPOINT + lock(std::mutex &mutex, ios_t *&stream) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER : lck(mutex), stream(stream) {} lock(lock&) = delete; lock(lock&&) JL_NOTSAFEPOINT = default; - ~lock() JL_NOTSAFEPOINT = default; + ~lock() JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT = default; ios_t *&operator*() JL_NOTSAFEPOINT { return stream; @@ -177,8 +176,8 @@ struct jl_locked_stream { } }; - jl_locked_stream() JL_NOTSAFEPOINT = default; - ~jl_locked_stream() JL_NOTSAFEPOINT = default; + jl_locked_stream() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER = default; + ~jl_locked_stream() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE = default; lock operator*() JL_NOTSAFEPOINT { return lock(mutex, stream); @@ -210,12 
+209,12 @@ struct jl_codegen_call_target_t { jl_returninfo_t::CallingConv cc; unsigned return_roots; llvm::Function *decl; + llvm::Function *oc; bool specsig; }; typedef SmallVector, 0> jl_workqueue_t; -// TODO DenseMap? -typedef std::map> jl_compiled_functions_t; + typedef std::list> CallFrames; struct jl_codegen_params_t { orc::ThreadSafeContext tsctx; @@ -229,7 +228,6 @@ struct jl_codegen_params_t { typedef StringMap SymMapGV; // outputs jl_workqueue_t workqueue; - jl_compiled_functions_t compiled_functions; std::map global_targets; std::map, GlobalVariable*> external_fns; std::map ditypes; @@ -292,13 +290,20 @@ enum CompilationPolicy { Extern = 1, }; -void jl_compile_workqueue( - jl_codegen_params_t ¶ms, - CompilationPolicy policy); - Function *jl_cfunction_object(jl_function_t *f, jl_value_t *rt, jl_tupletype_t *argt, jl_codegen_params_t ¶ms); +Function *emit_tojlinvoke(jl_code_instance_t *codeinst, StringRef theFptrName, Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT; +void emit_specsig_to_fptr1( + Function *gf_thunk, jl_returninfo_t::CallingConv cc, unsigned return_roots, + jl_value_t *calltype, jl_value_t *rettype, bool is_for_opaque_closure, + size_t nargs, + jl_codegen_params_t ¶ms, + Function *target, + size_t min_world, size_t max_world) JL_NOTSAFEPOINT; +Function *get_or_emit_fptr1(StringRef Name, Module *M) JL_NOTSAFEPOINT; +void jl_init_function(Function *F, const Triple &TT) JL_NOTSAFEPOINT; + void add_named_global(StringRef name, void *addr) JL_NOTSAFEPOINT; static inline Constant *literal_static_pointer_val(const void *p, Type *T) JL_NOTSAFEPOINT @@ -371,6 +376,11 @@ using OptimizerResultT = Expected; using SharedBytesT = StringSet::MapEntryTy)>>; class JuliaOJIT { +private: + // any verification the user wants to do when adding an OwningResource to the pool + template + static void verifyResource(AnyT &resource) JL_NOTSAFEPOINT { } + static void verifyResource(orc::ThreadSafeContext &context) JL_NOTSAFEPOINT { assert(context.getContext()); } public: #ifdef JL_USE_JITLINK typedef orc::ObjectLinkingLayer ObjLayerT; @@ -385,13 +395,13 @@ class JuliaOJIT { std::unique_ptr O) override { JL_TIMING(LLVM_JIT, JIT_Link); #ifndef JL_USE_JITLINK - std::lock_guard lock(EmissionMutex); + std::lock_guard lock(EmissionMutex); #endif BaseLayer.emit(std::move(R), std::move(O)); } private: orc::ObjectLayer &BaseLayer; - std::mutex EmissionMutex; + std::recursive_mutex EmissionMutex; }; #endif typedef orc::IRCompileLayer CompileLayerT; @@ -420,11 +430,16 @@ class JuliaOJIT { : pool(pool), resource(std::move(resource)) {} OwningResource(const OwningResource &) = delete; OwningResource &operator=(const OwningResource &) = delete; - OwningResource(OwningResource &&) JL_NOTSAFEPOINT = default; + OwningResource(OwningResource &&other) JL_NOTSAFEPOINT + : pool(other.pool), resource(std::move(other.resource)) { + other.resource.reset(); + } OwningResource &operator=(OwningResource &&) JL_NOTSAFEPOINT = default; ~OwningResource() JL_NOTSAFEPOINT { // _LEAVE - if (resource) + if (resource) { + verifyResource(*resource); pool.release(std::move(*resource)); + } } ResourceT release() JL_NOTSAFEPOINT { ResourceT res(std::move(*resource)); @@ -510,7 +525,11 @@ class JuliaOJIT { std::unique_ptr mutex; }; + typedef ResourcePool> ContextPoolT; + struct DLSymOptimizer; + struct OptimizerT; + struct JITPointersT; #ifndef JL_USE_JITLINK void RegisterJITEventListener(JITEventListener *L) JL_NOTSAFEPOINT; @@ -528,7 +547,7 @@ class JuliaOJIT { orc::SymbolStringPtr mangle(StringRef Name) JL_NOTSAFEPOINT; 
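The OwningResource move constructor above gained an explicit `other.resource.reset()`. A sketch of the std::optional pitfall it guards against: moving an optional leaves the source engaged (holding a moved-from value), so a destructor gated on `if (resource)` would hand the resource back to the pool twice.

#include <cassert>
#include <optional>
#include <string>

struct OwningHandle {
    std::optional<std::string> resource;
    explicit OwningHandle(std::string s) : resource(std::move(s)) {}
    OwningHandle(OwningHandle &&other) : resource(std::move(other.resource)) {
        other.resource.reset(); // explicitly disengage the source, as in the diff
    }
    ~OwningHandle() { /* if (resource) pool.release(std::move(*resource)); */ }
};

int main() {
    OwningHandle a("ctx");
    OwningHandle b(std::move(a));
    assert(!a.resource && b.resource); // a defaulted move would leave 'a' engaged
    return 0;
}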
void addGlobalMapping(StringRef Name, uint64_t Addr) JL_NOTSAFEPOINT; - void addModule(orc::ThreadSafeModule M) JL_NOTSAFEPOINT; + void addModule(orc::ThreadSafeModule M) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER; //Methods for the C API Error addExternalModule(orc::JITDylib &JD, orc::ThreadSafeModule TSM, @@ -552,15 +571,7 @@ class JuliaOJIT { uint64_t getGlobalValueAddress(StringRef Name) JL_NOTSAFEPOINT; uint64_t getFunctionAddress(StringRef Name) JL_NOTSAFEPOINT; StringRef getFunctionAtAddress(uint64_t Addr, jl_callptr_t invoke, jl_code_instance_t *codeinst) JL_NOTSAFEPOINT; - auto getContext() JL_NOTSAFEPOINT { - return *ContextPool; - } - orc::ThreadSafeContext acquireContext() { // JL_NOTSAFEPOINT_ENTER? - return ContextPool.acquire(); - } - void releaseContext(orc::ThreadSafeContext &&ctx) { // JL_NOTSAFEPOINT_LEAVE? - ContextPool.release(std::move(ctx)); - } + orc::ThreadSafeContext makeContext() JL_NOTSAFEPOINT; const DataLayout& getDataLayout() const JL_NOTSAFEPOINT; // TargetMachine pass-through methods @@ -576,22 +587,21 @@ class JuliaOJIT { void addBytes(size_t bytes) JL_NOTSAFEPOINT; void printTimers() JL_NOTSAFEPOINT; - jl_locked_stream &get_dump_emitted_mi_name_stream() JL_NOTSAFEPOINT { + jl_locked_stream &get_dump_emitted_mi_name_stream() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER { return dump_emitted_mi_name_stream; } - jl_locked_stream &get_dump_compiles_stream() JL_NOTSAFEPOINT { + jl_locked_stream &get_dump_compiles_stream() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER { return dump_compiles_stream; } - jl_locked_stream &get_dump_llvm_opt_stream() JL_NOTSAFEPOINT { + jl_locked_stream &get_dump_llvm_opt_stream() JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER { return dump_llvm_opt_stream; } std::string getMangledName(StringRef Name) JL_NOTSAFEPOINT; std::string getMangledName(const GlobalValue *GV) JL_NOTSAFEPOINT; - // Note that this is a safepoint due to jl_get_library_ and jl_dlsym calls - void optimizeDLSyms(Module &M); - - jl_mutex_t jitlock; + // Note that this is a potential safepoint due to jl_get_library_ and jl_dlsym calls + // but may be called from inside safe-regions due to jit compilation locks + void optimizeDLSyms(Module &M) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER; private: @@ -618,20 +628,20 @@ class JuliaOJIT { std::mutex llvm_printing_mutex{}; SmallVector, 0> PrintLLVMTimers; - ResourcePool> ContextPool; - - std::atomic jit_bytes_size{0}; -#ifndef JL_USE_JITLINK - const std::shared_ptr MemMgr; -#else + _Atomic(size_t) jit_bytes_size{0}; + _Atomic(size_t) jitcounter{0}; +#ifdef JL_USE_JITLINK const std::unique_ptr MemMgr; -#endif ObjLayerT ObjectLayer; -#ifndef JL_USE_JITLINK - LockLayerT LockLayer; +#else + const std::shared_ptr MemMgr; // shared_ptr protected by LockLayerT.EmissionMutex + ObjLayerT UnlockedObjectLayer; + LockLayerT ObjectLayer; #endif CompileLayerT CompileLayer; + std::unique_ptr JITPointers; JITPointersLayerT JITPointersLayer; + std::unique_ptr Optimizers; OptimizeLayerT OptimizeLayer; OptSelLayerT OptSelLayer; }; diff --git a/src/julia.h b/src/julia.h index 7bb5f31eda708..168ba0deff1ec 100644 --- a/src/julia.h +++ b/src/julia.h @@ -426,8 +426,8 @@ typedef struct _jl_opaque_closure_t { jl_value_t *captures; size_t world; jl_method_t *source; - jl_fptr_args_t invoke; - void *specptr; + jl_fptr_args_t invoke; // n.b. despite the similar name, this is not an invoke ABI (jl_call_t / julia.call2), but rather the fptr1 (jl_fptr_args_t / julia.call) ABI + void *specptr; // n.b. 
despite the similarity in field name, this is not arbitrary private data for jlcall, but rather the codegen ABI for specsig, and is mandatory if specsig is valid
 } jl_opaque_closure_t;
 
 // This type represents an executable operation
@@ -475,7 +475,7 @@ typedef struct _jl_code_instance_t {
     // & 0b100 == From image
     _Atomic(uint8_t) precompile;  // if set, this will be added to the output system image
     uint8_t relocatability; // nonzero if all roots are built into sysimg or tagged by module key
-    _Atomic(jl_callptr_t) invoke; // jlcall entry point
+    _Atomic(jl_callptr_t) invoke; // usually the jlcall entry point, but if this codeinst belongs to an OC Method it is a jl_fptr_args_t fptr1 instead, unless it holds one of the special token objects rather than a function pointer
     union _jl_generic_specptr_t {
         _Atomic(void*) fptr;
         _Atomic(jl_fptr_args_t) fptr1;
@@ -2339,7 +2339,13 @@ JL_DLLEXPORT JL_CONST_FUNC jl_gcframe_t **(jl_get_pgcstack)(void) JL_GLOBALLY_RO
 extern JL_DLLIMPORT int jl_task_gcstack_offset;
 extern JL_DLLIMPORT int jl_task_ptls_offset;
 
+#ifdef __cplusplus
+}
+#endif
 #include "julia_locks.h" // requires jl_task_t definition
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 // Return the exception currently being handled, or `jl_nothing`.
 //
diff --git a/src/julia_atomics.h b/src/julia_atomics.h
index c094afcc54cd5..d05f0fafab28f 100644
--- a/src/julia_atomics.h
+++ b/src/julia_atomics.h
@@ -103,12 +103,12 @@ enum jl_memory_order {
 // this wrong thus we include the correct definitions here (with implicit
 // conversion), instead of using the macro version
 template<class T>
-T jl_atomic_load(std::atomic<T> *ptr)
+T jl_atomic_load(const std::atomic<T> *ptr)
 {
     return std::atomic_load(ptr);
 }
 template<class T>
-T jl_atomic_load_explicit(std::atomic<T> *ptr, std::memory_order order)
+T jl_atomic_load_explicit(const std::atomic<T> *ptr, std::memory_order order)
 {
     return std::atomic_load_explicit(ptr, order);
 }
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 82c91c6d073af..bb8169c6e5f9e 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1715,13 +1715,14 @@ JL_DLLEXPORT int jl_tupletype_length_compat(jl_value_t *v, size_t nargs) JL_NOTS
 JL_DLLEXPORT jl_value_t *jl_argtype_with_function(jl_value_t *f, jl_value_t *types0);
 JL_DLLEXPORT jl_value_t *jl_argtype_with_function_type(jl_value_t *ft JL_MAYBE_UNROOTED, jl_value_t *types0);
+JL_DLLEXPORT jl_value_t *jl_argtype_without_function(jl_value_t *ftypes);
 JL_DLLEXPORT unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *field_type);
 
-void register_eh_frames(uint8_t *Addr, size_t Size);
-void deregister_eh_frames(uint8_t *Addr, size_t Size);
+void register_eh_frames(uint8_t *Addr, size_t Size) JL_NOTSAFEPOINT;
+void deregister_eh_frames(uint8_t *Addr, size_t Size) JL_NOTSAFEPOINT;
 
-STATIC_INLINE void *jl_get_frame_addr(void)
+STATIC_INLINE void *jl_get_frame_addr(void) JL_NOTSAFEPOINT
 {
 #ifdef __GNUC__
     return __builtin_frame_address(0);
diff --git a/src/julia_locks.h b/src/julia_locks.h
index 5774ddada60c6..4d1345177f965 100644
--- a/src/julia_locks.h
+++ b/src/julia_locks.h
@@ -103,6 +103,33 @@ JL_DLLEXPORT void jl_unlock_field(jl_mutex_t *v) JL_NOTSAFEPOINT;
 
 #ifdef __cplusplus
 }
+
+#include <condition_variable>
+#include <mutex>
+// simple C++ shim around a std::unique_lock + gc-safe + disabled finalizers region,
+// since we nearly always want that combination together
+class jl_unique_gcsafe_lock {
+public:
+    int8_t gc_state;
+    std::unique_lock<std::mutex> native;
+    explicit jl_unique_gcsafe_lock(std::mutex &native) JL_NOTSAFEPOINT_ENTER
+    {
+        jl_task_t *ct = jl_current_task;
+        gc_state = jl_gc_safe_enter(ct->ptls);
+        this->native = std::unique_lock<std::mutex>(native);
+        ct->ptls->engine_nqueued++; // disables finalizers until inference is finished on this method graph
+    }
+    jl_unique_gcsafe_lock(jl_unique_gcsafe_lock &&native) = delete;
+    jl_unique_gcsafe_lock(jl_unique_gcsafe_lock &native) = delete;
+    ~jl_unique_gcsafe_lock() JL_NOTSAFEPOINT_LEAVE {
+        jl_task_t *ct = jl_current_task;
+        jl_gc_safe_leave(ct->ptls, gc_state);
+        ct->ptls->engine_nqueued--; // enable finalizers (but don't run them until the next gc)
+    }
+    void wait(std::condition_variable& cond) JL_NOTSAFEPOINT {
+        cond.wait(native);
+    }
+};
 #endif
 
 #endif
diff --git a/src/opaque_closure.c b/src/opaque_closure.c
index 0bf3a729cbcb1..9fe36f32d2030 100644
--- a/src/opaque_closure.c
+++ b/src/opaque_closure.c
@@ -80,14 +80,16 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t
     if (!jl_subtype(rt_lb, selected_rt)) {
         // TODO: It would be better to try to get a specialization with the
         // correct rt check here (or we could codegen a wrapper).
-        specptr = NULL; invoke = (jl_fptr_args_t)jl_interpret_opaque_closure;
+        specptr = NULL; // this will force codegen of the unspecialized version
+        invoke = (jl_fptr_args_t)jl_interpret_opaque_closure;
         jl_value_t *ts[2] = {rt_lb, (jl_value_t*)ci->rettype};
         selected_rt = jl_type_union(ts, 2);
     }
     if (!jl_subtype(ci->rettype, rt_ub)) {
         // TODO: It would be better to try to get a specialization with the
         // correct rt check here (or we could codegen a wrapper).
-        specptr = NULL; invoke = (jl_fptr_args_t)jl_interpret_opaque_closure;
+        specptr = NULL; // this will force codegen of the unspecialized version
+        invoke = (jl_fptr_args_t)jl_interpret_opaque_closure;
         selected_rt = jl_type_intersection(rt_ub, selected_rt);
     }
 
@@ -108,8 +110,7 @@ static jl_opaque_closure_t *new_opaque_closure(jl_tupletype_t *argt, jl_value_t
     jl_value_t *oc_type JL_ALWAYS_LEAFTYPE = jl_apply_type2((jl_value_t*)jl_opaque_closure_type, (jl_value_t*)argt, selected_rt);
     JL_GC_PROMISE_ROOTED(oc_type);
-    if (!specptr) {
-        sigtype = jl_argtype_with_function_type((jl_value_t*)oc_type, (jl_value_t*)argt);
+    if (specptr == NULL) {
         jl_method_instance_t *mi_generic = jl_specializations_get_linfo(jl_opaque_closure_method, sigtype, jl_emptysvec);
 
         // OC wrapper methods are not world dependent
@@ -197,7 +198,7 @@ int jl_tupletype_length_compat(jl_value_t *v, size_t nargs)
 
 JL_CALLABLE(jl_f_opaque_closure_call)
 {
-    jl_opaque_closure_t* oc = (jl_opaque_closure_t*)F;
+    jl_opaque_closure_t *oc = (jl_opaque_closure_t*)F;
     jl_value_t *argt = jl_tparam0(jl_typeof(oc));
     if (!jl_tupletype_length_compat(argt, nargs))
         jl_method_error(F, args, nargs + 1, oc->world);
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 09d51598ea8b7..f8935070bb001 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -980,3 +980,9 @@ extern "C" JL_DLLEXPORT_CODEGEN
 ::llvm::PassPluginLibraryInfo llvmGetPassPluginInfo() JL_NOTSAFEPOINT {
     return {LLVM_PLUGIN_API_VERSION, "Julia", "1", registerCallbacks};
 }
+
+void addTargetPasses(legacy::PassManagerBase *PM, const Triple &triple, TargetIRAnalysis analysis)
+{
+    PM->add(new TargetLibraryInfoWrapperPass(triple));
+    PM->add(createTargetTransformInfoWrapperPass(std::move(analysis)));
+}
diff --git a/src/stackwalk.c b/src/stackwalk.c
index 6784e601bcfba..7c6f946fe73c5 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -642,13 +642,13 @@ void jl_print_native_codeloc(uintptr_t ip) JL_NOTSAFEPOINT
     for (i = 0; i < n; i++) {
         jl_frame_t frame = frames[i];
         if (!frame.func_name) {
-            jl_safe_printf("unknown function (ip: %p)\n", (void*)ip);
+            jl_safe_printf("unknown function (ip: %p) at %s\n", (void*)ip, frame.file_name ? frame.file_name : "(unknown file)");
         }
         else {
             jl_safe_print_codeloc(frame.func_name, frame.file_name, frame.line, frame.inlined);
             free(frame.func_name);
-            free(frame.file_name);
         }
+        free(frame.file_name);
     }
     free(frames);
 }
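The julia_atomics.h hunk above adds const to the load wrappers; std::atomic_load_explicit has accepted a pointer-to-const atomic since C++11, so the wrappers simply forward. A sketch:

#include <atomic>

template <typename T>
T load_relaxed(const std::atomic<T> *ptr) {
    return std::atomic_load_explicit(ptr, std::memory_order_relaxed);
}

int main() {
    std::atomic<int> counter{42};
    const std::atomic<int> *view = &counter; // read-only alias now accepted
    return load_relaxed(view) == 42 ? 0 : 1;
}

And a sketch of how a jl_unique_gcsafe_lock-style wrapper is used with a condition variable, minus the GC-state and finalizer bookkeeping; scoped_wait_lock is an illustrative name, not Julia's:

#include <condition_variable>
#include <mutex>
#include <thread>

// An owned unique_lock plus a wait() helper, so callers never touch the
// mutex directly, matching the shim's wait(cond) member above.
struct scoped_wait_lock {
    std::unique_lock<std::mutex> native;
    explicit scoped_wait_lock(std::mutex &m) : native(m) {}
    void wait(std::condition_variable &cond) { cond.wait(native); }
};

static std::mutex m;
static std::condition_variable cv;
static bool ready = false;

int main() {
    std::thread t([] {
        { std::lock_guard<std::mutex> g(m); ready = true; }
        cv.notify_one();
    });
    scoped_wait_lock lk(m);
    while (!ready)
        lk.wait(cv); // releases 'native' while blocked, reacquires on wake
    t.join();
    return 0;
}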