From 70dd008a23d9e504b456eed22795dc27fb4aae13 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 5 Oct 2025 15:50:43 +0800 Subject: [PATCH 1/7] Enable TSAN with FULL4G and T2C support ThreadSanitizer (TSAN) can now detect race conditions across the entire multi-threaded JIT pipeline with full 4GB address space emulation. This enables testing of the tier-2 LLVM compilation thread while maintaining production memory layout. Memory Layout (TSAN-compatible): - Main memory: MAP_FIXED at 0x7d0000000000 (4GB) - JIT buffer: MAP_FIXED at 0x7d1000000000 - Both allocations within TSAN app range (0x7cf000000000 - 0x7ffffffff000) - Prevents conflicts with TSAN shadow memory (0x02a000000000 - 0x7cefffffffff) ASLR Mitigation: - Added setarch -R wrapper for TSAN test execution - Disables ASLR to prevent random allocations in shadow memory - Only affects test runs, not production builds SDL Conflict Resolution: - SDL (uninstrumented system library) creates threads TSAN cannot track - Disabled SDL when TSAN enabled to focus on built-in race detection - Production builds still fully support SDL --- Makefile | 30 ++++++++++++++++++++++- src/emulate.c | 25 +++++++++++++++---- src/io.c | 21 ++++++++++++++++ src/jit.c | 22 +++++++++++++++-- src/main.c | 22 +++++++++++++++++ src/riscv.c | 59 ++++++++++++++++++++++++++++++++++++--------- src/riscv_private.h | 3 ++- src/t2c.c | 4 ++- 8 files changed, 165 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 4481ed3a..5a34979b 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,28 @@ endif ENABLE_ARCH_TEST ?= 0 $(call set-feature, ARCH_TEST) +# ThreadSanitizer support +# TSAN on x86-64 memory layout: +# Shadow: 0x02a000000000 - 0x7cefffffffff (reserved by TSAN) +# App: 0x7cf000000000 - 0x7ffffffff000 (usable by application) +# +# We use MAP_FIXED to allocate FULL4G's 4GB memory at a fixed address +# (0x7d0000000000) within TSAN's app range, ensuring compatibility. 
+# +# IMPORTANT: TSAN requires ASLR (Address Space Layout Randomization) to be +# disabled to prevent system allocations from landing in TSAN's shadow memory. +# Tests are run with 'setarch $(uname -m) -R' to disable ASLR. +ENABLE_TSAN ?= 0 +ifeq ("$(ENABLE_TSAN)", "1") +override ENABLE_SDL := 0 # SDL (uninstrumented system lib) creates threads TSAN cannot track +override ENABLE_LTO := 0 # LTO interferes with TSAN instrumentation +CFLAGS += -DTSAN_ENABLED # Signal code to use TSAN-compatible allocations +# Disable ASLR for TSAN tests to prevent allocations in TSAN shadow memory +BIN_WRAPPER = setarch $(shell uname -m) -R +else +BIN_WRAPPER = +endif + # Enable link-time optimization (LTO) ENABLE_LTO ?= 1 ifeq ($(call has, LTO), 1) @@ -332,6 +354,12 @@ CFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all LDFLAGS += -fsanitize=undefined -fno-sanitize=alignment -fno-sanitize-recover=all endif +# ThreadSanitizer flags (ENABLE_TSAN is set earlier to override SDL/LTO) +ifeq ("$(ENABLE_TSAN)", "1") +CFLAGS += -fsanitize=thread -g +LDFLAGS += -fsanitize=thread +endif + $(OUT)/emulate.o: CFLAGS += -foptimize-sibling-calls -fomit-frame-pointer -fno-stack-check -fno-stack-protector # .DEFAULT_GOAL should be set to all since the very first target is not all @@ -445,7 +473,7 @@ define check-test $(Q)true; \ $(PRINTF) "Running $(3) ... 
"; \ OUTPUT_FILE="$$(mktemp)"; \ -if (LC_ALL=C $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ +if (LC_ALL=C $(BIN_WRAPPER) $(BIN) $(1) $(2) > "$$OUTPUT_FILE") && \ [ "$$(cat "$$OUTPUT_FILE" | $(LOG_FILTER) | $(4))" = "$(5)" ]; then \ $(call notice, [OK]); \ else \ diff --git a/src/emulate.c b/src/emulate.c index e5e4cddf..b97c9493 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -304,6 +304,7 @@ static block_t *block_alloc(riscv_t *rv) block->hot2 = false; block->has_loops = false; block->n_invoke = 0; + block->func = NULL; INIT_LIST_HEAD(&block->list); #if RV32_HAS(T2C) block->compiled = false; @@ -1176,22 +1177,32 @@ void rv_step(void *arg) #if RV32_HAS(JIT) #if RV32_HAS(T2C) /* executed through the tier-2 JIT compiler */ - if (block->hot2) { + /* Use acquire semantics to ensure we see func write before using it */ + if (__atomic_load_n(&block->hot2, __ATOMIC_ACQUIRE)) { ((exec_t2c_func_t) block->func)(rv); prev = NULL; continue; } /* check if invoking times of t1 generated code exceed threshold */ - else if (!block->compiled && block->n_invoke >= THRESHOLD) { - block->compiled = true; + else if (!__atomic_load_n(&block->compiled, __ATOMIC_RELAXED) && + __atomic_load_n(&block->n_invoke, __ATOMIC_RELAXED) >= + THRESHOLD) { + __atomic_store_n(&block->compiled, true, __ATOMIC_RELAXED); queue_entry_t *entry = malloc(sizeof(queue_entry_t)); if (unlikely(!entry)) { /* Malloc failed - reset compiled flag to allow retry later */ - block->compiled = false; + __atomic_store_n(&block->compiled, false, __ATOMIC_RELAXED); continue; } - entry->block = block; + /* Store cache key instead of pointer to prevent use-after-free */ +#if RV32_HAS(SYSTEM) + entry->key = + (uint64_t) block->pc_start | ((uint64_t) block->satp << 32); +#else + entry->key = (uint64_t) block->pc_start; +#endif pthread_mutex_lock(&rv->wait_queue_lock); list_add(&entry->list, &rv->wait_queue); + pthread_cond_signal(&rv->wait_queue_cond); pthread_mutex_unlock(&rv->wait_queue_lock); } #endif @@ -1203,7 +1214,11 @@ 
void rv_step(void *arg) * entry in compiled binary buffer. */ if (block->hot) { +#if RV32_HAS(T2C) + __atomic_fetch_add(&block->n_invoke, 1, __ATOMIC_RELAXED); +#else block->n_invoke++; +#endif ((exec_block_func_t) state->buf)( rv, (uintptr_t) (state->buf + block->offset)); prev = NULL; diff --git a/src/io.c b/src/io.c index 4ff325d3..1e5b73b9 100644 --- a/src/io.c +++ b/src/io.c @@ -27,12 +27,33 @@ memory_t *memory_new(uint32_t size) return NULL; assert(mem); #if HAVE_MMAP +#if defined(TSAN_ENABLED) && defined(__x86_64__) + /* ThreadSanitizer compatibility: Use MAP_FIXED to allocate at a specific + * address within TSAN's app range (0x7cf000000000 - 0x7ffffffff000). + * + * Fixed address: 0x7d0000000000 + * Size: up to 4GB (0x100000000) + * End: 0x7d0100000000 (well within app range) + * + * This guarantees the allocation won't land in TSAN's shadow memory, + * preventing "unexpected memory mapping" errors. + */ + void *fixed_addr = (void *) 0x7d0000000000UL; + data_memory_base = mmap(fixed_addr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (data_memory_base == MAP_FAILED) { + free(mem); + return NULL; + } +#else + /* Standard allocation without TSAN */ data_memory_base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (data_memory_base == MAP_FAILED) { free(mem); return NULL; } +#endif #else data_memory_base = malloc(size); if (!data_memory_base) { diff --git a/src/jit.c b/src/jit.c index a6dfdb70..631b1554 100644 --- a/src/jit.c +++ b/src/jit.c @@ -2336,6 +2336,25 @@ struct jit_state *jit_state_init(size_t size) state->offset = 0; state->size = size; +#if defined(TSAN_ENABLED) && defined(__x86_64__) + /* ThreadSanitizer compatibility: Allocate JIT code buffer at a fixed + * address above the main memory region to avoid conflicts. 
+ * + * Main memory: 0x7d0000000000 - 0x7d0100000000 (4GB for FULL4G) + * JIT buffer: 0x7d1000000000 + size + * + * This keeps both allocations in TSAN's app range (0x7cf000000000 - + * 0x7ffffffff000) and prevents overlap with main memory or TSAN shadow. + */ + void *jit_addr = (void *) 0x7d1000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (state->buf == MAP_FAILED) { + free(state); + return NULL; + } +#else + /* Standard allocation without TSAN */ state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS #if defined(__APPLE__) @@ -2347,8 +2366,7 @@ struct jit_state *jit_state_init(size_t size) free(state); return NULL; } - assert(state->buf != MAP_FAILED); - +#endif state->n_blocks = 0; set_reset(&state->set); reset_reg(); diff --git a/src/main.c b/src/main.c index 4c851edd..45374bb1 100644 --- a/src/main.c +++ b/src/main.c @@ -19,6 +19,28 @@ #include "riscv.h" #include "utils.h" +/* ThreadSanitizer configuration for FULL4G compatibility + * + * We use MAP_FIXED to allocate emulated memory at 0x7d0000000000, which is + * within TSAN's application memory range (0x7cf000000000 - 0x7ffffffff000). + * This avoids conflicts with TSAN's shadow memory and allows race detection + * to work with FULL4G's 4GB address space. + * + * Configuration optimizes for race detection with minimal overhead. 
+ */ +#if defined(__SANITIZE_THREAD__) +const char *__tsan_default_options() +{ + return "halt_on_error=0" /* Continue after errors */ + ":report_bugs=1" /* Report data races */ + ":second_deadlock_stack=1" /* Full deadlock info */ + ":verbosity=0" /* Reduce noise */ + ":memory_limit_mb=0" /* No memory limit */ + ":history_size=7" /* Larger race detection window */ + ":io_sync=0"; /* Don't sync on I/O */ +} +#endif + /* enable program trace mode */ #if !RV32_HAS(SYSTEM) || (RV32_HAS(SYSTEM) && RV32_HAS(ELF_LOADER)) static bool opt_trace = false; diff --git a/src/riscv.c b/src/riscv.c index b892cf27..e500aa67 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -206,19 +206,41 @@ static pthread_t t2c_thread; static void *t2c_runloop(void *arg) { riscv_t *rv = (riscv_t *) arg; + pthread_mutex_lock(&rv->wait_queue_lock); while (!rv->quit) { - if (!list_empty(&rv->wait_queue)) { - queue_entry_t *entry = - list_last_entry(&rv->wait_queue, queue_entry_t, list); - pthread_mutex_lock(&rv->wait_queue_lock); - list_del_init(&entry->list); - pthread_mutex_unlock(&rv->wait_queue_lock); - pthread_mutex_lock(&rv->cache_lock); - t2c_compile(rv, entry->block); - pthread_mutex_unlock(&rv->cache_lock); - free(entry); - } + /* Wait for work or quit signal */ + while (list_empty(&rv->wait_queue) && !rv->quit) + pthread_cond_wait(&rv->wait_queue_cond, &rv->wait_queue_lock); + + if (rv->quit) + break; + + /* Extract work item while holding the lock */ + queue_entry_t *entry = + list_last_entry(&rv->wait_queue, queue_entry_t, list); + list_del_init(&entry->list); + pthread_mutex_unlock(&rv->wait_queue_lock); + + /* Perform compilation with cache lock */ + pthread_mutex_lock(&rv->cache_lock); + /* Look up block from cache using the key (might have been evicted) */ + uint32_t pc = (uint32_t) entry->key; + block_t *block = (block_t *) cache_get(rv->block_cache, pc, false); +#if RV32_HAS(SYSTEM) + /* Verify SATP matches (for system mode) */ + uint32_t satp = (uint32_t) (entry->key >> 32); + if 
(block && block->satp != satp) + block = NULL; +#endif + /* Compile only if block still exists in cache */ + if (block) + t2c_compile(rv, block); + pthread_mutex_unlock(&rv->cache_lock); + free(entry); + + pthread_mutex_lock(&rv->wait_queue_lock); } + pthread_mutex_unlock(&rv->wait_queue_lock); return NULL; } #endif @@ -777,6 +799,7 @@ riscv_t *rv_create(riscv_user_t rv_attr) /* prepare wait queue. */ pthread_mutex_init(&rv->wait_queue_lock, NULL); pthread_mutex_init(&rv->cache_lock, NULL); + pthread_cond_init(&rv->wait_queue_cond, NULL); INIT_LIST_HEAD(&rv->wait_queue); /* activate the background compilation thread. */ pthread_create(&t2c_thread, NULL, t2c_runloop, rv); @@ -910,10 +933,24 @@ void rv_delete(riscv_t *rv) block_map_destroy(rv); #else #if RV32_HAS(T2C) + /* Signal the thread to quit */ + pthread_mutex_lock(&rv->wait_queue_lock); rv->quit = true; + pthread_cond_signal(&rv->wait_queue_cond); + pthread_mutex_unlock(&rv->wait_queue_lock); + pthread_join(t2c_thread, NULL); + + /* Clean up any remaining entries in wait queue */ + queue_entry_t *entry, *safe; + list_for_each_entry_safe (entry, safe, &rv->wait_queue, list) { + list_del(&entry->list); + free(entry); + } + pthread_mutex_destroy(&rv->wait_queue_lock); pthread_mutex_destroy(&rv->cache_lock); + pthread_cond_destroy(&rv->wait_queue_cond); jit_cache_exit(rv->jit_cache); #endif jit_state_exit(rv->jit_state); diff --git a/src/riscv_private.h b/src/riscv_private.h index 12a3bfd0..89165011 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -105,7 +105,7 @@ typedef struct block { #if RV32_HAS(JIT) && RV32_HAS(T2C) typedef struct { - block_t *block; + uint64_t key; /**< cache key (PC or PC|SATP) to look up block */ struct list_head list; } queue_entry_t; #endif @@ -197,6 +197,7 @@ struct riscv_internal { #if RV32_HAS(T2C) struct list_head wait_queue; pthread_mutex_t wait_queue_lock, cache_lock; + pthread_cond_t wait_queue_cond; volatile bool quit; /**< Determine the main thread is terminated or 
not */ #endif void *jit_state; diff --git a/src/t2c.c b/src/t2c.c index 343b85e6..2115adaf 100644 --- a/src/t2c.c +++ b/src/t2c.c @@ -346,7 +346,9 @@ void t2c_compile(riscv_t *rv, block_t *block) jit_cache_update(rv->jit_cache, key, block->func); - block->hot2 = true; + /* Use release semantics to ensure func write is visible before hot2 is set + */ + __atomic_store_n(&block->hot2, true, __ATOMIC_RELEASE); } struct jit_cache *jit_cache_init() From e6f83646616a85ba01a48e69b7e19b975f0f91c4 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sun, 5 Oct 2025 16:02:50 +0800 Subject: [PATCH 2/7] Add Arm64 TSAN support and fix JIT cache coherency This commit adds ThreadSanitizer (TSAN) support for ARM64/Apple Silicon and fixes critical JIT instruction cache coherency issues. ARM64 TSAN Support: - Extended TSAN-compatible memory allocation to ARM64 architecture - Main memory allocated at fixed address 0x150000000000 (21TB) - JIT buffer allocated at 0x151000000000 with MAP_JIT for Apple Silicon - Both allocations avoid TSAN shadow memory and enable race detection - Note: Requires ASLR disabled on macOS (SIP restrictions may apply) JIT Cache Coherency Fixes: 1. Fixed pthread_jit_write_protect_np() ordering in update_branch_imm 2. Added sys_icache_invalidate() in update_branch_imm 3. Added cache invalidation in resolve_jumps() for x86_64 Fix JIT regalloc conflicts in memory load After reset_reg() clears the register allocator state, load instructions (lb/lh/lw/lbu/lhu) could reallocate the same host register for both the address and destination, causing data corruption. This commit uses map_vm_reg_reserved() to prevent reusing the address register. 
--- src/emulate.c | 19 ++++++++++++---- src/io.c | 24 +++++++++++++++----- src/jit.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++--- src/main.c | 2 +- src/rv32_jit.c | 10 ++++----- 5 files changed, 98 insertions(+), 18 deletions(-) diff --git a/src/emulate.c b/src/emulate.c index b97c9493..67d26d86 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -24,6 +24,7 @@ extern struct target_ops gdbstub_ops; #endif #include "decode.h" +#include "log.h" #include "mpool.h" #include "riscv.h" #include "riscv_private.h" @@ -1230,10 +1231,20 @@ void rv_step(void *arg) #endif ) { jit_translate(rv, block); - ((exec_block_func_t) state->buf)( - rv, (uintptr_t) (state->buf + block->offset)); - prev = NULL; - continue; + /* Only execute if translation succeeded (block is hot) */ + if (block->hot) { + rv_log_debug("JIT: Executing block pc=0x%08x, offset=%u", + block->pc_start, block->offset); + ((exec_block_func_t) state->buf)( + rv, (uintptr_t) (state->buf + block->offset)); + prev = NULL; + continue; + } + /* Fall through to interpreter if translation failed */ + rv_log_debug( + "JIT: Translation failed for block pc=0x%08x, using " + "interpreter", + block->pc_start); } set_reset(&pc_set); has_loops = false; diff --git a/src/io.c b/src/io.c index 1e5b73b9..975013ee 100644 --- a/src/io.c +++ b/src/io.c @@ -27,18 +27,32 @@ memory_t *memory_new(uint32_t size) return NULL; assert(mem); #if HAVE_MMAP -#if defined(TSAN_ENABLED) && defined(__x86_64__) +#if defined(TSAN_ENABLED) /* ThreadSanitizer compatibility: Use MAP_FIXED to allocate at a specific - * address within TSAN's app range (0x7cf000000000 - 0x7ffffffff000). + * address to avoid conflicts with TSAN's shadow memory. + */ +#if defined(__x86_64__) + /* x86_64: Allocate within TSAN's range (0x7cf000000000 - 0x7ffffffff000). 
* * Fixed address: 0x7d0000000000 * Size: up to 4GB (0x100000000) * End: 0x7d0100000000 (well within app range) - * - * This guarantees the allocation won't land in TSAN's shadow memory, - * preventing "unexpected memory mapping" errors. */ void *fixed_addr = (void *) 0x7d0000000000UL; +#elif defined(__aarch64__) + /* ARM64 (macOS/Apple Silicon): Use higher address range. + * + * Fixed address: 0x150000000000 (21TB) + * Size: up to 4GB (0x100000000) + * End: 0x150100000000 + * + * This avoids TSAN's shadow memory and typical process allocations. + * Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *fixed_addr = (void *) 0x150000000000UL; +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif data_memory_base = mmap(fixed_addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); if (data_memory_base == MAP_FAILED) { diff --git a/src/jit.c b/src/jit.c index 631b1554..5fa1f32f 100644 --- a/src/jit.c +++ b/src/jit.c @@ -42,6 +42,7 @@ #include "decode.h" #include "io.h" #include "jit.h" +#include "log.h" #include "riscv.h" #include "riscv_private.h" #include "utils.h" @@ -593,24 +594,30 @@ static void update_branch_imm(struct jit_state *state, assert((imm & 3) == 0); uint32_t insn; imm >>= 2; + rv_log_debug("JIT: Patching branch at offset=%u, imm=%d", offset, imm * 4); + /* Read instruction while in execute mode (MAP_JIT requirement) */ memcpy(&insn, state->buf + offset, sizeof(uint32_t)); if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */ || (insn & 0x7e000000U) == 0x34000000U) { /* Compare and branch immediate. */ assert((imm >> 19) == INT64_C(-1) || (imm >> 19) == 0); + insn &= ~(0x7ffffU << 5); /* Clear old offset bits */ insn |= (imm & 0x7ffff) << 5; } else if ((insn & 0x7c000000U) == 0x14000000U) { /* Unconditional branch immediate. 
*/ assert((imm >> 26) == INT64_C(-1) || (imm >> 26) == 0); + insn &= ~0x03ffffffU; /* Clear old offset bits */ insn |= (imm & 0x03ffffffU) << 0; } else { assert(false); insn = BAD_OPCODE; } #if defined(__APPLE__) && defined(__aarch64__) + /* Switch to write mode only for writing */ pthread_jit_write_protect_np(false); #endif memcpy(state->buf + offset, &insn, sizeof(uint32_t)); + sys_icache_invalidate(state->buf + offset, sizeof(uint32_t)); #if defined(__APPLE__) && defined(__aarch64__) pthread_jit_write_protect_np(true); #endif @@ -2164,9 +2171,12 @@ void clear_hot(block_t *block) static void code_cache_flush(struct jit_state *state, riscv_t *rv) { + rv_log_info("JIT: Flushing code cache (n_blocks=%d, n_jumps=%d, offset=%u)", + state->n_blocks, state->n_jumps, state->offset); should_flush = false; state->offset = state->org_size; state->n_blocks = 0; + state->n_jumps = 0; /* Reset jump count when flushing */ set_reset(&state->set); clear_cache_hot(rv->block_cache, (clear_func_t) clear_hot); #if RV32_HAS(T2C) @@ -2196,6 +2206,7 @@ static void translate(struct jit_state *state, riscv_t *rv, block_t *block) static void resolve_jumps(struct jit_state *state) { + rv_log_debug("JIT: Resolving %d jumps", state->n_jumps); for (int i = 0; i < state->n_jumps; i++) { struct jump jump = state->jumps[i]; int target_loc; @@ -2218,6 +2229,10 @@ static void resolve_jumps(struct jit_state *state) (if (jump.target_satp == state->offset_map[i].satp), ) { target_loc = state->offset_map[i].offset; + rv_log_debug( + "JIT: Jump %d resolved to block pc=0x%08x, " + "offset=%d", + i, jump.target_pc, target_loc); break; } } @@ -2229,6 +2244,7 @@ static void resolve_jumps(struct jit_state *state) uint8_t *offset_ptr = &state->buf[jump.offset_loc]; memcpy(offset_ptr, &rel, sizeof(uint32_t)); + sys_icache_invalidate(offset_ptr, sizeof(uint32_t)); #elif defined(__aarch64__) int32_t rel = target_loc - jump.offset_loc; update_branch_imm(state, jump.offset_loc, rel); @@ -2308,23 +2324,35 @@ void 
jit_translate(riscv_t *rv, block_t *block) ) { block->offset = state->offset_map[i].offset; block->hot = true; + rv_log_debug("JIT: Cache hit for block pc=0x%08x, offset=%u", + block->pc_start, block->offset); return; } } assert(NULL); __UNREACHABLE; } + rv_log_debug("JIT: Starting translation for block pc=0x%08x", + block->pc_start); restart: memset(state->jumps, 0, MAX_JUMPS * sizeof(struct jump)); state->n_jumps = 0; block->offset = state->offset; translate_chained_block(state, rv, block); if (unlikely(should_flush)) { + /* Mark block as not translated since translation was incomplete */ + block->hot = false; + /* Don't reset offset - it will be set correctly on restart */ + rv_log_debug("JIT: Translation triggered flush for block pc=0x%08x", + block->pc_start); code_cache_flush(state, rv); goto restart; } resolve_jumps(state); block->hot = true; + rv_log_debug( + "JIT: Translation completed for block pc=0x%08x, offset=%u, size=%u", + block->pc_start, block->offset, state->offset - block->offset); } struct jit_state *jit_state_init(size_t size) @@ -2336,10 +2364,12 @@ struct jit_state *jit_state_init(size_t size) state->offset = 0; state->size = size; -#if defined(TSAN_ENABLED) && defined(__x86_64__) +#if defined(TSAN_ENABLED) /* ThreadSanitizer compatibility: Allocate JIT code buffer at a fixed * address above the main memory region to avoid conflicts. 
- * + */ +#if defined(__x86_64__) + /* x86_64 memory layout: * Main memory: 0x7d0000000000 - 0x7d0100000000 (4GB for FULL4G) * JIT buffer: 0x7d1000000000 + size * @@ -2348,7 +2378,32 @@ struct jit_state *jit_state_init(size_t size) */ void *jit_addr = (void *) 0x7d1000000000UL; state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#elif defined(__aarch64__) + /* ARM64 memory layout (macOS/Apple Silicon): + * Main memory: 0x150000000000 - 0x150100000000 (4GB for FULL4G) + * JIT buffer: 0x151000000000 + size + * + * Apple Silicon requires MAP_JIT for executable memory. The fixed + * address is chosen to avoid TSAN's shadow memory and typical process + * allocations. Requires ASLR disabled via: setarch $(uname -m) -R + */ + void *jit_addr = (void *) 0x151000000000UL; + state->buf = mmap(jit_addr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); +#else +#error "TSAN is only supported on x86_64 and aarch64" +#endif if (state->buf == MAP_FAILED) { free(state); return NULL; diff --git a/src/main.c b/src/main.c index 45374bb1..a2f67d6d 100644 --- a/src/main.c +++ b/src/main.c @@ -304,7 +304,7 @@ int main(int argc, char **args) .args_offset_size = ARGS_OFFSET_SIZE, .argc = prog_argc, .argv = prog_args, - .log_level = LOG_TRACE, + .log_level = LOG_INFO, .run_flag = run_flag, .profile_output_file = prof_out_file, .cycle_per_step = CYCLE_PER_STEP, diff --git a/src/rv32_jit.c b/src/rv32_jit.c index 8e084f62..6ea9c8e9 100644 --- a/src/rv32_jit.c +++ b/src/rv32_jit.c @@ -180,7 +180,7 @@ GEN(lb, { emit_cmp_imm32(state, temp_reg, 0); uint32_t jump_loc_0 = state->offset; emit_jcc_offset(state, 0x84); - vm_reg[1] = map_vm_reg(state, ir->rd); + vm_reg[1] = map_vm_reg_reserved(state, ir->rd, vm_reg[0]); emit_load(state, S32, 
parameter_reg[0], vm_reg[1], offsetof(riscv_t, X) + 4 * ir->rd); @@ -232,7 +232,7 @@ GEN(lh, { emit_cmp_imm32(state, temp_reg, 0); uint32_t jump_loc_0 = state->offset; emit_jcc_offset(state, 0x84); - vm_reg[1] = map_vm_reg(state, ir->rd); + vm_reg[1] = map_vm_reg_reserved(state, ir->rd, vm_reg[0]); emit_load(state, S32, parameter_reg[0], vm_reg[1], offsetof(riscv_t, X) + 4 * ir->rd); @@ -284,7 +284,7 @@ GEN(lw, { emit_cmp_imm32(state, temp_reg, 0); uint32_t jump_loc_0 = state->offset; emit_jcc_offset(state, 0x84); - vm_reg[1] = map_vm_reg(state, ir->rd); + vm_reg[1] = map_vm_reg_reserved(state, ir->rd, vm_reg[0]); emit_load(state, S32, parameter_reg[0], vm_reg[1], offsetof(riscv_t, X) + 4 * ir->rd); @@ -336,7 +336,7 @@ GEN(lbu, { emit_cmp_imm32(state, temp_reg, 0); uint32_t jump_loc_0 = state->offset; emit_jcc_offset(state, 0x84); - vm_reg[1] = map_vm_reg(state, ir->rd); + vm_reg[1] = map_vm_reg_reserved(state, ir->rd, vm_reg[0]); emit_load(state, S32, parameter_reg[0], vm_reg[1], offsetof(riscv_t, X) + 4 * ir->rd); @@ -388,7 +388,7 @@ GEN(lhu, { emit_cmp_imm32(state, temp_reg, 0); uint32_t jump_loc_0 = state->offset; emit_jcc_offset(state, 0x84); - vm_reg[1] = map_vm_reg(state, ir->rd); + vm_reg[1] = map_vm_reg_reserved(state, ir->rd, vm_reg[0]); emit_load(state, S32, parameter_reg[0], vm_reg[1], offsetof(riscv_t, X) + 4 * ir->rd); From e362fba9529fae2131302728a958462b6a6d661b Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 8 Oct 2025 21:13:02 +0800 Subject: [PATCH 3/7] Detect early JIT compilation issues in CI/CD This commit introduces a comprehensive JIT debugging infrastructure to catch register allocation conflicts and cache coherency issues before they cause subtle runtime failures in production. 
--- .ci/jit-debug-test.sh | 54 ++++++++++++++++++++++ .github/workflows/main.yml | 8 ++++ Makefile | 5 ++ src/jit.c | 93 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+) create mode 100755 .ci/jit-debug-test.sh diff --git a/.ci/jit-debug-test.sh b/.ci/jit-debug-test.sh new file mode 100755 index 00000000..a3d36e26 --- /dev/null +++ b/.ci/jit-debug-test.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# JIT Debug Test Script +# This script tests JIT compiler with debug mode enabled to catch issues early + +set -e + +PARALLEL="${PARALLEL:--j$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)}" + +echo "======================================" +echo "JIT Debug Mode Test" +echo "======================================" + +# Test 1: Standard JIT with debug +echo "" +echo "Test 1: Building with ENABLE_JIT_DEBUG=1..." +make distclean +make ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 $PARALLEL + +echo "" +echo "Running basic tests with JIT debug..." +make ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 check + +# Test 2: JIT with EXT_C=0 and debug (regression test) +echo "" +echo "Test 2: Building with ENABLE_EXT_C=0 ENABLE_JIT_DEBUG=1..." +make distclean +make ENABLE_EXT_C=0 ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 $PARALLEL + +echo "" +echo "Running tests with EXT_C=0 and JIT debug..." +make ENABLE_EXT_C=0 ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 check + +# Test 3: JIT with various extension combinations +echo "" +echo "Test 3: Testing multiple JIT configurations with debug..." +for config in \ + "ENABLE_EXT_A=0" \ + "ENABLE_EXT_F=0" \ + "ENABLE_EXT_M=0" \ + "ENABLE_Zba=0" \ + "ENABLE_Zbb=0" +do + echo "" + echo "Testing: $config with JIT debug" + make distclean + make $config ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 $PARALLEL + make $config ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 check +done + +echo "" +echo "======================================" +echo "All JIT debug tests passed!" 
+echo "======================================" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b700ecdc..a07ccd42 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -474,6 +474,14 @@ jobs: make ENABLE_JIT=1 clean && make ENABLE_MOP_FUSION=0 ENABLE_JIT=1 check $PARALLEL make ENABLE_JIT=1 clean && make ENABLE_BLOCK_CHAINING=0 ENABLE_JIT=1 check $PARALLEL if: ${{ always() }} + - name: JIT debug test + env: + CC: ${{ steps.install_cc.outputs.cc }} + run: | + # Run JIT tests with debug mode to catch register allocation and cache coherency issues + make distclean && make ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 check $PARALLEL + make distclean && make ENABLE_EXT_C=0 ENABLE_JIT=1 ENABLE_JIT_DEBUG=1 check $PARALLEL + if: ${{ always() }} - name: undefined behavior test env: CC: ${{ steps.install_cc.outputs.cc }} diff --git a/Makefile b/Makefile index 5a34979b..b1640df1 100644 --- a/Makefile +++ b/Makefile @@ -302,6 +302,11 @@ ENABLE_JIT ?= 0 $(call set-feature, JIT) ifeq ($(call has, JIT), 1) OBJS_EXT += jit.o + # JIT debug mode for early issue detection in CI/CD + ENABLE_JIT_DEBUG ?= 0 + ifeq ("$(ENABLE_JIT_DEBUG)", "1") + CFLAGS += -DENABLE_JIT_DEBUG=1 + endif ENABLE_T2C ?= 1 $(call set-feature, T2C) ifeq ($(call has, T2C), 1) diff --git a/src/jit.c b/src/jit.c index 5fa1f32f..61ee6ee5 100644 --- a/src/jit.c +++ b/src/jit.c @@ -299,6 +299,89 @@ static inline void offset_map_insert(struct jit_state *state, block_t *block) __builtin___clear_cache((char *) (addr), (char *) (addr) + (size)); #endif +/* JIT debug helpers - enable with ENABLE_JIT_DEBUG=1 to detect issues early */ +#ifndef ENABLE_JIT_DEBUG +#define ENABLE_JIT_DEBUG 0 +#endif + +#if ENABLE_JIT_DEBUG +static void jit_dump_regmap(const char *ctx) +{ + rv_log_debug("JIT RegMap [%s]:", ctx); + for (int i = 0; i < n_host_regs; i++) { + if (register_map[i].vm_reg_idx >= 0) { + rv_log_debug(" Host R%d -> VM x%d (dirty=%d)", + register_map[i].reg_idx, register_map[i].vm_reg_idx, 
+ register_map[i].dirty); + } + } +} + +static void jit_check_regmap_conflict(int vm_reg, + int host_reg, + const char *insn) +{ + int found_idx = -1; + /* Check if VM register is already mapped */ + for (int i = 0; i < n_host_regs; i++) { + if (register_map[i].vm_reg_idx == vm_reg) { + if (found_idx >= 0 && found_idx != i) { + /* VM register mapped to multiple host registers */ + rv_log_error( + "JIT RegMap CONFLICT in %s: VM x%d mapped to " + "Host R%d (idx %d) and R%d (idx %d)", + insn, vm_reg, register_map[found_idx].reg_idx, found_idx, + register_map[i].reg_idx, i); + jit_dump_regmap("CONFLICT"); + assert(false); + } + found_idx = i; + /* Verify the found mapping is correct */ + if (register_map[i].reg_idx != host_reg) { + rv_log_error( + "JIT RegMap CONFLICT in %s: VM x%d expected at " + "Host R%d but found at R%d", + insn, vm_reg, host_reg, register_map[i].reg_idx); + jit_dump_regmap("CONFLICT"); + assert(false); + } + } else if (register_map[i].reg_idx == host_reg && + register_map[i].vm_reg_idx >= 0) { + /* Host register holds different VM register */ + rv_log_error( + "JIT RegMap CONFLICT in %s: Host R%d already holds " + "VM x%d, cannot map VM x%d", + insn, host_reg, register_map[i].vm_reg_idx, vm_reg); + jit_dump_regmap("CONFLICT"); + assert(false); + } + } +} + +static void jit_verify_cache_coherency(struct jit_state *state, uint32_t pc) + UNUSED; +static void jit_verify_cache_coherency(struct jit_state *state, uint32_t pc) +{ + /* On ARM64, verify instruction cache was properly invalidated */ +#if defined(__aarch64__) + if (state->offset > 0) { + rv_log_debug("JIT: Cache coherency check at PC=0x%08x, offset=%u", pc, + state->offset); + } +#endif +} +#else +#define jit_dump_regmap(ctx) \ + do { \ + } while (0) +#define jit_check_regmap_conflict(vm_reg, host_reg, insn) \ + do { \ + } while (0) +#define jit_verify_cache_coherency(state, pc) \ + do { \ + } while (0) +#endif + static bool should_flush = false; static void emit_bytes(struct jit_state 
*state, void *data, uint32_t len) { @@ -1890,6 +1973,7 @@ static inline int map_vm_reg(struct jit_state *state, int vm_reg_idx) save_reg(state, idx); unmap_vm_reg(idx); set_vm_reg(idx, vm_reg_idx); + jit_check_regmap_conflict(vm_reg_idx, target_reg, "map_vm_reg"); return target_reg; } @@ -1933,6 +2017,15 @@ static inline int map_vm_reg_reserved(struct jit_state *state, save_reg(state, idx); unmap_vm_reg(idx); set_vm_reg(idx, vm_reg_idx); + jit_check_regmap_conflict(vm_reg_idx, target_reg, "map_vm_reg_reserved"); + /* Additional check: ensure we didn't allocate the reserved register */ + if (target_reg == reserved_reg_idx) { + rv_log_error( + "JIT RegMap ERROR: map_vm_reg_reserved allocated reserved " + "register R%d for VM x%d", + reserved_reg_idx, vm_reg_idx); + assert(false); + } return target_reg; } From f616fa1b7714b958bb43dfc4298bdbd1ac29acaf Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 8 Oct 2025 22:14:58 +0800 Subject: [PATCH 4/7] Fix user-space emulation requiring ELF loader User-space emulation tests were failing because ENABLE_ELF_LOADER defaulted to 0, preventing ELF file loading. The fix automatically enables ELF_LOADER when SYSTEM=0, as user-space mode always requires it to load test ELF files. --- Makefile | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index b1640df1..5d5d0eb4 100644 --- a/Makefile +++ b/Makefile @@ -20,13 +20,6 @@ CFLAGS += -include src/common.h -Isrc/ OBJS_EXT := -# In the system test suite, the executable is an ELF file (e.g., MMU). -# However, the Linux kernel emulation includes the Image, DT, and -# root filesystem (rootfs). Therefore, the test suite needs this -# flag to load the ELF and differentiate it from the kernel emulation. 
-ENABLE_ELF_LOADER ?= 0 -$(call set-feature, ELF_LOADER) - # Enable MOP fusion, easier for ablation study ENABLE_MOP_FUSION ?= 1 $(call set-feature, MOP_FUSION) @@ -43,6 +36,18 @@ $(call set-feature, LOG_COLOR) ENABLE_SYSTEM ?= 0 $(call set-feature, SYSTEM) +# In the system test suite, the executable is an ELF file (e.g., MMU). +# However, the Linux kernel emulation includes the Image, DT, and +# root filesystem (rootfs). Therefore, the test suite needs this +# flag to load the ELF and differentiate it from the kernel emulation. +# User-space emulation (SYSTEM=0) always needs ELF loader. +ifeq ($(ENABLE_SYSTEM), 0) + override ENABLE_ELF_LOADER := 1 +else + ENABLE_ELF_LOADER ?= 0 +endif +$(call set-feature, ELF_LOADER) + ifeq ($(call has, SYSTEM), 1) OBJS_EXT += system.o endif From 169ca45476b272bed3b4aa9dce58c31fc4063780 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 8 Oct 2025 22:31:50 +0800 Subject: [PATCH 5/7] Fix CI coding-style check failures --- .ci/jit-debug-test.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.ci/jit-debug-test.sh b/.ci/jit-debug-test.sh index a3d36e26..964efdfc 100755 --- a/.ci/jit-debug-test.sh +++ b/.ci/jit-debug-test.sh @@ -5,7 +5,7 @@ set -e -PARALLEL="${PARALLEL:--j$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)}" +PARALLEL="${PARALLEL:--j$(nproc 2> /dev/null || sysctl -n hw.ncpu 2> /dev/null || echo 4)}" echo "======================================" echo "JIT Debug Mode Test" @@ -39,8 +39,7 @@ for config in \ "ENABLE_EXT_F=0" \ "ENABLE_EXT_M=0" \ "ENABLE_Zba=0" \ - "ENABLE_Zbb=0" -do + "ENABLE_Zbb=0"; do echo "" echo "Testing: $config with JIT debug" make distclean From 2b06e85ef9e49aab919099e73e5bb7fe5e3fac12 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Wed, 8 Oct 2025 22:31:50 +0800 Subject: [PATCH 6/7] Fix user-space emulation requiring ELF loader User-space emulation tests were failing because ENABLE_ELF_LOADER defaulted to 0, preventing ELF file loading. 
The fix automatically enables ELF_LOADER when SYSTEM=0, except for architecture tests which have their own binary loading mechanism. --- Makefile | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 5d5d0eb4..c3a5afa5 100644 --- a/Makefile +++ b/Makefile @@ -36,18 +36,6 @@ $(call set-feature, LOG_COLOR) ENABLE_SYSTEM ?= 0 $(call set-feature, SYSTEM) -# In the system test suite, the executable is an ELF file (e.g., MMU). -# However, the Linux kernel emulation includes the Image, DT, and -# root filesystem (rootfs). Therefore, the test suite needs this -# flag to load the ELF and differentiate it from the kernel emulation. -# User-space emulation (SYSTEM=0) always needs ELF loader. -ifeq ($(ENABLE_SYSTEM), 0) - override ENABLE_ELF_LOADER := 1 -else - ENABLE_ELF_LOADER ?= 0 -endif -$(call set-feature, ELF_LOADER) - ifeq ($(call has, SYSTEM), 1) OBJS_EXT += system.o endif @@ -85,6 +73,22 @@ endif ENABLE_ARCH_TEST ?= 0 $(call set-feature, ARCH_TEST) +# In the system test suite, the executable is an ELF file (e.g., MMU). +# However, the Linux kernel emulation includes the Image, DT, and +# root filesystem (rootfs). Therefore, the test suite needs this +# flag to load the ELF and differentiate it from the kernel emulation. +# User-space emulation (SYSTEM=0) always needs ELF loader, except for architecture tests. 
+ifeq ($(ENABLE_SYSTEM), 0) + ifneq ($(ENABLE_ARCH_TEST), 1) + override ENABLE_ELF_LOADER := 1 + else + ENABLE_ELF_LOADER ?= 0 + endif +else + ENABLE_ELF_LOADER ?= 0 +endif +$(call set-feature, ELF_LOADER) + # ThreadSanitizer support # TSAN on x86-64 memory layout: # Shadow: 0x02a000000000 - 0x7cefffffffff (reserved by TSAN) From fe1e52adcdb146d777d6887b52c9b1326d7f93c9 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Sat, 8 Nov 2025 00:25:05 +0800 Subject: [PATCH 7/7] Fix DTB compilation regression in default build The 'all' target unconditionally included $(BUILD_DTB) and $(BUILD_DTB2C), causing device tree compilation to run even for user-space builds where ENABLE_SYSTEM=0. This triggered dtc syntax errors because the required macros (INITRD_START, INITRD_END, MEM_START, MEM_END) are only defined for system mode. Since DTB_DEPS is already conditionally set to include these targets when needed (ENABLE_SYSTEM=1 and ENABLE_ELF_LOADER=0), the redundant explicit dependencies are removed. Fixes regression introduced in commit 2b06e85 where ELF_LOADER logic was reordered. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c3a5afa5..bf43ca6f 100644 --- a/Makefile +++ b/Makefile @@ -392,7 +392,7 @@ DTB_DEPS := $(BUILD_DTB) $(BUILD_DTB2C) endif endif -all: config $(DTB_DEPS) $(BUILD_DTB) $(BUILD_DTB2C) $(BIN) +all: config $(DTB_DEPS) $(BIN) OBJS := \ map.o \