From c7232c842dbdff26d6399ff9d4c44f1a3de418b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Doru=20Bl=C3=A2nzeanu?= Date: Fri, 13 Jun 2025 18:57:05 +0300 Subject: [PATCH] gdb: fix sandbox function cancellation when gdb enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - when the `gdb` features is enabled, the EINTR exit reason was always mapped to a Debug stop reason. With the new changes to the threading model, the EINTR exit reason can be a result of a cancel request. We need to first check if a cancel was requested, if not, it means a sandbox with debugging enabled was issued an interrupt request from the debugger - adds a way for a sandbox under debugging to remain attached to the debugger when a crash happens. This way the crash can be inspected. This forbids continuing execution and writing to memory and registers. Signed-off-by: Doru Blânzeanu --- docs/how-to-debug-a-hyperlight-guest.md | 5 + .../src/hypervisor/gdb/event_loop.rs | 4 + src/hyperlight_host/src/hypervisor/gdb/mod.rs | 2 + .../src/hypervisor/gdb/x86_64_target.rs | 48 +++++ .../src/hypervisor/hyperv_linux.rs | 162 ++++++++++++++--- src/hyperlight_host/src/hypervisor/kvm.rs | 171 ++++++++++++++---- src/hyperlight_host/src/hypervisor/mod.rs | 13 ++ 7 files changed, 347 insertions(+), 58 deletions(-) diff --git a/docs/how-to-debug-a-hyperlight-guest.md b/docs/how-to-debug-a-hyperlight-guest.md index f3a565abc..6707df518 100644 --- a/docs/how-to-debug-a-hyperlight-guest.md +++ b/docs/how-to-debug-a-hyperlight-guest.md @@ -14,6 +14,7 @@ The Hyperlight `gdb` feature enables **KVM** and **MSHV** guest debugging to: - read and write addresses - step/continue - get code offset from target + - stop when a crash occurs and only allow read access to the guest memory and registers ## Expected behavior @@ -32,6 +33,10 @@ session of a guest binary running inside a Hyperlight sandbox on Linux. - if two sandbox instances are created with the same debug port, the second instance logs an error and the gdb thread will not be created, but the sandbox will continue to run without gdb debugging +- when a crash happens, the debugger session remains active, and the guest + vCPU is stopped, allowing the gdb client to inspect the state of the guest. + The debug target will refuse any resume, step actions and write operations to + the guest memory and registers until the gdb client disconnects or the sandbox is stopped. ## Example diff --git a/src/hyperlight_host/src/hypervisor/gdb/event_loop.rs b/src/hyperlight_host/src/hypervisor/gdb/event_loop.rs index 160902464..c39f7c06e 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/event_loop.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/event_loop.rs @@ -58,6 +58,10 @@ impl run_blocking::BlockingEventLoop for GdbBlockingEventLoop { tid: (), signal: Signal(SIGRTMIN() as u8), }, + VcpuStopReason::Crash => BaseStopReason::SignalWithThread { + tid: (), + signal: Signal(11), + }, VcpuStopReason::Unknown => { log::warn!("Unknown stop reason received"); diff --git a/src/hyperlight_host/src/hypervisor/gdb/mod.rs b/src/hyperlight_host/src/hypervisor/gdb/mod.rs index 6470c3bbf..47af50b40 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/mod.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/mod.rs @@ -108,6 +108,7 @@ pub(crate) struct X86_64Regs { /// Defines the possible reasons for which a vCPU can be stopped when debugging #[derive(Debug)] pub enum VcpuStopReason { + Crash, DoneStep, /// Hardware breakpoint inserted by the hypervisor so the guest can be stopped /// at the entry point. This is used to avoid the guest from executing @@ -145,6 +146,7 @@ pub(crate) enum DebugResponse { DisableDebug, ErrorOccurred, GetCodeSectionOffset(u64), + NotAllowed, ReadAddr(Vec), ReadRegisters(X86_64Regs), RemoveHwBreakpoint(bool), diff --git a/src/hyperlight_host/src/hypervisor/gdb/x86_64_target.rs b/src/hyperlight_host/src/hypervisor/gdb/x86_64_target.rs index 8fb5d4beb..0c48f84e9 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/x86_64_target.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/x86_64_target.rs @@ -77,6 +77,12 @@ impl HyperlightSandboxTarget { match self.send_command(DebugMsg::Continue)? { DebugResponse::Continue => Ok(()), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Ok(()) + } msg => { log::error!("Unexpected message received: {:?}", msg); Err(GdbTargetError::UnexpectedMessage) @@ -164,6 +170,12 @@ impl SingleThreadBase for HyperlightSandboxTarget { match self.send_command(DebugMsg::WriteAddr(gva, v))? { DebugResponse::WriteAddr => Ok(()), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Ok(()) + } DebugResponse::ErrorOccurred => { log::error!("Error occurred"); Err(TargetError::NonFatal) @@ -245,6 +257,12 @@ impl SingleThreadBase for HyperlightSandboxTarget { match self.send_command(DebugMsg::WriteRegisters(regs))? { DebugResponse::WriteRegisters => Ok(()), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Ok(()) + } DebugResponse::ErrorOccurred => { log::error!("Error occurred"); Err(TargetError::NonFatal) @@ -301,6 +319,12 @@ impl HwBreakpoint for HyperlightSandboxTarget { match self.send_command(DebugMsg::AddHwBreakpoint(addr))? { DebugResponse::AddHwBreakpoint(rsp) => Ok(rsp), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Err(TargetError::NonFatal) + } DebugResponse::ErrorOccurred => { log::error!("Error occurred"); Err(TargetError::NonFatal) @@ -321,6 +345,12 @@ impl HwBreakpoint for HyperlightSandboxTarget { match self.send_command(DebugMsg::RemoveHwBreakpoint(addr))? { DebugResponse::RemoveHwBreakpoint(rsp) => Ok(rsp), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Err(TargetError::NonFatal) + } DebugResponse::ErrorOccurred => { log::error!("Error occurred"); Err(TargetError::NonFatal) @@ -343,6 +373,12 @@ impl SwBreakpoint for HyperlightSandboxTarget { match self.send_command(DebugMsg::AddSwBreakpoint(addr))? { DebugResponse::AddSwBreakpoint(rsp) => Ok(rsp), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Err(TargetError::NonFatal) + } DebugResponse::ErrorOccurred => { log::error!("Error occurred"); Err(TargetError::NonFatal) @@ -363,6 +399,12 @@ impl SwBreakpoint for HyperlightSandboxTarget { match self.send_command(DebugMsg::RemoveSwBreakpoint(addr))? { DebugResponse::RemoveSwBreakpoint(rsp) => Ok(rsp), + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Err(TargetError::NonFatal) + } DebugResponse::ErrorOccurred => { log::error!("Error occurred"); Err(TargetError::NonFatal) @@ -398,6 +440,12 @@ impl SingleThreadSingleStep for HyperlightSandboxTarget { log::error!("Error occurred"); Err(GdbTargetError::UnexpectedError) } + DebugResponse::NotAllowed => { + log::error!("Action not allowed at this time, crash might have occurred"); + // This is a consequence of the target crashing or being in an invalid state + // we cannot continue execution, but we can still read registers and memory + Ok(()) + } msg => { log::error!("Unexpected message received: {:?}", msg); Err(GdbTargetError::UnexpectedMessage) diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index e7f34d171..3919e37b8 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -55,7 +55,9 @@ use {super::crashdump, std::path::Path}; use super::fpu::{FP_CONTROL_WORD_DEFAULT, FP_TAG_WORD_DEFAULT, MXCSR_DEFAULT}; #[cfg(gdb)] -use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, MshvDebug}; +use super::gdb::{ + DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, MshvDebug, VcpuStopReason, +}; #[cfg(gdb)] use super::handlers::DbgMemAccessHandlerWrapper; use super::handlers::{MemAccessHandlerWrapper, OutBHandlerWrapper}; @@ -749,6 +751,25 @@ impl Hypervisor for HypervLinuxDriver { .store(false, Ordering::Relaxed); HyperlightExit::Cancelled() } else { + // In case of the gdb feature, if no cancellation was requested, + // and the debugging is enabled it means the vCPU was stopped because + // of an interrupt coming from the debugger thread + #[cfg(gdb)] + if self.debug.is_some() { + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + // NOTE: There is a chance that the vCPU was stopped because of a stale + // signal that was meant to be delivered to a previous/other vCPU on this + // same thread, however, we cannot distinguish between the two cases, so + // we assume that the vCPU was stopped because of an interrupt. + // This is fine, because the debugger will be notified about an interrupt + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else { + HyperlightExit::Retry() + } + + #[cfg(not(gdb))] HyperlightExit::Retry() } } @@ -835,39 +856,130 @@ impl Hypervisor for HypervLinuxDriver { dbg_mem_access_fn: std::sync::Arc< std::sync::Mutex, >, - stop_reason: super::gdb::VcpuStopReason, + stop_reason: VcpuStopReason, ) -> Result<()> { - self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) - .map_err(|e| new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e))?; + if self.debug.is_none() { + return Err(new_error!("Debugging is not enabled")); + } - loop { - log::debug!("Debug wait for event to resume vCPU"); + match stop_reason { + // If the vCPU stopped because of a crash, we need to handle it differently + // We do not want to allow resuming execution or placing breakpoints + // because the guest has crashed. + // We only allow reading registers and memory + VcpuStopReason::Crash => { + self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) + .map_err(|e| { + new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e) + })?; + + loop { + log::debug!("Debug wait for event to resume vCPU"); + // Wait for a message from gdb + let req = self.recv_dbg_msg()?; + + // Flag to store if we should deny continue or step requests + let mut deny_continue = false; + // Flag to store if we should detach from the gdb session + let mut detach = false; + + let response = match req { + // Allow the detach request to disable debugging by continuing resuming + // hypervisor crash error reporting + DebugMsg::DisableDebug => { + detach = true; + DebugResponse::DisableDebug + } + // Do not allow continue or step requests + DebugMsg::Continue | DebugMsg::Step => { + deny_continue = true; + DebugResponse::NotAllowed + } + // Do not allow adding/removing breakpoints and writing to memory or registers + DebugMsg::AddHwBreakpoint(_) + | DebugMsg::AddSwBreakpoint(_) + | DebugMsg::RemoveHwBreakpoint(_) + | DebugMsg::RemoveSwBreakpoint(_) + | DebugMsg::WriteAddr(_, _) + | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed, + + // For all other requests, we will process them normally + _ => { + let result = self.process_dbg_request(req, dbg_mem_access_fn.clone()); + match result { + Ok(response) => response, + Err(HyperlightError::TranslateGuestAddress(_)) => { + // Treat non fatal errors separately so the guest doesn't fail + DebugResponse::ErrorOccurred + } + Err(e) => { + log::error!("Error processing debug request: {:?}", e); + return Err(e); + } + } + } + }; - // Wait for a message from gdb - let req = self.recv_dbg_msg()?; + // Send the response to the request back to gdb + self.send_dbg_msg(response) + .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; - let result = self.process_dbg_request(req, dbg_mem_access_fn.clone()); + // If we are denying continue or step requests, the debugger assumes the + // execution started so we need to report a stop reason as a crash and let + // it request to read registers/memory to figure out what happened + if deny_continue { + self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash)) + .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; + } - let response = match result { - Ok(response) => response, - // Treat non fatal errors separately so the guest doesn't fail - Err(HyperlightError::TranslateGuestAddress(_)) => DebugResponse::ErrorOccurred, - Err(e) => { - return Err(e); + // If we are detaching, we will break the loop and the Hypervisor will continue + // to handle the Crash reason + if detach { + break; + } } - }; + } + // If the vCPU stopped because of any other reason except a crash, we can handle it + // normally + _ => { + // Send the stop reason to the gdb thread + self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) + .map_err(|e| { + new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e) + })?; + + loop { + log::debug!("Debug wait for event to resume vCPU"); + // Wait for a message from gdb + let req = self.recv_dbg_msg()?; + + let result = self.process_dbg_request(req, dbg_mem_access_fn.clone()); + + let response = match result { + Ok(response) => response, + // Treat non fatal errors separately so the guest doesn't fail + Err(HyperlightError::TranslateGuestAddress(_)) => { + DebugResponse::ErrorOccurred + } + Err(e) => { + return Err(e); + } + }; - // If the command was either step or continue, we need to run the vcpu - let cont = matches!( - response, - DebugResponse::Step | DebugResponse::Continue | DebugResponse::DisableDebug - ); + let cont = matches!( + response, + DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug + ); - self.send_dbg_msg(response) - .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; + self.send_dbg_msg(response) + .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; - if cont { - break; + // Check if we should continue execution + // We continue if the response is one of the following: Step, Continue, or DisableDebug + if cont { + break; + } + } } } diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 5fb772764..1e8ce1eee 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -636,13 +636,7 @@ impl Hypervisor for KVMDriver { } }, Err(e) => match e.errno() { - // In case of the gdb feature, the timeout is not enabled, this - // exit is because of a signal sent from the gdb thread to the - // hypervisor thread to cancel execution - #[cfg(gdb)] - libc::EINTR => HyperlightExit::Debug(VcpuStopReason::Interrupt), // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled - #[cfg(not(gdb))] libc::EINTR => { // If cancellation was not requested for this specific vm, the vcpu was interrupted because of stale signal // that was meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it @@ -652,6 +646,25 @@ impl Hypervisor for KVMDriver { .store(false, Ordering::Relaxed); HyperlightExit::Cancelled() } else { + // In case of the gdb feature, if no cancellation was requested, + // and the debugging is enabled it means the vCPU was stopped because + // of an interrupt coming from the debugger thread + // NOTE: There is a chance that the vCPU was stopped because of a stale + // signal that was meant to be delivered to a previous/other vCPU on this + // same thread, however, we cannot distinguish between the two cases, so + // we assume that the vCPU was stopped because of an interrupt. + // This is fine, because the debugger will be notified about an interrupt + #[cfg(gdb)] + if self.debug.is_some() { + // If the vCPU was stopped because of an interrupt, we need to + // return a special exit reason so that the gdb thread can handle it + // and resume execution + HyperlightExit::Debug(VcpuStopReason::Interrupt) + } else { + HyperlightExit::Retry() + } + + #[cfg(not(gdb))] HyperlightExit::Retry() } } @@ -749,36 +762,128 @@ impl Hypervisor for KVMDriver { dbg_mem_access_fn: Arc>, stop_reason: VcpuStopReason, ) -> Result<()> { - self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) - .map_err(|e| new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e))?; - - loop { - log::debug!("Debug wait for event to resume vCPU"); - // Wait for a message from gdb - let req = self.recv_dbg_msg()?; + if self.debug.is_none() { + return Err(new_error!("Debugging is not enabled")); + } - let result = self.process_dbg_request(req, dbg_mem_access_fn.clone()); + match stop_reason { + // If the vCPU stopped because of a crash, we need to handle it differently + // We do not want to allow resuming execution or placing breakpoints + // because the guest has crashed. + // We only allow reading registers and memory + VcpuStopReason::Crash => { + self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) + .map_err(|e| { + new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e) + })?; + + loop { + log::debug!("Debug wait for event to resume vCPU"); + // Wait for a message from gdb + let req = self.recv_dbg_msg()?; + + // Flag to store if we should deny continue or step requests + let mut deny_continue = false; + // Flag to store if we should detach from the gdb session + let mut detach = false; + + let response = match req { + // Allow the detach request to disable debugging by continuing resuming + // hypervisor crash error reporting + DebugMsg::DisableDebug => { + detach = true; + DebugResponse::DisableDebug + } + // Do not allow continue or step requests + DebugMsg::Continue | DebugMsg::Step => { + deny_continue = true; + DebugResponse::NotAllowed + } + // Do not allow adding/removing breakpoints and writing to memory or registers + DebugMsg::AddHwBreakpoint(_) + | DebugMsg::AddSwBreakpoint(_) + | DebugMsg::RemoveHwBreakpoint(_) + | DebugMsg::RemoveSwBreakpoint(_) + | DebugMsg::WriteAddr(_, _) + | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed, + + // For all other requests, we will process them normally + _ => { + let result = self.process_dbg_request(req, dbg_mem_access_fn.clone()); + match result { + Ok(response) => response, + Err(HyperlightError::TranslateGuestAddress(_)) => { + // Treat non fatal errors separately so the guest doesn't fail + DebugResponse::ErrorOccurred + } + Err(e) => { + log::error!("Error processing debug request: {:?}", e); + return Err(e); + } + } + } + }; + + // Send the response to the request back to gdb + self.send_dbg_msg(response) + .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; + + // If we are denying continue or step requests, the debugger assumes the + // execution started so we need to report a stop reason as a crash and let + // it request to read registers/memory to figure out what happened + if deny_continue { + self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash)) + .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; + } - let response = match result { - Ok(response) => response, - // Treat non fatal errors separately so the guest doesn't fail - Err(HyperlightError::TranslateGuestAddress(_)) => DebugResponse::ErrorOccurred, - Err(e) => { - return Err(e); + // If we are detaching, we will break the loop and the Hypervisor will continue + // to handle the Crash reason + if detach { + break; + } + } + } + // If the vCPU stopped because of any other reason except a crash, we can handle it + // normally + _ => { + // Send the stop reason to the gdb thread + self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) + .map_err(|e| { + new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e) + })?; + + loop { + log::debug!("Debug wait for event to resume vCPU"); + // Wait for a message from gdb + let req = self.recv_dbg_msg()?; + + let result = self.process_dbg_request(req, dbg_mem_access_fn.clone()); + + let response = match result { + Ok(response) => response, + // Treat non fatal errors separately so the guest doesn't fail + Err(HyperlightError::TranslateGuestAddress(_)) => { + DebugResponse::ErrorOccurred + } + Err(e) => { + return Err(e); + } + }; + + let cont = matches!( + response, + DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug + ); + + self.send_dbg_msg(response) + .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; + + // Check if we should continue execution + // We continue if the response is one of the following: Step, Continue, or DisableDebug + if cont { + break; + } } - }; - - // If the command was either step or continue, we need to run the vcpu - let cont = matches!( - response, - DebugResponse::Step | DebugResponse::Continue | DebugResponse::DisableDebug - ); - - self.send_dbg_msg(response) - .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?; - - if cont { - break; } } diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index bc6baa7cf..6eed878fa 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -283,6 +283,11 @@ impl VirtualCPU { #[cfg(crashdump)] crashdump::generate_crashdump(hv)?; + // If GDB is enabled, we handle the debug memory access + // Disregard return value as we want to return the error + #[cfg(gdb)] + let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash); + if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) { return Err(HyperlightError::StackOverflow()); } @@ -301,6 +306,10 @@ impl VirtualCPU { Ok(HyperlightExit::Unknown(reason)) => { #[cfg(crashdump)] crashdump::generate_crashdump(hv)?; + // If GDB is enabled, we handle the debug memory access + // Disregard return value as we want to return the error + #[cfg(gdb)] + let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash); log_then_return!("Unexpected VM Exit {:?}", reason); } @@ -308,6 +317,10 @@ impl VirtualCPU { Err(e) => { #[cfg(crashdump)] crashdump::generate_crashdump(hv)?; + // If GDB is enabled, we handle the debug memory access + // Disregard return value as we want to return the error + #[cfg(gdb)] + let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash); return Err(e); }