From f1c709b233345f4ad2ec05b9e126028ee9595bff Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 23 May 2025 15:09:41 -0400 Subject: [PATCH 1/9] check whether zone exists before starting --- .../src/reconciler_task/zones.rs | 183 +++++++++++++++--- 1 file changed, 152 insertions(+), 31 deletions(-) diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 0bf14582d76..91c940f6b6a 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -20,6 +20,7 @@ use illumos_utils::zone::AdmError; use illumos_utils::zone::Api as _; use illumos_utils::zone::DeleteAddressError; use illumos_utils::zone::Zones; +use illumos_utils::zpool::ZpoolName; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneType; @@ -32,6 +33,7 @@ use slog::Logger; use slog::info; use slog::warn; use slog_error_chain::InlineErrorChain; +use std::borrow::Cow; use std::collections::BTreeMap; use std::net::IpAddr; use std::net::Ipv6Addr; @@ -248,6 +250,29 @@ impl OmicronZones { is_time_synchronized: bool, all_u2_pools: &CurrentlyManagedZpools, log: &Logger, + ) { + self.start_zones_if_needed_impl( + desired_zones, + sled_agent_facilities, + &RealZoneFacilities, + is_time_synchronized, + all_u2_pools, + log, + ) + .await + } + + async fn start_zones_if_needed_impl< + T: SledAgentFacilities, + U: ZoneFacilities, + >( + &mut self, + desired_zones: &IdMap, + sled_agent_facilities: &T, + zone_facilities: &U, + is_time_synchronized: bool, + all_u2_pools: &CurrentlyManagedZpools, + log: &Logger, ) { // Filter desired zones down to just those that we need to start. See // [`ZoneState`] for more discussion of why we're willing (or unwilling) @@ -295,19 +320,14 @@ impl OmicronZones { // Build up the futures for starting each zone. let all_u2_pools = all_u2_pools.clone().into_vec(); let start_futures = zones_to_start.map(|zone| { - sled_agent_facilities - .start_omicron_zone( - zone, - &self.mount_config, - is_time_synchronized, - &all_u2_pools, - ) - .map(move |result| { - ( - zone.clone(), - result.map_err(ZoneStartError::SledAgentStartFailed), - ) - }) + self.start_single_zone( + zone, + sled_agent_facilities, + zone_facilities, + is_time_synchronized, + &all_u2_pools, + ) + .map(move |result| (zone.clone(), result)) }); // Concurrently start all zones, then record the results. @@ -321,6 +341,45 @@ impl OmicronZones { } } + async fn start_single_zone( + &self, + zone: &OmicronZoneConfig, + sled_agent_facilities: &T, + zone_facilities: &U, + is_time_synchronized: bool, + all_u2_pools: &[ZpoolName], + ) -> Result { + // Ensure no zone by this name exists. This should only happen in the + // event of a sled-agent restart, in which case all the zones the + // previous sled-agent process had started are still running. + self.ensure_removed_before_starting(zone, zone_facilities).await?; + + sled_agent_facilities + .start_omicron_zone( + zone, + &self.mount_config, + is_time_synchronized, + all_u2_pools, + ) + .await + .map_err(ZoneStartError::SledAgentStartFailed) + } + + async fn ensure_removed_before_starting( + &self, + zone: &OmicronZoneConfig, + zone_facilities: &U, + ) -> Result<(), ZoneStartError> { + let zone_name = ZoneName::new(zone); + + // If no zone by this name exists, there's nothing to remove. 
+ if !zone_facilities.zone_with_name_exists(&zone_name).await? { + return Ok(()); + } + + Ok(()) + } + /// Check the timesync status from a running NTP zone (if it exists) pub(super) async fn check_timesync(&self) -> TimeSyncStatus { match &self.timesync_config { @@ -551,7 +610,7 @@ impl OmicronZone { self.resume_shutdown_from_stop( sled_agent_facilities, zone_facilities, - running_zone, + &ZoneName::from(running_zone.name()), log, ) .await @@ -564,17 +623,17 @@ impl OmicronZone { &self, sled_agent_facilities: &T, zone_facilities: &U, - running_zone: &Arc, + zone_name: &ZoneName<'_>, log: &Logger, ) -> Result<(), (PartiallyShutDownState, ZoneShutdownError)> { - if let Err(err) = zone_facilities.halt_zone(running_zone, log).await { + if let Err(err) = zone_facilities.halt_zone(zone_name, log).await { warn!( log, "Failed to stop running zone"; InlineErrorChain::new(&err), ); return Err(( - PartiallyShutDownState::FailedToStop(Arc::clone(running_zone)), + PartiallyShutDownState::FailedToStop(zone_name.to_static()), err, )); } @@ -688,19 +747,54 @@ enum ZoneState { FailedToStart(ZoneStartError), } +#[derive(Debug, Clone)] +struct ZoneName<'a>(Cow<'a, str>); + +impl<'a> From<&'a str> for ZoneName<'a> { + fn from(value: &'a str) -> Self { + Self(Cow::Borrowed(value)) + } +} + +impl ZoneName<'_> { + fn new(config: &OmicronZoneConfig) -> Self { + Self(Cow::Owned(config.zone_name())) + } + + fn to_string(&self) -> String { + self.0.clone().into_owned() + } + + fn to_static(&self) -> ZoneName<'static> { + ZoneName(Cow::Owned(self.0.clone().into_owned())) + } +} + #[derive(Debug)] enum PartiallyShutDownState { FailedToUntrackMetrics(Arc), - FailedToStop(Arc), + FailedToStop(ZoneName<'static>), FailedToCleanUp, } #[derive(Debug, thiserror::Error)] enum ZoneStartError { + #[error("could not determine whether zone already exists")] + CheckZoneExists(#[from] CheckZoneExistsError), #[error("sled agent failed to start service")] SledAgentStartFailed(#[source] anyhow::Error), } +#[derive(Debug, thiserror::Error)] +enum CheckZoneExistsError { + #[error("failed to find zone {name}")] + FindByName { + name: String, + #[source] + err: AdmError, + }, +} + #[derive(Debug, thiserror::Error)] enum ZoneShutdownError { #[error("failed to untrack metrics")] @@ -712,9 +806,14 @@ enum ZoneShutdownError { } trait ZoneFacilities { + async fn zone_with_name_exists( + &self, + name: &ZoneName<'_>, + ) -> Result; + async fn halt_zone( &self, - zone: &RunningZone, + zone: &ZoneName, log: &Logger, ) -> Result<(), ZoneShutdownError>; @@ -727,20 +826,32 @@ trait ZoneFacilities { struct RealZoneFacilities; impl ZoneFacilities for RealZoneFacilities { + async fn zone_with_name_exists( + &self, + name: &ZoneName<'_>, + ) -> Result { + match Zones::real_api().find(&name.0).await { + Ok(maybe_zone) => Ok(maybe_zone.is_some()), + Err(err) => Err(CheckZoneExistsError::FindByName { + name: name.to_string(), + err, + }), + } + } + async fn halt_zone( &self, - zone: &RunningZone, + zone: &ZoneName<'_>, log: &Logger, ) -> Result<(), ZoneShutdownError> { - // We don't use `zone.stop()` here because it doesn't allow repeated - // attempts after a failure: - // https://github.com/oxidecomputer/omicron/issues/7881. Instead, use - // the lower-level `Zones::halt_and_remove_logged()` function directly. - // This may leave our `RunningZone` is a bogus state where it still - // holds a `zoneid_t` that doesn't exist anymore, but if we're in the - // shutdown path we never use that `zoneid_t`. 
+ // We don't use `RunningZone::stop()` here because it doesn't allow + // repeated attempts after a failure + // (https://github.com/oxidecomputer/omicron/issues/7881) and because in + // the case of "an unexpected zone is running", all we have is the name. + // Instead, use the lower-level `Zones::halt_and_remove_logged()` + // function directly. Zones::real_api() - .halt_and_remove_logged(log, zone.name()) + .halt_and_remove_logged(log, &zone.0) .await .map_err(ZoneShutdownError::HaltAndRemove) } @@ -845,9 +956,16 @@ mod tests { } impl ZoneFacilities for FakeZoneFacilities { + async fn zone_with_name_exists( + &self, + _name: &ZoneName<'_>, + ) -> Result { + Ok(false) + } + async fn halt_zone( &self, - _zone: &RunningZone, + _zone: &ZoneName<'_>, _log: &Logger, ) -> Result<(), ZoneShutdownError> { // If a test has called `push_halt_response`, respsect that; @@ -1064,15 +1182,17 @@ mod tests { // Configure our fake sled-agent to fail to start a zone. let sled_agent_facilities = FakeSledAgentFacilities::default(); sled_agent_facilities.push_start_response(Err(anyhow!("test-boom"))); + let zone_facilities = FakeZoneFacilities::default(); // Starting with no zones, we should try and fail to start the one zone // in `desired_zones`. let mut zones = OmicronZones::new(nonexistent_mount_config(), TimeSyncConfig::Skip); zones - .start_zones_if_needed( + .start_zones_if_needed_impl( &desired_zones, &sled_agent_facilities, + &zone_facilities, true, ¤tly_managed_zpools, &logctx.log, @@ -1104,9 +1224,10 @@ mod tests { // Starting from the "zone failed to start" state, we should try again // to start the zone (and succeed this time). zones - .start_zones_if_needed( + .start_zones_if_needed_impl( &desired_zones, &sled_agent_facilities, + &zone_facilities, true, ¤tly_managed_zpools, &logctx.log, From d4a58b8058c9b14d8e89d58bd93186e76abd4557 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 23 May 2025 16:11:31 -0400 Subject: [PATCH 2/9] remove preexisting running zones in start zone path --- .../src/reconciler_task/zones.rs | 207 ++++++++++++------ 1 file changed, 135 insertions(+), 72 deletions(-) diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 91c940f6b6a..0a17ad659a9 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -326,6 +326,7 @@ impl OmicronZones { zone_facilities, is_time_synchronized, &all_u2_pools, + log, ) .map(move |result| (zone.clone(), result)) }); @@ -334,7 +335,7 @@ impl OmicronZones { let start_results = future::join_all(start_futures).await; for (config, result) in start_results { let state = match result { - Ok(running_zone) => ZoneState::Running(Arc::new(running_zone)), + Ok(state) => state, Err(err) => ZoneState::FailedToStart(err), }; self.zones.insert(OmicronZone { config, state }); @@ -348,13 +349,25 @@ impl OmicronZones { zone_facilities: &U, is_time_synchronized: bool, all_u2_pools: &[ZpoolName], - ) -> Result { + log: &Logger, + ) -> Result { // Ensure no zone by this name exists. This should only happen in the // event of a sled-agent restart, in which case all the zones the // previous sled-agent process had started are still running. - self.ensure_removed_before_starting(zone, zone_facilities).await?; + if let Some(state) = self + .ensure_removed_before_starting( + zone, + sled_agent_facilities, + zone_facilities, + log, + ) + .await? 
+ { + return Ok(state); + } - sled_agent_facilities + // The zone is not running - start it. + match sled_agent_facilities .start_omicron_zone( zone, &self.mount_config, @@ -362,22 +375,70 @@ impl OmicronZones { all_u2_pools, ) .await - .map_err(ZoneStartError::SledAgentStartFailed) + { + Ok(running_zone) => Ok(ZoneState::Running(Arc::new(running_zone))), + Err(err) => Err(ZoneStartError::SledAgentStartFailed(err)), + } } - async fn ensure_removed_before_starting( + // The return type of this function is strange. The possible values are: + // + // * `Ok(None)` - the zone is not running + // * `Err(_)` - we had an error related to zone startup + // * `Ok(Some(state))` - the zone is still running and is in some state that + // we need to do more work to handle (e.g., we found a running zone but + // failed to shut it down cleanly, in which case we'll return + // `Ok(Some(ZoneState::PartiallyShutDown { .. }))`). In this case, our + // caller should do no further work to try to start `zone`, and should + // instead bubble the `state` up to be recorded. + async fn ensure_removed_before_starting< + T: SledAgentFacilities, + U: ZoneFacilities, + >( &self, zone: &OmicronZoneConfig, + sled_agent_facilities: &T, zone_facilities: &U, - ) -> Result<(), ZoneStartError> { + log: &Logger, + ) -> Result, ZoneStartError> { let zone_name = ZoneName::new(zone); // If no zone by this name exists, there's nothing to remove. if !zone_facilities.zone_with_name_exists(&zone_name).await? { - return Ok(()); + return Ok(None); } - Ok(()) + // NOTE: We might want to tell the sled-agent's metrics task to stop + // tracking any links in this zone. However, we don't have very easy + // access to them, without running a command in the zone. These links + // are about to be deleted, and the metrics task will expire them after + // a while anyway, but it might be worth the trouble to do that in the + // future. + // + // Skipping that for now, follow the normal zone shutdown process + // _after_ metrics (i.e., shut down and clean up the zone). + // + // TODO-correctness There's a (very unlikely?) chance that this cleanup + // isn't right: if the running zone (which we have no active knowledge + // of) was started with a different `OmicronZoneConfig`, the cleanup + // steps we do here might not be right. + match resume_shutdown_from_stop( + zone, + sled_agent_facilities, + zone_facilities, + &zone_name, + log, + ) + .await + { + Ok(()) => Ok(None), + Err((state, err)) => { + // We didn't fail to _start_ the zone, so it doesn't make sense + // to return a `ZoneStartError`, but the zone is in a state that + // we need to remember. + Ok(Some(ZoneState::PartiallyShutDown { state, err })) + } + } } /// Check the timesync status from a running NTP zone (if it exists) @@ -545,7 +606,8 @@ impl OmicronZone { state: PartiallyShutDownState::FailedToStop(running_zone), .. } => { - self.resume_shutdown_from_stop( + resume_shutdown_from_stop( + &self.config, sled_agent_facilities, zone_facilities, running_zone, @@ -557,7 +619,8 @@ impl OmicronZone { state: PartiallyShutDownState::FailedToCleanUp, .. 
} => { - self.resume_shutdown_from_cleanup( + resume_shutdown_from_cleanup( + &self.config, sled_agent_facilities, zone_facilities, &log, @@ -607,7 +670,8 @@ impl OmicronZone { )); } - self.resume_shutdown_from_stop( + resume_shutdown_from_stop( + &self.config, sled_agent_facilities, zone_facilities, &ZoneName::from(running_zone.name()), @@ -615,75 +679,74 @@ impl OmicronZone { ) .await } +} - async fn resume_shutdown_from_stop< - T: SledAgentFacilities, - U: ZoneFacilities, - >( - &self, - sled_agent_facilities: &T, - zone_facilities: &U, - zone_name: &ZoneName<'_>, - log: &Logger, - ) -> Result<(), (PartiallyShutDownState, ZoneShutdownError)> { - if let Err(err) = zone_facilities.halt_zone(zone_name, log).await { +async fn resume_shutdown_from_stop< + T: SledAgentFacilities, + U: ZoneFacilities, +>( + config: &OmicronZoneConfig, + sled_agent_facilities: &T, + zone_facilities: &U, + zone_name: &ZoneName<'_>, + log: &Logger, +) -> Result<(), (PartiallyShutDownState, ZoneShutdownError)> { + if let Err(err) = zone_facilities.halt_zone(zone_name, log).await { + warn!( + log, + "Failed to stop running zone"; + InlineErrorChain::new(&err), + ); + return Err(( + PartiallyShutDownState::FailedToStop(zone_name.to_static()), + err, + )); + } + + resume_shutdown_from_cleanup( + config, + sled_agent_facilities, + zone_facilities, + log, + ) + .await +} + +async fn resume_shutdown_from_cleanup< + T: SledAgentFacilities, + U: ZoneFacilities, +>( + config: &OmicronZoneConfig, + sled_agent_facilities: &T, + zone_facilities: &U, + log: &Logger, +) -> Result<(), (PartiallyShutDownState, ZoneShutdownError)> { + // Special teardown for internal DNS zones: delete the global zone + // address we created for it, and tell DDM to stop advertising the + // prefix of that address. + if let OmicronZoneType::InternalDns { + gz_address, gz_address_index, .. + } = &config.zone_type + { + let addrobj = AddrObject::new( + &sled_agent_facilities.underlay_vnic().0, + &internal_dns_addrobj_name(*gz_address_index), + ) + .expect("internal DNS address object name is well-formed"); + if let Err(err) = zone_facilities.delete_gz_address(addrobj).await { warn!( log, - "Failed to stop running zone"; + "Failed to delete internal-dns gz address"; InlineErrorChain::new(&err), ); - return Err(( - PartiallyShutDownState::FailedToStop(zone_name.to_static()), - err, - )); + return Err((PartiallyShutDownState::FailedToCleanUp, err)); } - self.resume_shutdown_from_cleanup( - sled_agent_facilities, - zone_facilities, - log, - ) - .await + sled_agent_facilities + .ddm_remove_internal_dns_prefix(Ipv6Subnet::new(*gz_address)); } - async fn resume_shutdown_from_cleanup< - T: SledAgentFacilities, - U: ZoneFacilities, - >( - &self, - sled_agent_facilities: &T, - zone_facilities: &U, - log: &Logger, - ) -> Result<(), (PartiallyShutDownState, ZoneShutdownError)> { - // Special teardown for internal DNS zones: delete the global zone - // address we created for it, and tell DDM to stop advertising the - // prefix of that address. - if let OmicronZoneType::InternalDns { - gz_address, - gz_address_index, - .. 
- } = &self.config.zone_type - { - let addrobj = AddrObject::new( - &sled_agent_facilities.underlay_vnic().0, - &internal_dns_addrobj_name(*gz_address_index), - ) - .expect("internal DNS address object name is well-formed"); - if let Err(err) = zone_facilities.delete_gz_address(addrobj).await { - warn!( - log, - "Failed to delete internal-dns gz address"; - InlineErrorChain::new(&err), - ); - return Err((PartiallyShutDownState::FailedToCleanUp, err)); - } - - sled_agent_facilities - .ddm_remove_internal_dns_prefix(Ipv6Subnet::new(*gz_address)); - } - - Ok(()) - } + Ok(()) } fn internal_dns_addrobj_name(gz_address_index: u32) -> String { From 8c73e8ed79eaf2f374a7faa375c6776a22967baa Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 23 May 2025 16:32:58 -0400 Subject: [PATCH 3/9] add tests for "shutdown before starting" --- .../src/reconciler_task/zones.rs | 157 +++++++++++++++++- 1 file changed, 151 insertions(+), 6 deletions(-) diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 0a17ad659a9..a76d092b8d4 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -810,7 +810,7 @@ enum ZoneState { FailedToStart(ZoneStartError), } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] struct ZoneName<'a>(Cow<'a, str>); impl<'a> From<&'a str> for ZoneName<'a> { @@ -819,6 +819,12 @@ impl<'a> From<&'a str> for ZoneName<'a> { } } +impl From for ZoneName<'static> { + fn from(value: String) -> Self { + Self(Cow::Owned(value)) + } +} + impl ZoneName<'_> { fn new(config: &OmicronZoneConfig) -> Self { Self(Cow::Owned(config.zone_name())) @@ -933,6 +939,7 @@ impl ZoneFacilities for RealZoneFacilities { mod tests { use super::*; use crate::CurrentlyManagedZpoolsReceiver; + use assert_matches::assert_matches; use anyhow::anyhow; use camino_tempfile::Utf8TempDir; use illumos_utils::dladm::Etherstub; @@ -1007,11 +1014,17 @@ mod tests { #[derive(Debug, Default)] struct FakeZoneFacilitiesInner { + existing_zones: BTreeSet, halt_responses: Option>>, removed_gz_addresses: BTreeSet, } impl FakeZoneFacilities { + fn push_existing_zone(&self, name: String) { + let mut inner = self.inner.lock().unwrap(); + inner.existing_zones.insert(name); + } + fn push_halt_response(&self, response: Result<(), ZoneShutdownError>) { let mut inner = self.inner.lock().unwrap(); inner.halt_responses.get_or_insert_default().push_back(response); @@ -1021,14 +1034,15 @@ mod tests { impl ZoneFacilities for FakeZoneFacilities { async fn zone_with_name_exists( &self, - _name: &ZoneName<'_>, + name: &ZoneName<'_>, ) -> Result { - Ok(false) + let inner = self.inner.lock().unwrap(); + Ok(inner.existing_zones.contains(&*name.0)) } async fn halt_zone( &self, - _zone: &ZoneName<'_>, + zone: &ZoneName<'_>, _log: &Logger, ) -> Result<(), ZoneShutdownError> { // If a test has called `push_halt_response`, respsect that; @@ -1036,9 +1050,18 @@ mod tests { let mut inner = self.inner.lock().unwrap(); match inner.halt_responses.as_mut() { Some(resp) => { - resp.pop_front().expect("have a response for halt_zone()") + let resp = resp + .pop_front() + .expect("have a response for halt_zone()"); + if resp.is_ok() { + inner.existing_zones.remove(&*zone.0); + } + resp + } + None => { + inner.existing_zones.remove(&*zone.0); + Ok(()) } - None => Ok(()), } } @@ -1312,6 +1335,128 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn 
start_zone_stops_preexisting_zones() { + let logctx = dev::test_setup_log("start_zone_stops_preexisting_zones"); + + // Construct a zone we want to start. + let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); + let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + let currently_managed_zpools = + CurrentlyManagedZpoolsReceiver::fake_static( + desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), + ) + .current(); + + // Configure our fake zone facilities to report a zone with this name as + // already running. + let sled_agent_facilities = FakeSledAgentFacilities::default(); + let zone_facilities = FakeZoneFacilities::default(); + zone_facilities.push_existing_zone(fake_zone.zone_name()); + + let mut zones = + OmicronZones::new(nonexistent_mount_config(), TimeSyncConfig::Skip); + + // Set up our fake sled-agent to return success once the old zone has + // been halted. + let fake_zone_builder = FakeZoneBuilder::new(); + sled_agent_facilities.push_start_response(Ok(fake_zone_builder + .make_running_zone("test", logctx.log.clone()) + .await)); + + // Start zones: this should halt the preexisting zone. + zones + .start_zones_if_needed_impl( + &desired_zones, + &sled_agent_facilities, + &zone_facilities, + true, + ¤tly_managed_zpools, + &logctx.log, + ) + .await; + + assert_eq!( + zone_facilities.inner.lock().unwrap().existing_zones, + BTreeSet::new() + ); + + assert_eq!(zones.zones.len(), 1); + let zone_should_be_running = + zones.zones.get(&fake_zone.id).expect("zone is present"); + assert_eq!( + zone_should_be_running.config, + *desired_zones.get(&fake_zone.id).unwrap() + ); + match &zone_should_be_running.state { + ZoneState::Running(_) => (), + other => panic!("unexpected zone state: {other:?}"), + } + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn start_zone_fails_if_halting_preexisting_zone_fails() { + let logctx = dev::test_setup_log( + "start_zone_fails_if_halting_preexisting_zone_fails", + ); + + // Construct a zone we want to start. + let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); + let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + let currently_managed_zpools = + CurrentlyManagedZpoolsReceiver::fake_static( + desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), + ) + .current(); + + // Configure our fake zone facilities to report a zone with this name as + // already running, and configure halting this zone to fail. + let zone_facilities = FakeZoneFacilities::default(); + zone_facilities.push_existing_zone(fake_zone.zone_name()); + zone_facilities.push_halt_response(Err( + ZoneShutdownError::UntrackMetrics(anyhow!("boom")), + )); + let sled_agent_facilities = FakeSledAgentFacilities::default(); + + let mut zones = + OmicronZones::new(nonexistent_mount_config(), TimeSyncConfig::Skip); + + // Start zones: this should try and fail to halt the preexisting zone. + zones + .start_zones_if_needed_impl( + &desired_zones, + &sled_agent_facilities, + &zone_facilities, + true, + ¤tly_managed_zpools, + &logctx.log, + ) + .await; + + assert_eq!( + zone_facilities.inner.lock().unwrap().existing_zones, + [fake_zone.zone_name()].into_iter().collect::>(), + ); + + assert_eq!(zones.zones.len(), 1); + let zone = zones.zones.get(&fake_zone.id).expect("zone is present"); + assert_eq!(zone.config, *desired_zones.get(&fake_zone.id).unwrap()); + + // The zone should now be in the "partially shut down" state. 
+ match &zone.state { + ZoneState::PartiallyShutDown { state, err } => { + assert_matches!(state, PartiallyShutDownState::FailedToStop(_)); + let err = InlineErrorChain::new(err).to_string(); + assert!(err.contains("boom"), "unexpected error: {err}"); + } + other => panic!("unexpected zone state: {other:?}"), + } + + logctx.cleanup_successful(); + } + #[tokio::test] async fn shutdown_dns_does_dns_specific_cleanup() { let logctx = From 5578ba19dec7c0c2880e0c5ad67d05086ddc4976 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 23 May 2025 16:34:00 -0400 Subject: [PATCH 4/9] remove ServiceManager::ensure_removed() --- sled-agent/src/services.rs | 95 -------------------------------------- 1 file changed, 95 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 0400aff0b3f..87e305ce2bd 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -3283,12 +3283,6 @@ impl ServiceManager { time_is_synchronized: bool, all_u2_pools: &[ZpoolName], ) -> Result { - // Ensure the zone has been fully removed before we try to boot it. - // - // This ensures that old "partially booted/stopped" zones do not - // interfere with our installation. - self.ensure_removed(&zone).await?; - // If this zone requires timesync and we aren't ready, fail it early. if zone.zone_type.requires_timesync() && !time_is_synchronized { return Err(Error::TimeNotSynchronized); @@ -3322,95 +3316,6 @@ impl ServiceManager { Ok(runtime) } - // Ensures that if a zone is about to be installed, it does not exist. - async fn ensure_removed( - &self, - zone_config: &OmicronZoneConfig, - ) -> Result<(), Error> { - let zone_name = zone_config.zone_name(); - match self.inner.system_api.zones().find(&zone_name).await { - Ok(Some(zone)) => { - warn!( - self.inner.log, - "removing zone"; - "zone" => &zone_name, - "state" => ?zone.state(), - ); - // NOTE: We might want to tell the sled-agent's metrics task to - // stop tracking any links in this zone. However, we don't have - // very easy access to them, without running a command in the - // zone. These links are about to be deleted, and the metrics - // task will expire them after a while anyway, but it might be - // worth the trouble to do that in the future. - if let Err(e) = self - .inner - .system_api - .zones() - .halt_and_remove_logged(&self.inner.log, &zone_name) - .await - { - error!( - self.inner.log, - "Failed to remove zone"; - "zone" => &zone_name, - InlineErrorChain::new(&e), - ); - return Err(Error::ZoneRemoval { - zone_name: zone_name.to_string(), - err: e, - }); - } - if let Err(e) = - self.clean_up_after_zone_shutdown(zone_config).await - { - error!( - self.inner.log, - "Failed to clean up after removing zone"; - "zone" => &zone_name, - InlineErrorChain::new(&e), - ); - return Err(e); - } - Ok(()) - } - Ok(None) => Ok(()), - Err(err) => Err(Error::ZoneList(err)), - } - } - - // Perform any outside-the-zone cleanup required after shutting down a zone. - async fn clean_up_after_zone_shutdown( - &self, - zone: &OmicronZoneConfig, - ) -> Result<(), Error> { - // Special teardown for internal DNS zones: delete the global zone - // address we created for it, and tell DDM to stop advertising the - // prefix of that address. - if let OmicronZoneType::InternalDns { - gz_address, - gz_address_index, - .. 
- } = &zone.zone_type - { - let addrobj = AddrObject::new( - &self.inner.underlay_vnic.0, - &internal_dns_addrobj_name(*gz_address_index), - ) - .expect("internal DNS address object name is well-formed"); - Zones::delete_address(None, &addrobj).await.map_err(|err| { - Error::ZoneCleanup { - zone_name: zone.zone_name(), - err: Box::new(err), - } - })?; - - self.ddm_reconciler() - .remove_internal_dns_subnet(Ipv6Subnet::new(*gz_address)); - } - - Ok(()) - } - // Returns a zone filesystem mountpoint, after ensuring that U.2 storage // is valid. async fn validate_storage_and_pick_mountpoint( From 5d4ecbc34d9702f184d162adda2b95efe59244fa Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 23 May 2025 17:12:46 -0400 Subject: [PATCH 5/9] validate zones' dataset dependencies before starting them --- nexus-sled-agent-shared/src/inventory.rs | 40 +++++++++ .../src/dataset_serialization_task.rs | 8 ++ .../config-reconciler/src/reconciler_task.rs | 1 + .../src/reconciler_task/datasets.rs | 83 +++++++++++++++++++ .../src/reconciler_task/zones.rs | 55 +++++++++++- sled-agent/src/lib.rs | 1 - sled-agent/src/params.rs | 70 ---------------- sled-agent/src/services.rs | 2 +- 8 files changed, 187 insertions(+), 73 deletions(-) delete mode 100644 sled-agent/src/params.rs diff --git a/nexus-sled-agent-shared/src/inventory.rs b/nexus-sled-agent-shared/src/inventory.rs index 4a775d7ab39..dbec9a2d069 100644 --- a/nexus-sled-agent-shared/src/inventory.rs +++ b/nexus-sled-agent-shared/src/inventory.rs @@ -12,6 +12,7 @@ use chrono::{DateTime, Utc}; use daft::Diffable; use id_map::IdMap; use id_map::IdMappable; +use omicron_common::disk::{DatasetKind, DatasetName}; use omicron_common::ledger::Ledgerable; use omicron_common::{ api::{ @@ -327,6 +328,10 @@ impl OmicronZoneConfig { Some(self.id), ) } + + pub fn dataset_name(&self) -> Option { + self.zone_type.dataset_name() + } } /// Describes a persistent ZFS dataset associated with an Omicron zone @@ -613,6 +618,41 @@ impl OmicronZoneType { | OmicronZoneType::Oximeter { .. } => None, } } + + /// If this kind of zone has an associated dataset, return the dataset's + /// name. Otherwise, return `None`. + pub fn dataset_name(&self) -> Option { + let (dataset, dataset_kind) = match self { + OmicronZoneType::BoundaryNtp { .. } + | OmicronZoneType::InternalNtp { .. } + | OmicronZoneType::Nexus { .. } + | OmicronZoneType::Oximeter { .. } + | OmicronZoneType::CruciblePantry { .. } => None, + OmicronZoneType::Clickhouse { dataset, .. } => { + Some((dataset, DatasetKind::Clickhouse)) + } + OmicronZoneType::ClickhouseKeeper { dataset, .. } => { + Some((dataset, DatasetKind::ClickhouseKeeper)) + } + OmicronZoneType::ClickhouseServer { dataset, .. } => { + Some((dataset, DatasetKind::ClickhouseServer)) + } + OmicronZoneType::CockroachDb { dataset, .. } => { + Some((dataset, DatasetKind::Cockroach)) + } + OmicronZoneType::Crucible { dataset, .. } => { + Some((dataset, DatasetKind::Crucible)) + } + OmicronZoneType::ExternalDns { dataset, .. } => { + Some((dataset, DatasetKind::ExternalDns)) + } + OmicronZoneType::InternalDns { dataset, .. } => { + Some((dataset, DatasetKind::InternalDns)) + } + }?; + + Some(DatasetName::new(dataset.pool_name, dataset_kind)) + } } /// Like [`OmicronZoneType`], but without any associated data. 
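
For reference, a minimal sketch of how a caller can use the `dataset_name()` helper added to `OmicronZoneConfig` above (assumes the omicron workspace crates are available; the wrapper function here is hypothetical, for illustration only):

    use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
    use omicron_common::disk::DatasetName;

    // Returns the zone's durable dataset, if its kind has one. Zone kinds
    // without a durable dataset (Nexus, NTP, Oximeter, Crucible Pantry)
    // yield `None`; the rest combine `dataset.pool_name` with the
    // kind-specific `DatasetKind`.
    fn durable_dataset(zone: &OmicronZoneConfig) -> Option<DatasetName> {
        zone.dataset_name()
    }
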
diff --git a/sled-agent/config-reconciler/src/dataset_serialization_task.rs b/sled-agent/config-reconciler/src/dataset_serialization_task.rs index 67576c897f7..05556c3abf5 100644 --- a/sled-agent/config-reconciler/src/dataset_serialization_task.rs +++ b/sled-agent/config-reconciler/src/dataset_serialization_task.rs @@ -175,6 +175,14 @@ impl IdMappable for DatasetEnsureResult { pub(crate) struct DatasetTaskHandle(mpsc::Sender); impl DatasetTaskHandle { + // For testing, create a handle on which requests will always fail with a + // `DatasetTaskError`. + #[cfg(any(test, feature = "testing"))] + pub(crate) fn spawn_noop() -> Self { + let (tx, _rx) = mpsc::channel(1); + Self(tx) + } + pub fn spawn_dataset_task( mount_config: Arc, currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver, diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 76cb11cf706..42a1b700415 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -465,6 +465,7 @@ impl ReconcilerTask { &sled_config.zones, sled_agent_facilities, timesync_status.is_synchronized(), + &self.datasets, ¤tly_managed_zpools, &self.log, ) diff --git a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs index cf97ee28589..d31a0532fe8 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs @@ -13,7 +13,10 @@ use crate::dataset_serialization_task::DatasetTaskHandle; use id_map::IdMap; use id_map::IdMappable; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; +use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use omicron_common::disk::DatasetConfig; +use omicron_common::disk::DatasetKind; +use omicron_common::disk::DatasetName; use omicron_uuid_kinds::DatasetUuid; use slog::Logger; use slog::warn; @@ -21,6 +24,18 @@ use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::sync::Arc; +#[derive(Debug, thiserror::Error)] +pub(super) enum ZoneDatasetDependencyError { + #[error("zone config is missing a filesystem pool")] + MissingFilesystemPool, + #[error( + "zone's transient root dataset is not available: {}", .0.full_name(), + )] + TransientZoneDatasetNotAvailable(DatasetName), + #[error("zone's durable dataset is not available: {}", .0.full_name())] + DurableDatasetNotAvailable(DatasetName), +} + #[derive(Debug)] pub(super) struct OmicronDatasets { datasets: IdMap, @@ -28,10 +43,78 @@ pub(super) struct OmicronDatasets { } impl OmicronDatasets { + #[cfg(any(test, feature = "testing"))] + pub(super) fn with_datasets(datasets: I) -> Self + where + I: Iterator)>, + { + let dataset_task = DatasetTaskHandle::spawn_noop(); + let datasets = datasets + .map(|(config, result)| OmicronDataset { + config, + state: match result { + Ok(()) => DatasetState::Ensured, + Err(err) => DatasetState::FailedToEnsure(Arc::new(err)), + }, + }) + .collect(); + Self { datasets, dataset_task } + } + pub(super) fn new(dataset_task: DatasetTaskHandle) -> Self { Self { datasets: IdMap::default(), dataset_task } } + pub(super) fn validate_zone_dependencies( + &self, + zone: &OmicronZoneConfig, + ) -> Result<(), ZoneDatasetDependencyError> { + // Confirm that there's an ensured `TransientZoneRoot` dataset on this + // zone's filesystem pool. 
+ let Some(filesystem_pool) = zone.filesystem_pool else { + return Err(ZoneDatasetDependencyError::MissingFilesystemPool); + }; + + let filesystem_dataset_name = DatasetName::new( + filesystem_pool, + DatasetKind::TransientZone { name: zone.zone_name() }, + ); + + // TODO-cleanup It would be nicer if the zone included the filesystem + // dataset ID, so we could just do a lookup here instead of searching. + if !self.datasets.iter().any(|d| { + matches!(d.state, DatasetState::Ensured) + && d.config.name == filesystem_dataset_name + }) { + return Err( + ZoneDatasetDependencyError::TransientZoneDatasetNotAvailable( + filesystem_dataset_name, + ), + ); + } + + // Confirm that the durable dataset for this zone has been ensured, if + // it has one. + let Some(durable_dataset) = zone.dataset_name() else { + return Ok(()); + }; + + // TODO-cleanup As above, if we had an ID we could look up instead of + // searching. + if !self.datasets.iter().any(|d| { + matches!(d.state, DatasetState::Ensured) + && d.config.name == durable_dataset + }) { + return Err( + ZoneDatasetDependencyError::DurableDatasetNotAvailable( + durable_dataset, + ), + ); + } + + Ok(()) + } + pub(super) fn remove_datasets_if_needed( &mut self, datasets: &IdMap, diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index a76d092b8d4..a2b983da6e9 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -42,6 +42,8 @@ use std::str::FromStr as _; use std::sync::Arc; use super::CurrentlyManagedZpools; +use super::OmicronDatasets; +use super::datasets::ZoneDatasetDependencyError; #[derive(Debug, Clone)] pub enum TimeSyncStatus { @@ -248,6 +250,7 @@ impl OmicronZones { desired_zones: &IdMap, sled_agent_facilities: &T, is_time_synchronized: bool, + datasets: &OmicronDatasets, all_u2_pools: &CurrentlyManagedZpools, log: &Logger, ) { @@ -256,6 +259,7 @@ impl OmicronZones { sled_agent_facilities, &RealZoneFacilities, is_time_synchronized, + datasets, all_u2_pools, log, ) @@ -271,6 +275,7 @@ impl OmicronZones { sled_agent_facilities: &T, zone_facilities: &U, is_time_synchronized: bool, + datasets: &OmicronDatasets, all_u2_pools: &CurrentlyManagedZpools, log: &Logger, ) { @@ -325,6 +330,7 @@ impl OmicronZones { sled_agent_facilities, zone_facilities, is_time_synchronized, + datasets, &all_u2_pools, log, ) @@ -348,6 +354,7 @@ impl OmicronZones { sled_agent_facilities: &T, zone_facilities: &U, is_time_synchronized: bool, + datasets: &OmicronDatasets, all_u2_pools: &[ZpoolName], log: &Logger, ) -> Result { @@ -366,6 +373,9 @@ impl OmicronZones { return Ok(state); } + // Ensure the dataset dependencies of this zone are okay. + datasets.validate_zone_dependencies(zone)?; + // The zone is not running - start it. 
match sled_agent_facilities .start_omicron_zone( @@ -850,6 +860,8 @@ enum PartiallyShutDownState { enum ZoneStartError { #[error("could not determine whether zone already exists")] CheckZoneExists(#[from] CheckZoneExistsError), + #[error(transparent)] + DatasetDependency(#[from] ZoneDatasetDependencyError), #[error("sled agent failed to start service")] SledAgentStartFailed(#[source] anyhow::Error), } @@ -939,8 +951,8 @@ impl ZoneFacilities for RealZoneFacilities { mod tests { use super::*; use crate::CurrentlyManagedZpoolsReceiver; - use assert_matches::assert_matches; use anyhow::anyhow; + use assert_matches::assert_matches; use camino_tempfile::Utf8TempDir; use illumos_utils::dladm::Etherstub; use illumos_utils::dladm::EtherstubVnic; @@ -952,7 +964,12 @@ mod tests { use nexus_sled_agent_shared::inventory::OmicronZoneDataset; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use omicron_common::address::SLED_PREFIX; + use omicron_common::disk::DatasetConfig; + use omicron_common::disk::DatasetKind; + use omicron_common::disk::DatasetName; + use omicron_common::disk::SharedDatasetConfig; use omicron_test_utils::dev; + use omicron_uuid_kinds::DatasetUuid; use omicron_uuid_kinds::ZpoolUuid; use std::collections::BTreeSet; use std::collections::VecDeque; @@ -1169,6 +1186,35 @@ mod tests { } } + // Helper to build an `OmicronDatasets` for the given zone config. + fn make_datasets<'a>( + zones: impl Iterator, + ) -> OmicronDatasets { + let mut datasets = Vec::new(); + for zone in zones { + if let Some(pool) = zone.filesystem_pool { + datasets.push(DatasetConfig { + id: DatasetUuid::new_v4(), + name: DatasetName::new( + pool, + DatasetKind::TransientZone { name: zone.zone_name() }, + ), + inner: SharedDatasetConfig::default(), + }); + } + if let Some(dataset) = zone.dataset_name() { + datasets.push(DatasetConfig { + id: DatasetUuid::new_v4(), + name: dataset, + inner: SharedDatasetConfig::default(), + }); + } + } + OmicronDatasets::with_datasets( + datasets.into_iter().map(|d| (d, Ok(()))), + ) + } + #[tokio::test] async fn shutdown_retries_after_failed_halt() { let logctx = dev::test_setup_log("shutdown_retries_after_failed_halt"); @@ -1259,6 +1305,7 @@ mod tests { let fake_zone_id = OmicronZoneUuid::new_v4(); let desired_zones: IdMap<_> = [make_zone_config(fake_zone_id)].into_iter().collect(); + let datasets = make_datasets(desired_zones.iter()); let currently_managed_zpools = CurrentlyManagedZpoolsReceiver::fake_static( desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), @@ -1280,6 +1327,7 @@ mod tests { &sled_agent_facilities, &zone_facilities, true, + &datasets, ¤tly_managed_zpools, &logctx.log, ) @@ -1315,6 +1363,7 @@ mod tests { &sled_agent_facilities, &zone_facilities, true, + &datasets, ¤tly_managed_zpools, &logctx.log, ) @@ -1342,6 +1391,7 @@ mod tests { // Construct a zone we want to start. let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + let datasets = make_datasets(desired_zones.iter()); let currently_managed_zpools = CurrentlyManagedZpoolsReceiver::fake_static( desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), @@ -1371,6 +1421,7 @@ mod tests { &sled_agent_facilities, &zone_facilities, true, + &datasets, ¤tly_managed_zpools, &logctx.log, ) @@ -1405,6 +1456,7 @@ mod tests { // Construct a zone we want to start. 
let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + let datasets = make_datasets(desired_zones.iter()); let currently_managed_zpools = CurrentlyManagedZpoolsReceiver::fake_static( desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), @@ -1430,6 +1482,7 @@ mod tests { &sled_agent_facilities, &zone_facilities, true, + &datasets, ¤tly_managed_zpools, &logctx.log, ) diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index 97d1493467d..4e7c27ea052 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -28,7 +28,6 @@ mod instance_manager; mod long_running_tasks; mod metrics; mod nexus; -pub mod params; mod probe_manager; mod profile; pub mod rack_setup; diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs deleted file mode 100644 index 56411092a70..00000000000 --- a/sled-agent/src/params.rs +++ /dev/null @@ -1,70 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -use nexus_sled_agent_shared::inventory::{OmicronZoneConfig, OmicronZoneType}; -use omicron_common::disk::{DatasetKind, DatasetName}; -pub use sled_hardware::DendriteAsic; -use std::net::SocketAddrV6; - -/// Extension trait for `OmicronZoneType` and `OmicronZoneConfig`. -/// -/// This lives here because it requires extra dependencies that -/// nexus-sled-agent-shared doesn't have. -pub(crate) trait OmicronZoneTypeExt { - fn as_omicron_zone_type(&self) -> &OmicronZoneType; - - /// If this kind of zone has an associated dataset, return the dataset's name. - /// Otherwise, return `None`. - fn dataset_name(&self) -> Option { - self.dataset_name_and_address().map(|(name, _)| name) - } - - /// If this kind of zone has an associated dataset, return the dataset's name - /// and the associated "service address". Otherwise, return `None`. - fn dataset_name_and_address(&self) -> Option<(DatasetName, SocketAddrV6)> { - let (dataset, dataset_kind, address) = match self.as_omicron_zone_type() - { - OmicronZoneType::BoundaryNtp { .. } - | OmicronZoneType::InternalNtp { .. } - | OmicronZoneType::Nexus { .. } - | OmicronZoneType::Oximeter { .. } - | OmicronZoneType::CruciblePantry { .. } => None, - OmicronZoneType::Clickhouse { dataset, address, .. } => { - Some((dataset, DatasetKind::Clickhouse, address)) - } - OmicronZoneType::ClickhouseKeeper { dataset, address, .. } => { - Some((dataset, DatasetKind::ClickhouseKeeper, address)) - } - OmicronZoneType::ClickhouseServer { dataset, address, .. } => { - Some((dataset, DatasetKind::ClickhouseServer, address)) - } - OmicronZoneType::CockroachDb { dataset, address, .. } => { - Some((dataset, DatasetKind::Cockroach, address)) - } - OmicronZoneType::Crucible { dataset, address, .. } => { - Some((dataset, DatasetKind::Crucible, address)) - } - OmicronZoneType::ExternalDns { dataset, http_address, .. } => { - Some((dataset, DatasetKind::ExternalDns, http_address)) - } - OmicronZoneType::InternalDns { dataset, http_address, .. 
} => { - Some((dataset, DatasetKind::InternalDns, http_address)) - } - }?; - - Some((DatasetName::new(dataset.pool_name, dataset_kind), *address)) - } -} - -impl OmicronZoneTypeExt for OmicronZoneType { - fn as_omicron_zone_type(&self) -> &OmicronZoneType { - self - } -} - -impl OmicronZoneTypeExt for OmicronZoneConfig { - fn as_omicron_zone_type(&self) -> &OmicronZoneType { - &self.zone_type - } -} diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 87e305ce2bd..ecddba0ed6d 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -32,7 +32,6 @@ use crate::bootstrap::early_networking::{ use crate::config::SidecarRevision; use crate::ddm_reconciler::DdmReconciler; use crate::metrics::MetricsRequestQueue; -use crate::params::{DendriteAsic, OmicronZoneTypeExt}; use crate::profile::*; use anyhow::anyhow; use camino::{Utf8Path, Utf8PathBuf}; @@ -99,6 +98,7 @@ use rand::prelude::SliceRandom; use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::sled::SWITCH_ZONE_BASEBOARD_FILE; use sled_agent_zone_images::ZoneImageSourceResolver; +use sled_hardware::DendriteAsic; use sled_hardware::SledMode; use sled_hardware::is_gimlet; use sled_hardware::underlay; From e4e1a7685562f1bcaa777c79339dae450315726a Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 23 May 2025 17:43:52 -0400 Subject: [PATCH 6/9] add tests for zone dataset validation --- .../src/reconciler_task/zones.rs | 240 ++++++++++++++++-- 1 file changed, 223 insertions(+), 17 deletions(-) diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index a2b983da6e9..1c37dec7fca 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -951,6 +951,7 @@ impl ZoneFacilities for RealZoneFacilities { mod tests { use super::*; use crate::CurrentlyManagedZpoolsReceiver; + use crate::dataset_serialization_task::DatasetEnsureError; use anyhow::anyhow; use assert_matches::assert_matches; use camino_tempfile::Utf8TempDir; @@ -1186,33 +1187,67 @@ mod tests { } } - // Helper to build an `OmicronDatasets` for the given zone config. 
- fn make_datasets<'a>( - zones: impl Iterator, - ) -> OmicronDatasets { - let mut datasets = Vec::new(); - for zone in zones { - if let Some(pool) = zone.filesystem_pool { - datasets.push(DatasetConfig { + #[derive(Default)] + struct DatasetsBuilder { + datasets: Vec<(DatasetConfig, Result<(), DatasetEnsureError>)>, + } + + impl DatasetsBuilder { + fn push_root( + &mut self, + zone: &OmicronZoneConfig, + result: Result<(), DatasetEnsureError>, + ) { + let Some(pool) = zone.filesystem_pool else { + return; + }; + self.datasets.push(( + DatasetConfig { id: DatasetUuid::new_v4(), name: DatasetName::new( pool, DatasetKind::TransientZone { name: zone.zone_name() }, ), inner: SharedDatasetConfig::default(), - }); - } - if let Some(dataset) = zone.dataset_name() { - datasets.push(DatasetConfig { + }, + result, + )); + } + + fn push_durable( + &mut self, + zone: &OmicronZoneConfig, + result: Result<(), DatasetEnsureError>, + ) { + let Some(dataset) = zone.dataset_name() else { + return; + }; + self.datasets.push(( + DatasetConfig { id: DatasetUuid::new_v4(), name: dataset, inner: SharedDatasetConfig::default(), - }); - } + }, + result, + )); } - OmicronDatasets::with_datasets( - datasets.into_iter().map(|d| (d, Ok(()))), - ) + + fn build(self) -> OmicronDatasets { + OmicronDatasets::with_datasets(self.datasets.into_iter()) + } + } + + // Helper to build an all-dependencies-met `OmicronDatasets` for the given + // zone config. + fn make_datasets<'a>( + zones: impl Iterator, + ) -> OmicronDatasets { + let mut builder = DatasetsBuilder::default(); + for zone in zones { + builder.push_root(zone, Ok(())); + builder.push_durable(zone, Ok(())); + } + builder.build() } #[tokio::test] @@ -1576,4 +1611,175 @@ mod tests { logctx.cleanup_successful(); } + + #[tokio::test] + async fn start_zone_fails_if_missing_root_dataset() { + let logctx = + dev::test_setup_log("start_zone_fails_if_missing_root_dataset"); + + // Construct a zone we want to start. + let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); + let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + + // datasets0: missing root dataset entirely + let datasets0 = { + let mut builder = DatasetsBuilder::default(); + for zone in &desired_zones { + builder.push_durable(zone, Ok(())); + } + builder.build() + }; + + // datasets1: root exists but failed to ensure + let datasets1 = { + let mut builder = DatasetsBuilder::default(); + for zone in &desired_zones { + builder.push_root( + zone, + Err(DatasetEnsureError::TestError("boom")), + ); + builder.push_durable(zone, Ok(())); + } + builder.build() + }; + + let currently_managed_zpools = + CurrentlyManagedZpoolsReceiver::fake_static( + desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), + ) + .current(); + + let zone_facilities = FakeZoneFacilities::default(); + let sled_agent_facilities = FakeSledAgentFacilities::default(); + + // Both dataset variations should fail the same way. + for datasets in [&datasets0, &datasets1] { + let mut zones = OmicronZones::new( + nonexistent_mount_config(), + TimeSyncConfig::Skip, + ); + + zones + .start_zones_if_needed_impl( + &desired_zones, + &sled_agent_facilities, + &zone_facilities, + true, + datasets, + ¤tly_managed_zpools, + &logctx.log, + ) + .await; + + assert_eq!(zones.zones.len(), 1); + let zone = zones.zones.get(&fake_zone.id).expect("zone is present"); + assert_eq!(zone.config, *desired_zones.get(&fake_zone.id).unwrap()); + + // The zone should now be in the "partially shut down" state. 
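+            // (The zone never started at all here, so the state checked below
+            // is `ZoneState::FailedToStart` with a
+            // `TransientZoneDatasetNotAvailable` dataset-dependency error.)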
+ match &zone.state { + ZoneState::FailedToStart(err) => { + assert_matches!( + err, + ZoneStartError::DatasetDependency( + ZoneDatasetDependencyError::TransientZoneDatasetNotAvailable(_) + ) + ); + } + other => panic!("unexpected zone state: {other:?}"), + } + } + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn start_zone_fails_if_missing_durable_dataset() { + let logctx = + dev::test_setup_log("start_zone_fails_if_missing_durable_dataset"); + + // Construct a zone we want to start, using a zone type that has a + // durable dataset. + let fake_zone = OmicronZoneConfig { + id: OmicronZoneUuid::new_v4(), + filesystem_pool: Some(ZpoolName::new_external(ZpoolUuid::new_v4())), + zone_type: OmicronZoneType::Crucible { + address: "[::1]:0".parse().unwrap(), + dataset: OmicronZoneDataset { + pool_name: ZpoolName::new_external(ZpoolUuid::new_v4()), + }, + }, + image_source: OmicronZoneImageSource::InstallDataset, + }; + let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + + // datasets0: missing durable dataset entirely + let datasets0 = { + let mut builder = DatasetsBuilder::default(); + for zone in &desired_zones { + builder.push_root(zone, Ok(())); + } + builder.build() + }; + + // datasets1: durable exists but failed to ensure + let datasets1 = { + let mut builder = DatasetsBuilder::default(); + for zone in &desired_zones { + builder.push_root(zone, Ok(())); + builder.push_durable( + zone, + Err(DatasetEnsureError::TestError("boom")), + ); + } + builder.build() + }; + + let currently_managed_zpools = + CurrentlyManagedZpoolsReceiver::fake_static( + desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), + ) + .current(); + + let zone_facilities = FakeZoneFacilities::default(); + let sled_agent_facilities = FakeSledAgentFacilities::default(); + + // Both dataset variations should fail the same way. + for datasets in [&datasets0, &datasets1] { + let mut zones = OmicronZones::new( + nonexistent_mount_config(), + TimeSyncConfig::Skip, + ); + + zones + .start_zones_if_needed_impl( + &desired_zones, + &sled_agent_facilities, + &zone_facilities, + true, + datasets, + ¤tly_managed_zpools, + &logctx.log, + ) + .await; + + assert_eq!(zones.zones.len(), 1); + let zone = zones.zones.get(&fake_zone.id).expect("zone is present"); + assert_eq!(zone.config, *desired_zones.get(&fake_zone.id).unwrap()); + + // The zone should now be in the "partially shut down" state. 
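+            // (Here too the zone never started, so the state checked below is
+            // `ZoneState::FailedToStart` with a `DurableDatasetNotAvailable`
+            // dataset-dependency error.)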
+ match &zone.state { + ZoneState::FailedToStart(err) => { + assert_matches!( + err, + ZoneStartError::DatasetDependency( + ZoneDatasetDependencyError::DurableDatasetNotAvailable(_) + ) + ); + } + other => panic!("unexpected zone state: {other:?}"), + } + } + + logctx.cleanup_successful(); + } } From cff4f7b45c24274a10f2054ef2554ddcb27e0b18 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 27 May 2025 11:03:56 -0400 Subject: [PATCH 7/9] move most start_omicron_zone work to config-reconciler --- .../config-reconciler/src/reconciler_task.rs | 3 - .../src/reconciler_task/datasets.rs | 42 +++- .../src/reconciler_task/zones.rs | 113 +++++------ .../src/sled_agent_facilities.rs | 7 +- sled-agent/src/services.rs | 180 +----------------- sled-agent/src/sled_agent.rs | 14 +- 6 files changed, 105 insertions(+), 254 deletions(-) diff --git a/sled-agent/config-reconciler/src/reconciler_task.rs b/sled-agent/config-reconciler/src/reconciler_task.rs index 42a1b700415..7ee407e64d1 100644 --- a/sled-agent/config-reconciler/src/reconciler_task.rs +++ b/sled-agent/config-reconciler/src/reconciler_task.rs @@ -458,15 +458,12 @@ impl ReconcilerTask { // the old instance. match zone_shutdown_result { Ok(()) => { - let currently_managed_zpools = - self.external_disks.currently_managed_zpools(); self.zones .start_zones_if_needed( &sled_config.zones, sled_agent_facilities, timesync_status.is_synchronized(), &self.datasets, - ¤tly_managed_zpools, &self.log, ) .await; diff --git a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs index d31a0532fe8..88167e80da7 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs @@ -12,12 +12,16 @@ use crate::dataset_serialization_task::DatasetEnsureResult; use crate::dataset_serialization_task::DatasetTaskHandle; use id_map::IdMap; use id_map::IdMappable; +use illumos_utils::zpool::PathInPool; +use illumos_utils::zpool::ZpoolOrRamdisk; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use omicron_common::disk::DatasetConfig; use omicron_common::disk::DatasetKind; use omicron_common::disk::DatasetName; use omicron_uuid_kinds::DatasetUuid; +use sled_storage::config::MountConfig; +use sled_storage::dataset::ZONE_DATASET; use slog::Logger; use slog::warn; use slog_error_chain::InlineErrorChain; @@ -65,13 +69,20 @@ impl OmicronDatasets { Self { datasets: IdMap::default(), dataset_task } } - pub(super) fn validate_zone_dependencies( + /// Confirm that any dataset dependencies of `zone` have been ensured + /// successfully, returning the path for the zone's filesystem root. + pub(super) fn validate_zone_storage( &self, zone: &OmicronZoneConfig, - ) -> Result<(), ZoneDatasetDependencyError> { + mount_config: &MountConfig, + ) -> Result { // Confirm that there's an ensured `TransientZoneRoot` dataset on this // zone's filesystem pool. let Some(filesystem_pool) = zone.filesystem_pool else { + // This should never happen: Reconfigurator guarantees all zones + // have filesystem pools. 
`filesystem_pool` is non-optional in + // blueprints; we should make it non-optional in `OmicronZoneConfig` + // too: https://github.com/oxidecomputer/omicron/issues/8216 return Err(ZoneDatasetDependencyError::MissingFilesystemPool); }; @@ -82,6 +93,7 @@ impl OmicronDatasets { // TODO-cleanup It would be nicer if the zone included the filesystem // dataset ID, so we could just do a lookup here instead of searching. + // https://github.com/oxidecomputer/omicron/issues/7214 if !self.datasets.iter().any(|d| { matches!(d.state, DatasetState::Ensured) && d.config.name == filesystem_dataset_name @@ -93,10 +105,18 @@ impl OmicronDatasets { ); } + let zone_root_path = PathInPool { + pool: ZpoolOrRamdisk::Zpool(filesystem_pool), + // TODO-cleanup Should we get this path from the dataset we found + // above? + path: filesystem_pool + .dataset_mountpoint(&mount_config.root, ZONE_DATASET), + }; + // Confirm that the durable dataset for this zone has been ensured, if // it has one. let Some(durable_dataset) = zone.dataset_name() else { - return Ok(()); + return Ok(zone_root_path); }; // TODO-cleanup As above, if we had an ID we could look up instead of @@ -112,7 +132,21 @@ impl OmicronDatasets { ); } - Ok(()) + // TODO-correctness Before we moved the logic of this method here, it + // was performed by + // `ServiceManager::validate_storage_and_pick_mountpoint()`. That method + // did not have access to `self.datasets`, so it issued an explicit + // `zfs` command to check the `zoned`, `canmount`, and `encryption` + // properties of the durable dataset. Should we: + // + // * Do that here + // * Do nothing here, as the dataset task should have have already done + // this + // * Check the dataset config properties? (All three of the properties + // are implied by the dataset kind, so this is probably equivalent to + // "do nothing") + + Ok(zone_root_path) } pub(super) fn remove_datasets_if_needed( diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 1c37dec7fca..55fc044bedd 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -20,7 +20,6 @@ use illumos_utils::zone::AdmError; use illumos_utils::zone::Api as _; use illumos_utils::zone::DeleteAddressError; use illumos_utils::zone::Zones; -use illumos_utils::zpool::ZpoolName; use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use nexus_sled_agent_shared::inventory::OmicronZoneType; @@ -41,7 +40,6 @@ use std::num::NonZeroUsize; use std::str::FromStr as _; use std::sync::Arc; -use super::CurrentlyManagedZpools; use super::OmicronDatasets; use super::datasets::ZoneDatasetDependencyError; @@ -251,7 +249,6 @@ impl OmicronZones { sled_agent_facilities: &T, is_time_synchronized: bool, datasets: &OmicronDatasets, - all_u2_pools: &CurrentlyManagedZpools, log: &Logger, ) { self.start_zones_if_needed_impl( @@ -260,7 +257,6 @@ impl OmicronZones { &RealZoneFacilities, is_time_synchronized, datasets, - all_u2_pools, log, ) .await @@ -276,7 +272,6 @@ impl OmicronZones { zone_facilities: &U, is_time_synchronized: bool, datasets: &OmicronDatasets, - all_u2_pools: &CurrentlyManagedZpools, log: &Logger, ) { // Filter desired zones down to just those that we need to start. See @@ -323,7 +318,6 @@ impl OmicronZones { }); // Build up the futures for starting each zone. 
- let all_u2_pools = all_u2_pools.clone().into_vec(); let start_futures = zones_to_start.map(|zone| { self.start_single_zone( zone, @@ -331,7 +325,6 @@ impl OmicronZones { zone_facilities, is_time_synchronized, datasets, - &all_u2_pools, log, ) .map(move |result| (zone.clone(), result)) @@ -355,7 +348,6 @@ impl OmicronZones { zone_facilities: &U, is_time_synchronized: bool, datasets: &OmicronDatasets, - all_u2_pools: &[ZpoolName], log: &Logger, ) -> Result { // Ensure no zone by this name exists. This should only happen in the @@ -373,17 +365,18 @@ impl OmicronZones { return Ok(state); } - // Ensure the dataset dependencies of this zone are okay. - datasets.validate_zone_dependencies(zone)?; + // Ensure that time is sync'd, if needed by this zone. + if zone.zone_type.requires_timesync() && !is_time_synchronized { + return Err(ZoneStartError::TimeNotSynchronized); + } + + // Ensure all dataset dependencies of this zone are okay. + let zone_root_path = + datasets.validate_zone_storage(zone, &self.mount_config)?; // The zone is not running - start it. match sled_agent_facilities - .start_omicron_zone( - zone, - &self.mount_config, - is_time_synchronized, - all_u2_pools, - ) + .start_omicron_zone(zone, zone_root_path) .await { Ok(running_zone) => Ok(ZoneState::Running(Arc::new(running_zone))), @@ -860,6 +853,8 @@ enum PartiallyShutDownState { enum ZoneStartError { #[error("could not determine whether zone already exists")] CheckZoneExists(#[from] CheckZoneExistsError), + #[error("Time not yet synchronized")] + TimeNotSynchronized, #[error(transparent)] DatasetDependency(#[from] ZoneDatasetDependencyError), #[error("sled agent failed to start service")] @@ -950,7 +945,6 @@ impl ZoneFacilities for RealZoneFacilities { #[cfg(test)] mod tests { use super::*; - use crate::CurrentlyManagedZpoolsReceiver; use crate::dataset_serialization_task::DatasetEnsureError; use anyhow::anyhow; use assert_matches::assert_matches; @@ -1130,9 +1124,7 @@ mod tests { async fn start_omicron_zone( &self, _zone_config: &OmicronZoneConfig, - _mount_config: &MountConfig, - _is_time_synchronized: bool, - _all_u2_pools: &[ZpoolName], + _zone_root_path: PathInPool, ) -> anyhow::Result { let mut inner = self.inner.lock().unwrap(); inner @@ -1341,11 +1333,6 @@ mod tests { let desired_zones: IdMap<_> = [make_zone_config(fake_zone_id)].into_iter().collect(); let datasets = make_datasets(desired_zones.iter()); - let currently_managed_zpools = - CurrentlyManagedZpoolsReceiver::fake_static( - desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), - ) - .current(); // Configure our fake sled-agent to fail to start a zone. let sled_agent_facilities = FakeSledAgentFacilities::default(); @@ -1363,7 +1350,6 @@ mod tests { &zone_facilities, true, &datasets, - ¤tly_managed_zpools, &logctx.log, ) .await; @@ -1399,7 +1385,6 @@ mod tests { &zone_facilities, true, &datasets, - ¤tly_managed_zpools, &logctx.log, ) .await; @@ -1427,11 +1412,6 @@ mod tests { let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); let datasets = make_datasets(desired_zones.iter()); - let currently_managed_zpools = - CurrentlyManagedZpoolsReceiver::fake_static( - desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), - ) - .current(); // Configure our fake zone facilities to report a zone with this name as // already running. 
@@ -1457,7 +1437,6 @@ mod tests { &zone_facilities, true, &datasets, - ¤tly_managed_zpools, &logctx.log, ) .await; @@ -1482,6 +1461,54 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn start_zone_fails_if_time_not_synced_when_required() { + let logctx = dev::test_setup_log( + "start_zone_fails_if_time_not_synced_when_required", + ); + + // Construct a zone we want to start, of a type that requires time to be + // sync'd. + let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); + assert!(fake_zone.zone_type.requires_timesync()); + let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); + let datasets = make_datasets(desired_zones.iter()); + + let zone_facilities = FakeZoneFacilities::default(); + let sled_agent_facilities = FakeSledAgentFacilities::default(); + + let mut zones = OmicronZones::new( + nonexistent_mount_config(), + TimeSyncConfig::Normal, + ); + + // Start zones: this should refuse to start the zone. + zones + .start_zones_if_needed_impl( + &desired_zones, + &sled_agent_facilities, + &zone_facilities, + false, // is_time_synchronized + &datasets, + &logctx.log, + ) + .await; + + assert_eq!(zones.zones.len(), 1); + let zone = zones.zones.get(&fake_zone.id).expect("zone is present"); + assert_eq!(zone.config, *desired_zones.get(&fake_zone.id).unwrap()); + + // The zone should now be in the expected error state. + match &zone.state { + ZoneState::FailedToStart(err) => { + assert_matches!(err, ZoneStartError::TimeNotSynchronized); + } + other => panic!("unexpected zone state: {other:?}"), + } + + logctx.cleanup_successful(); + } + #[tokio::test] async fn start_zone_fails_if_halting_preexisting_zone_fails() { let logctx = dev::test_setup_log( @@ -1492,11 +1519,6 @@ mod tests { let fake_zone = make_zone_config(OmicronZoneUuid::new_v4()); let desired_zones: IdMap<_> = [fake_zone.clone()].into_iter().collect(); let datasets = make_datasets(desired_zones.iter()); - let currently_managed_zpools = - CurrentlyManagedZpoolsReceiver::fake_static( - desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), - ) - .current(); // Configure our fake zone facilities to report a zone with this name as // already running, and configure halting this zone to fail. 
@@ -1518,7 +1540,6 @@ mod tests { &zone_facilities, true, &datasets, - ¤tly_managed_zpools, &logctx.log, ) .await; @@ -1643,12 +1664,6 @@ mod tests { builder.build() }; - let currently_managed_zpools = - CurrentlyManagedZpoolsReceiver::fake_static( - desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), - ) - .current(); - let zone_facilities = FakeZoneFacilities::default(); let sled_agent_facilities = FakeSledAgentFacilities::default(); @@ -1666,7 +1681,6 @@ mod tests { &zone_facilities, true, datasets, - ¤tly_managed_zpools, &logctx.log, ) .await; @@ -1734,12 +1748,6 @@ mod tests { builder.build() }; - let currently_managed_zpools = - CurrentlyManagedZpoolsReceiver::fake_static( - desired_zones.iter().map(|z| z.filesystem_pool.unwrap()), - ) - .current(); - let zone_facilities = FakeZoneFacilities::default(); let sled_agent_facilities = FakeSledAgentFacilities::default(); @@ -1757,7 +1765,6 @@ mod tests { &zone_facilities, true, datasets, - ¤tly_managed_zpools, &logctx.log, ) .await; diff --git a/sled-agent/config-reconciler/src/sled_agent_facilities.rs b/sled-agent/config-reconciler/src/sled_agent_facilities.rs index 98f3a164cc2..8445b51d21b 100644 --- a/sled-agent/config-reconciler/src/sled_agent_facilities.rs +++ b/sled-agent/config-reconciler/src/sled_agent_facilities.rs @@ -7,12 +7,11 @@ use illumos_utils::dladm::EtherstubVnic; use illumos_utils::running_zone::RunningZone; -use illumos_utils::zpool::ZpoolName; +use illumos_utils::zpool::PathInPool; use nexus_sled_agent_shared::inventory::OmicronZoneConfig; use omicron_common::address::Ipv6Subnet; use omicron_common::address::SLED_PREFIX; use sled_agent_types::zone_bundle::ZoneBundleCause; -use sled_storage::config::MountConfig; use std::future::Future; use tufaceous_artifact::ArtifactHash; @@ -38,9 +37,7 @@ pub trait SledAgentFacilities: Send + Sync + 'static { fn start_omicron_zone( &self, zone_config: &OmicronZoneConfig, - mount_config: &MountConfig, - is_time_synchronized: bool, - all_u2_pools: &[ZpoolName], + zone_root_path: PathInPool, ) -> impl Future> + Send; /// Stop tracking metrics for a zone's datalinks. 
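With this change the `SledAgentFacilities` trait no longer threads the mount config, timesync flag, and U.2 pool list down into sled-agent; the reconciler resolves all of that up front and hands the facilities a single precomputed `PathInPool`. Below is a minimal sketch of the new call shape, assuming the trait is re-exported at the root of the `sled-agent-config-reconciler` crate; the generic helper is illustrative only and is not part of the patch.

    use illumos_utils::running_zone::RunningZone;
    use illumos_utils::zpool::PathInPool;
    use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
    use sled_agent_config_reconciler::SledAgentFacilities;

    // Illustrative helper only: by the time this is called, the reconciler
    // has already checked for a preexisting zone of the same name, verified
    // time sync (if the zone requires it), and validated the zone's dataset
    // dependencies, so the facilities impl just receives the zone config
    // plus the precomputed zone root path.
    async fn start_via_facilities<T: SledAgentFacilities>(
        facilities: &T,
        zone: &OmicronZoneConfig,
        zone_root_path: PathInPool,
    ) -> anyhow::Result<RunningZone> {
        facilities.start_omicron_zone(zone, zone_root_path).await
    }
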
diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index ecddba0ed6d..794963f7199 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -62,7 +62,6 @@ use illumos_utils::{PFEXEC, execute}; use internal_dns_resolver::Resolver; use internal_dns_types::names::BOUNDARY_NTP_DNS_NAME; use internal_dns_types::names::DNS_ZONE; -use itertools::Itertools; use nexus_config::{ConfigDropshotWithTls, DeploymentConfig}; use nexus_sled_agent_shared::inventory::{ OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType, @@ -94,7 +93,6 @@ use omicron_common::disk::{DatasetKind, DatasetName}; use omicron_common::ledger::Ledgerable; use omicron_ddm_admin_client::DdmError; use omicron_uuid_kinds::OmicronZoneUuid; -use rand::prelude::SliceRandom; use sled_agent_config_reconciler::InternalDisksReceiver; use sled_agent_types::sled::SWITCH_ZONE_BASEBOARD_FILE; use sled_agent_zone_images::ZoneImageSourceResolver; @@ -103,8 +101,6 @@ use sled_hardware::SledMode; use sled_hardware::is_gimlet; use sled_hardware::underlay; use sled_hardware_types::Baseboard; -use sled_storage::config::MountConfig; -use sled_storage::dataset::ZONE_DATASET; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; @@ -150,9 +146,6 @@ pub enum Error { #[error("Sled Agent not initialized yet")] SledAgentNotReady, - #[error("No U.2 devices found with a {ZONE_DATASET} mountpoint")] - U2NotFound, - #[error("Switch zone error: {0}")] SwitchZone(anyhow::Error), @@ -195,9 +188,6 @@ pub enum Error { #[error(transparent)] ZoneInstall(#[from] illumos_utils::running_zone::InstallZoneError), - #[error("Failed to initialize zones: {errors:?}")] - ZoneEnsure { errors: Vec<(String, Error)> }, - #[error("Error contacting ddmd: {0}")] DdmError(#[from] DdmError), @@ -228,34 +218,9 @@ pub enum Error { #[error("Failed to get address: {0}")] GetAddressFailure(#[from] illumos_utils::zone::GetAddressError), - #[error( - "Failed to launch zone {zone} because ZFS value cannot be accessed" - )] - GetZfsValue { - zone: String, - #[source] - source: illumos_utils::zfs::GetValueError, - }, - - #[error( - "Cannot launch {zone} with {dataset} (saw {prop_name} = {prop_value}, expected {prop_value_expected})" - )] - DatasetNotReady { - zone: String, - dataset: String, - prop_name: String, - prop_value: String, - prop_value_expected: String, - }, - #[error("NTP zone not ready")] NtpZoneNotReady, - // This isn't exactly "NtpZoneNotReady" -- it can happen when the NTP zone - // is up, but time is still in the process of synchronizing. - #[error("Time not yet synchronized")] - TimeNotSynchronized, - #[error("Execution error: {0}")] ExecutionError(#[from] illumos_utils::ExecutionError), @@ -334,39 +299,6 @@ impl From for omicron_common::api::external::Error { Error::RequestedZoneConfigOutdated { .. } => { omicron_common::api::external::Error::conflict(&err.to_string()) } - Error::TimeNotSynchronized => { - omicron_common::api::external::Error::unavail(&err.to_string()) - } - Error::ZoneEnsure { errors } => { - // As a special case, if any zones failed to timesync, - // prioritize that error. - // - // This conversion to a 503 error was requested in - // https://github.com/oxidecomputer/omicron/issues/4776 , - // and we preserve that behavior here, even though we may - // launch many zones at the same time. 
- if let Some(err) = errors.iter().find_map(|(_, err)| { - if matches!(err, Error::TimeNotSynchronized) { - Some(err) - } else { - None - } - }) { - omicron_common::api::external::Error::unavail( - &err.to_string(), - ) - } else { - let internal_message = errors - .iter() - .map(|(name, err)| { - format!("failed to start {name}: {err:?}") - }) - .join("\n"); - omicron_common::api::external::Error::InternalError { - internal_message, - } - } - } _ => omicron_common::api::external::Error::InternalError { internal_message: err.to_string(), }, @@ -3278,25 +3210,10 @@ impl ServiceManager { // storage configuration against the reality of the current sled. pub(crate) async fn start_omicron_zone( &self, - mount_config: &MountConfig, zone: &OmicronZoneConfig, - time_is_synchronized: bool, - all_u2_pools: &[ZpoolName], + zone_root_path: PathInPool, ) -> Result { - // If this zone requires timesync and we aren't ready, fail it early. - if zone.zone_type.requires_timesync() && !time_is_synchronized { - return Err(Error::TimeNotSynchronized); - } - - // Ensure that this zone's storage is ready. - let zone_root_path = self - .validate_storage_and_pick_mountpoint( - mount_config, - &zone, - &all_u2_pools, - ) - .await?; - + // TODO-john fixme let config = OmicronZoneConfigLocal { zone: zone.clone(), root: zone_root_path.path.clone(), @@ -3316,99 +3233,6 @@ impl ServiceManager { Ok(runtime) } - // Returns a zone filesystem mountpoint, after ensuring that U.2 storage - // is valid. - async fn validate_storage_and_pick_mountpoint( - &self, - mount_config: &MountConfig, - zone: &OmicronZoneConfig, - all_u2_pools: &[ZpoolName], - ) -> Result { - let name = zone.zone_name(); - - // If the caller has requested a specific durable dataset, - // ensure that it is encrypted and that it exists. - // - // Typically, the transient filesystem pool will be placed on the same - // zpool as the durable dataset (to reduce the fault domain), but that - // decision belongs to Nexus, and is not enforced here. - if let Some(dataset) = zone.dataset_name() { - // Check that the dataset is actually ready to be used. - let [zoned, canmount, encryption] = - illumos_utils::zfs::Zfs::get_values( - &dataset.full_name(), - &["zoned", "canmount", "encryption"], - None, - ) - .await - .map_err(|err| Error::GetZfsValue { - zone: zone.zone_name(), - source: err, - })?; - - let check_property = |name, actual, expected| { - if actual != expected { - return Err(Error::DatasetNotReady { - zone: zone.zone_name(), - dataset: dataset.full_name(), - prop_name: String::from(name), - prop_value: actual, - prop_value_expected: String::from(expected), - }); - } - return Ok(()); - }; - check_property("zoned", zoned, "on")?; - check_property("canmount", canmount, "on")?; - if dataset.kind().dataset_should_be_encrypted() { - check_property("encryption", encryption, "aes-256-gcm")?; - } - - let data_pool = dataset.pool(); - if !all_u2_pools.contains(&data_pool) { - warn!( - self.inner.log, - "zone dataset requested on a zpool which doesn't exist"; - "zone" => &name, - "zpool" => %data_pool - ); - return Err(Error::MissingDevice { - device: format!("zpool: {data_pool}"), - }); - } - } - - let filesystem_pool = match (&zone.filesystem_pool, zone.dataset_name()) - { - // If a pool was explicitly requested, use it. - (Some(pool), _) => *pool, - // NOTE: The following cases are for backwards compatibility. - // - // If no pool was selected, prefer to use the same pool as the - // durable dataset. Otherwise, pick one randomly. 
- (None, Some(dataset)) => *dataset.pool(), - (None, None) => *all_u2_pools - .choose(&mut rand::thread_rng()) - .ok_or_else(|| Error::U2NotFound)?, - }; - - if !all_u2_pools.contains(&filesystem_pool) { - warn!( - self.inner.log, - "zone filesystem dataset requested on a zpool which doesn't exist"; - "zone" => &name, - "zpool" => %filesystem_pool - ); - return Err(Error::MissingDevice { - device: format!("zpool: {filesystem_pool}"), - }); - } - let path = filesystem_pool - .dataset_mountpoint(&mount_config.root, ZONE_DATASET); - let pool = ZpoolOrRamdisk::Zpool(filesystem_pool); - Ok(PathInPool { pool, path }) - } - /// Adjust the system boot time to the latest boot time of all zones. fn boottime_rewrite(&self) { // Call out to the 'tmpx' utility program which will rewrite the wtmpx diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 93c0b9e4730..240d00d838b 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -31,7 +31,7 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use illumos_utils::opte::PortManager; use illumos_utils::running_zone::RunningZone; -use illumos_utils::zpool::ZpoolName; +use illumos_utils::zpool::PathInPool; use itertools::Itertools as _; use nexus_sled_agent_shared::inventory::{ Inventory, OmicronSledConfig, OmicronZoneConfig, SledRole, @@ -76,7 +76,6 @@ use sled_hardware::{ }; use sled_hardware_types::Baseboard; use sled_hardware_types::underlay::BootstrapInterface; -use sled_storage::config::MountConfig; use slog::Logger; use slog_error_chain::InlineErrorChain; use sprockets_tls::keys::SprocketsConfig; @@ -1328,18 +1327,11 @@ impl SledAgentFacilities for ReconcilerFacilities { async fn start_omicron_zone( &self, zone_config: &OmicronZoneConfig, - mount_config: &MountConfig, - is_time_synchronized: bool, - all_u2_pools: &[ZpoolName], + zone_root_path: PathInPool, ) -> anyhow::Result { let zone = self .service_manager - .start_omicron_zone( - mount_config, - zone_config, - is_time_synchronized, - all_u2_pools, - ) + .start_omicron_zone(zone_config, zone_root_path) .await?; Ok(zone) } From 386b1021fd741373b52158febc6eda6549a69a38 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 3 Jun 2025 12:17:27 -0400 Subject: [PATCH 8/9] PR feedback --- .../src/reconciler_task/datasets.rs | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs index 88167e80da7..141c3f731c7 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/datasets.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/datasets.rs @@ -86,7 +86,7 @@ impl OmicronDatasets { return Err(ZoneDatasetDependencyError::MissingFilesystemPool); }; - let filesystem_dataset_name = DatasetName::new( + let transient_dataset_name = DatasetName::new( filesystem_pool, DatasetKind::TransientZone { name: zone.zone_name() }, ); @@ -96,11 +96,11 @@ impl OmicronDatasets { // https://github.com/oxidecomputer/omicron/issues/7214 if !self.datasets.iter().any(|d| { matches!(d.state, DatasetState::Ensured) - && d.config.name == filesystem_dataset_name + && d.config.name == transient_dataset_name }) { return Err( ZoneDatasetDependencyError::TransientZoneDatasetNotAvailable( - filesystem_dataset_name, + transient_dataset_name, ), ); } @@ -132,20 +132,6 @@ impl OmicronDatasets { ); } - // TODO-correctness Before we moved the logic of this method here, it - // was performed by - // 
`ServiceManager::validate_storage_and_pick_mountpoint()`. That method - // did not have access to `self.datasets`, so it issued an explicit - // `zfs` command to check the `zoned`, `canmount`, and `encryption` - // properties of the durable dataset. Should we: - // - // * Do that here - // * Do nothing here, as the dataset task should have have already done - // this - // * Check the dataset config properties? (All three of the properties - // are implied by the dataset kind, so this is probably equivalent to - // "do nothing") - Ok(zone_root_path) } From 4b230b334bba288b81706df3f68127febc9e69ec Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 4 Jun 2025 15:59:22 -0400 Subject: [PATCH 9/9] [sled-agent] Remove `OmicronZonesConfigLocal` (#8220) With all the zone start and ledgering moved to `sled-agent-config-reconciler`, we can remove this type entirely from `sled-agent`. I kept the schema check but moved it to the `legacy_configs.rs` module in the config reconciler, where the same structs still exist to allow conversion of the old ledgers -> the new combined ledger. Builds on top of #8219. --- Cargo.lock | 1 + dev-tools/omdb/src/bin/omdb/db.rs | 46 ++- schema/all-zones-requests.json | 17 +- sled-agent/config-reconciler/Cargo.toml | 1 + .../src/ledger/legacy_configs.rs | 12 + .../src/reconciler_task/zones.rs | 14 +- sled-agent/src/services.rs | 281 ++++-------------- 7 files changed, 110 insertions(+), 262 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f47b5decfc..d1f491afb42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11658,6 +11658,7 @@ dependencies = [ "omicron-uuid-kinds", "omicron-workspace-hack", "proptest", + "schemars", "scopeguard", "serde", "serde_json", diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 9ab25e2ae13..5597241d5da 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -7350,30 +7350,28 @@ fn inv_collection_print_sleds(collection: &Collection) { "LAST RECONCILED CONFIG", &last_reconciliation.last_reconciled_config, ); - let disk_errs = collect_config_reconciler_errors( - &last_reconciliation.external_disks, - ); - let dataset_errs = collect_config_reconciler_errors( - &last_reconciliation.datasets, - ); - let zone_errs = collect_config_reconciler_errors( - &last_reconciliation.zones, - ); - for (label, errs) in [ - ("disk", disk_errs), - ("dataset", dataset_errs), - ("zone", zone_errs), - ] { - if errs.is_empty() { - println!(" all {label}s reconciled successfully"); - } else { - println!( - " {} {label} reconciliation errors:", - errs.len() - ); - for err in errs { - println!(" {err}"); - } + } + let disk_errs = collect_config_reconciler_errors( + &last_reconciliation.external_disks, + ); + let dataset_errs = + collect_config_reconciler_errors(&last_reconciliation.datasets); + let zone_errs = + collect_config_reconciler_errors(&last_reconciliation.zones); + for (label, errs) in [ + ("disk", disk_errs), + ("dataset", dataset_errs), + ("zone", zone_errs), + ] { + if errs.is_empty() { + println!(" all {label}s reconciled successfully"); + } else { + println!( + " {} {label} reconciliation errors:", + errs.len() + ); + for err in errs { + println!(" {err}"); } } } diff --git a/schema/all-zones-requests.json b/schema/all-zones-requests.json index 73e2e4de6e0..e9400245755 100644 --- a/schema/all-zones-requests.json +++ b/schema/all-zones-requests.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "title": "OmicronZonesConfigLocal", - "description": 
"Combines the Nexus-provided `OmicronZonesConfig` (which describes what Nexus wants for all of its zones) with the locally-determined configuration for these zones.", + "description": "Legacy type of the ledgered zone config.", "type": "object", "required": [ "ledger_generation", @@ -10,20 +10,10 @@ ], "properties": { "ledger_generation": { - "description": "ledger-managed generation number\n\nThis generation is managed by the ledger facility itself. It's bumped whenever we write a new ledger. In practice, we don't currently have any reason to bump this _for a given Omicron generation_ so it's somewhat redundant. In principle, if we needed to modify the ledgered configuration due to some event that doesn't change the Omicron config (e.g., if we wanted to move the root filesystem to a different path), we could do that by bumping this generation.", - "allOf": [ - { - "$ref": "#/definitions/Generation" - } - ] + "$ref": "#/definitions/Generation" }, "omicron_generation": { - "description": "generation of the Omicron-provided part of the configuration\n\nThis generation number is outside of Sled Agent's control. We store exactly what we were given and use this number to decide when to fail requests to establish an outdated configuration.\n\nYou can think of this as a major version number, with `ledger_generation` being a minor version number. See `is_newer_than()`.", - "allOf": [ - { - "$ref": "#/definitions/Generation" - } - ] + "$ref": "#/definitions/Generation" }, "zones": { "type": "array", @@ -269,7 +259,6 @@ } }, "OmicronZoneConfigLocal": { - "description": "Combines the Nexus-provided `OmicronZoneConfig` (which describes what Nexus wants for this zone) with any locally-determined configuration (like the path to the root filesystem)", "type": "object", "required": [ "root", diff --git a/sled-agent/config-reconciler/Cargo.toml b/sled-agent/config-reconciler/Cargo.toml index 9d173f35861..f5f8a61e4e1 100644 --- a/sled-agent/config-reconciler/Cargo.toml +++ b/sled-agent/config-reconciler/Cargo.toml @@ -45,6 +45,7 @@ expectorate.workspace = true illumos-utils = { workspace = true, features = ["testing"] } omicron-test-utils.workspace = true proptest.workspace = true +schemars.workspace = true scopeguard.workspace = true serde_json.workspace = true sled-storage = { workspace = true, features = ["testing"] } diff --git a/sled-agent/config-reconciler/src/ledger/legacy_configs.rs b/sled-agent/config-reconciler/src/ledger/legacy_configs.rs index 7bd3e9b7c6f..587303397ad 100644 --- a/sled-agent/config-reconciler/src/ledger/legacy_configs.rs +++ b/sled-agent/config-reconciler/src/ledger/legacy_configs.rs @@ -218,6 +218,7 @@ fn merge_old_configs( /// Legacy type of the ledgered zone config. 
#[derive(Debug, Clone, Deserialize, Serialize)] +#[cfg_attr(test, derive(schemars::JsonSchema))] struct OmicronZonesConfigLocal { omicron_generation: Generation, ledger_generation: Generation, @@ -237,9 +238,11 @@ impl Ledgerable for OmicronZonesConfigLocal { } #[derive(Debug, Clone, Deserialize, Serialize)] +#[cfg_attr(test, derive(schemars::JsonSchema))] struct OmicronZoneConfigLocal { zone: OmicronZoneConfig, #[serde(rename = "root")] + #[cfg_attr(test, schemars(with = "String"))] _root: Utf8PathBuf, } @@ -264,6 +267,15 @@ pub(super) mod tests { const MERGED_CONFIG_PATH: &str = "test-data/expectorate/merged-sled-config.json"; + #[test] + fn test_old_config_schema() { + let schema = schemars::schema_for!(OmicronZonesConfigLocal); + expectorate::assert_contents( + "../../schema/all-zones-requests.json", + &serde_json::to_string_pretty(&schema).unwrap(), + ); + } + #[test] fn test_merge_old_configs() { let disks: OmicronPhysicalDisksConfig = { diff --git a/sled-agent/config-reconciler/src/reconciler_task/zones.rs b/sled-agent/config-reconciler/src/reconciler_task/zones.rs index 55fc044bedd..e0a6e370445 100644 --- a/sled-agent/config-reconciler/src/reconciler_task/zones.rs +++ b/sled-agent/config-reconciler/src/reconciler_task/zones.rs @@ -630,7 +630,16 @@ impl OmicronZone { ) .await } - ZoneState::FailedToStart(_) => { + // With these errors, we never even tried to start the zone, so + // there's no cleanup required: we can just return. + ZoneState::FailedToStart(ZoneStartError::TimeNotSynchronized) + | ZoneState::FailedToStart(ZoneStartError::CheckZoneExists(_)) + | ZoneState::FailedToStart(ZoneStartError::DatasetDependency(_)) => { + Ok(()) + } + ZoneState::FailedToStart(ZoneStartError::SledAgentStartFailed( + err, + )) => { // TODO-correctness What do we need to do to try to shut down a // zone that we tried to start? We need fine-grained status of // what startup things succeeded that need to be cleaned up. For @@ -639,7 +648,8 @@ impl OmicronZone { log, "need to shut down zone that failed to start, but this \ is currently unimplemented: assuming no cleanup work \ - required" + required"; + "start-err" => InlineErrorChain::new(err.as_ref()), ); Ok(()) } diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 794963f7199..a60da9210db 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -64,8 +64,7 @@ use internal_dns_types::names::BOUNDARY_NTP_DNS_NAME; use internal_dns_types::names::DNS_ZONE; use nexus_config::{ConfigDropshotWithTls, DeploymentConfig}; use nexus_sled_agent_shared::inventory::{ - OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType, - OmicronZonesConfig, ZoneKind, + OmicronZoneConfig, OmicronZoneImageSource, OmicronZoneType, ZoneKind, }; use omicron_common::address::AZ_PREFIX; use omicron_common::address::DENDRITE_PORT; @@ -90,7 +89,6 @@ use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service_aggressive, }; use omicron_common::disk::{DatasetKind, DatasetName}; -use omicron_common::ledger::Ledgerable; use omicron_ddm_admin_client::DdmError; use omicron_uuid_kinds::OmicronZoneUuid; use sled_agent_config_reconciler::InternalDisksReceiver; @@ -424,96 +422,6 @@ impl RealSystemApi { impl SystemApi for RealSystemApi {} -/// Combines the Nexus-provided `OmicronZonesConfig` (which describes what Nexus -/// wants for all of its zones) with the locally-determined configuration for -/// these zones. 
-#[derive( - Clone, - Debug, - Eq, - PartialEq, - serde::Serialize, - serde::Deserialize, - schemars::JsonSchema, -)] -pub struct OmicronZonesConfigLocal { - /// generation of the Omicron-provided part of the configuration - /// - /// This generation number is outside of Sled Agent's control. We store - /// exactly what we were given and use this number to decide when to - /// fail requests to establish an outdated configuration. - /// - /// You can think of this as a major version number, with - /// `ledger_generation` being a minor version number. See - /// `is_newer_than()`. - pub omicron_generation: Generation, - - /// ledger-managed generation number - /// - /// This generation is managed by the ledger facility itself. It's bumped - /// whenever we write a new ledger. In practice, we don't currently have - /// any reason to bump this _for a given Omicron generation_ so it's - /// somewhat redundant. In principle, if we needed to modify the ledgered - /// configuration due to some event that doesn't change the Omicron config - /// (e.g., if we wanted to move the root filesystem to a different path), we - /// could do that by bumping this generation. - pub ledger_generation: Generation, - pub zones: Vec, -} - -impl Ledgerable for OmicronZonesConfigLocal { - fn is_newer_than(&self, other: &OmicronZonesConfigLocal) -> bool { - self.omicron_generation > other.omicron_generation - || (self.omicron_generation == other.omicron_generation - && self.ledger_generation >= other.ledger_generation) - } - - fn generation_bump(&mut self) { - self.ledger_generation = self.ledger_generation.next(); - } -} - -impl OmicronZonesConfigLocal { - /// Returns the initial configuration for generation 1, which has no zones - pub fn initial() -> OmicronZonesConfigLocal { - OmicronZonesConfigLocal { - omicron_generation: Generation::new(), - ledger_generation: Generation::new(), - zones: vec![], - } - } - - pub fn to_omicron_zones_config(self) -> OmicronZonesConfig { - OmicronZonesConfig { - generation: self.omicron_generation, - zones: self.zones.into_iter().map(|z| z.zone).collect(), - } - } -} - -/// Combines the Nexus-provided `OmicronZoneConfig` (which describes what Nexus -/// wants for this zone) with any locally-determined configuration (like the -/// path to the root filesystem) -// -// NOTE: Although the path to the root filesystem is not exactly equal to the -// ZpoolName, it is derivable from it, and the ZpoolName for the root filesystem -// is now being supplied as a part of OmicronZoneConfig. Therefore, this struct -// is less necessary than it has been historically. 
-#[derive( - Clone, - Debug, - Eq, - PartialEq, - serde::Serialize, - serde::Deserialize, - schemars::JsonSchema, -)] -pub struct OmicronZoneConfigLocal { - pub zone: OmicronZoneConfig, - #[schemars(with = "String")] - pub root: Utf8PathBuf, -} - /// Describes how we want a switch zone to be configured /// /// This is analogous to `OmicronZoneConfig`, but for the switch zone (which is @@ -568,7 +476,7 @@ impl illumos_utils::smf_helper::Service for SwitchService { /// Describes either an Omicron-managed zone or the switch zone, used for /// functions that operate on either one or the other enum ZoneArgs<'a> { - Omicron(&'a OmicronZoneConfigLocal), + Omicron(&'a OmicronZoneConfig), Switch(&'a SwitchZoneConfig), } @@ -576,7 +484,7 @@ impl<'a> ZoneArgs<'a> { /// If this is an Omicron zone, return its type pub fn omicron_type(&self) -> Option<&'a OmicronZoneType> { match self { - ZoneArgs::Omicron(zone_config) => Some(&zone_config.zone.zone_type), + ZoneArgs::Omicron(zone_config) => Some(&zone_config.zone_type), ZoneArgs::Switch(_) => None, } } @@ -1430,12 +1338,11 @@ impl ServiceManager { // dataset into the zone. Additionally, construct a "unique enough" name // so we can create multiple zones of this type without collision. let unique_name = match &request { - ZoneArgs::Omicron(zone_config) => Some(zone_config.zone.id), + ZoneArgs::Omicron(zone_config) => Some(zone_config.id), ZoneArgs::Switch(_) => None, }; let datasets: Vec<_> = match &request { ZoneArgs::Omicron(zone_config) => zone_config - .zone .dataset_name() .map(|n| zone::Dataset { name: n.full_name() }) .into_iter() @@ -1455,7 +1362,7 @@ impl ServiceManager { // are falling back to searching `/opt/oxide` in addition to the install // datasets. let image_source = match &request { - ZoneArgs::Omicron(zone_config) => &zone_config.zone.image_source, + ZoneArgs::Omicron(zone_config) => &zone_config.image_source, ZoneArgs::Switch(_) => &OmicronZoneImageSource::InstallDataset, }; let file_source = self.inner.zone_image_resolver.file_source_for( @@ -1465,7 +1372,7 @@ impl ServiceManager { let zone_type_str = match &request { ZoneArgs::Omicron(zone_config) => { - zone_config.zone.zone_type.kind().zone_prefix() + zone_config.zone_type.kind().zone_prefix() } ZoneArgs::Switch(_) => "switch", }; @@ -1516,12 +1423,8 @@ impl ServiceManager { .add_instance(ServiceInstanceBuilder::new("default")); let running_zone = match &request { - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: OmicronZoneType::Clickhouse { address, .. }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: OmicronZoneType::Clickhouse { address, .. }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -1605,13 +1508,8 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::ClickhouseServer { address, .. }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: OmicronZoneType::ClickhouseServer { address, .. }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -1695,13 +1593,8 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::ClickhouseKeeper { address, .. }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: OmicronZoneType::ClickhouseKeeper { address, .. }, .. 
}) => { let Some(info) = self.inner.sled_info.get() else { @@ -1778,13 +1671,9 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - id: zone_id, - zone_type: OmicronZoneType::CockroachDb { address, .. }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + id: zone_id, + zone_type: OmicronZoneType::CockroachDb { address, .. }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -1849,13 +1738,8 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::Crucible { address, dataset }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: OmicronZoneType::Crucible { address, dataset }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -1907,12 +1791,8 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: OmicronZoneType::CruciblePantry { address }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: OmicronZoneType::CruciblePantry { address }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -1954,13 +1834,9 @@ impl ServiceManager { .map_err(|err| Error::io("crucible pantry profile", err))?; RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - id, - zone_type: OmicronZoneType::Oximeter { address }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + id, + zone_type: OmicronZoneType::Oximeter { address }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -1995,16 +1871,12 @@ impl ServiceManager { })?; RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::ExternalDns { - http_address, - dns_address, - nic, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: + OmicronZoneType::ExternalDns { + http_address, + dns_address, + nic, .. }, .. @@ -2058,17 +1930,13 @@ impl ServiceManager { })?; RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::BoundaryNtp { - address, - dns_servers, - ntp_servers, - domain, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: + OmicronZoneType::BoundaryNtp { + address, + dns_servers, + ntp_servers, + domain, .. }, .. @@ -2146,12 +2014,8 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: OmicronZoneType::InternalNtp { address }, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: OmicronZoneType::InternalNtp { address }, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -2207,17 +2071,13 @@ impl ServiceManager { RunningZone::boot(installed_zone).await? } - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::InternalDns { - http_address, - dns_address, - gz_address, - gz_address_index, - .. - }, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: + OmicronZoneType::InternalDns { + http_address, + dns_address, + gz_address, + gz_address_index, .. }, .. @@ -2302,19 +2162,15 @@ impl ServiceManager { })?; RunningZone::boot(installed_zone).await? 
} - ZoneArgs::Omicron(OmicronZoneConfigLocal { - zone: - OmicronZoneConfig { - zone_type: - OmicronZoneType::Nexus { - internal_address, - external_tls, - external_dns_servers, - .. - }, - id, + ZoneArgs::Omicron(OmicronZoneConfig { + zone_type: + OmicronZoneType::Nexus { + internal_address, + external_tls, + external_dns_servers, .. }, + id, .. }) => { let Some(info) = self.inner.sled_info.get() else { @@ -3194,34 +3050,24 @@ impl ServiceManager { Ok(running_zone) } - // Ensures that a single Omicron zone is running. + // Attempt to start a single Omicron zone. // // This method is NOT idempotent. // - // - If the zone already exists, in any form, it is fully removed - // before being initialized. This is primarily intended to remove "partially - // stopped/started" zones with detritus from interfering with a new zone - // being launched. - // - If zones need time to be synchronized before they are initialized - // (e.g., this is a hard requirement for CockroachDb) they can check the - // `time_is_synchronized` argument. - // - `all_u2_pools` provides a snapshot into durable storage on this sled, - // which gives the storage manager an opportunity to validate the zone's - // storage configuration against the reality of the current sled. + // Callers must do any "pre-zone-start" validation, including: + // + // * No other zone of this same name is still running + // * Time is synchronized, if the zone requires it + // * Any datasets the zone depends on exist and have been configured and/or + // mounted appropriately pub(crate) async fn start_omicron_zone( &self, zone: &OmicronZoneConfig, zone_root_path: PathInPool, ) -> Result { - // TODO-john fixme - let config = OmicronZoneConfigLocal { - zone: zone.clone(), - root: zone_root_path.path.clone(), - }; - let runtime = self .initialize_zone( - ZoneArgs::Omicron(&config), + ZoneArgs::Omicron(zone), zone_root_path, // filesystems= &[], @@ -4093,13 +3939,4 @@ mod test { &serde_json::to_string_pretty(&schema).unwrap(), ); } - - #[test] - fn test_all_zones_requests_schema() { - let schema = schemars::schema_for!(OmicronZonesConfigLocal); - expectorate::assert_contents( - "../schema/all-zones-requests.json", - &serde_json::to_string_pretty(&schema).unwrap(), - ); - } }
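
The caller contract documented on `start_omicron_zone` above means the reconciler is now the single place that turns a zone's `filesystem_pool` into the root path handed to `ServiceManager::start_omicron_zone`. A sketch of that derivation is below, using only types these patches already import; it mirrors what `validate_zone_storage` does but is not the literal reconciler code, and the free-standing `zone_root_for` helper is hypothetical.

    use illumos_utils::zpool::{PathInPool, ZpoolName, ZpoolOrRamdisk};
    use sled_storage::config::MountConfig;
    use sled_storage::dataset::ZONE_DATASET;

    // Sketch: given a zone's validated filesystem pool, compute the
    // transient zone-root mountpoint under the sled's mount config. The
    // reconciler does this only after confirming that the corresponding
    // TransientZoneRoot dataset has been ensured.
    fn zone_root_for(pool: ZpoolName, mount_config: &MountConfig) -> PathInPool {
        let path = pool.dataset_mountpoint(&mount_config.root, ZONE_DATASET);
        PathInPool { pool: ZpoolOrRamdisk::Zpool(pool), path }
    }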