From 51be3f46c11fab0ae5af34af820df6d58f26268a Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Sat, 18 Oct 2025 15:41:09 -0700
Subject: [PATCH 1/4] saga re-assignment should only grab sagas from Nexus of
 same generation

---
 nexus/reconfigurator/execution/src/sagas.rs | 260 ++++++++++++++++++--
 nexus/types/src/deployment.rs               |   2 +-
 2 files changed, 235 insertions(+), 27 deletions(-)

diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs
index 346bc5b7eae..087cf2893d6 100644
--- a/nexus/reconfigurator/execution/src/sagas.rs
+++ b/nexus/reconfigurator/execution/src/sagas.rs
@@ -9,11 +9,36 @@ use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::DataStore;
 use nexus_types::deployment::{Blueprint, BlueprintZoneDisposition};
 use omicron_common::api::external::Error;
-use omicron_uuid_kinds::GenericUuid;
+use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid};
 use slog::{debug, info, warn};
 
-/// For each expunged Nexus zone, re-assign sagas owned by that Nexus to the
-/// specified nexus (`nexus_id`).
+/// For each expunged Nexus zone in the same generation as the current Nexus,
+/// re-assign sagas owned by that Nexus to the specified nexus (`nexus_id`).
+///
+/// Reassigning sagas in this way is how we ensure that sagas complete even
+/// when the Nexus that was running them fails permanently (leading to
+/// expungement).
+///
+/// Reassignment of sagas assigned to any expunged Nexus nodes is a prerequisite
+/// for Nexus handoff (during upgrade). That's because in general, Nexus is
+/// tightly coupled to saga implementations and it's not safe for different
+/// versions of a Nexus to operate on the same saga (even at different times).
+/// For more on this, see RFD 289. As a result, we finish all sagas prior to
+/// handing off from one version to the next.
+///
+/// Strictly speaking, it should not be required to limit re-assignment to Nexus
+/// instances of the same generation. As mentioned above, sagas do not survive
+/// across generations, so the only sagas that exist ought to be from the same
+/// generation that Nexus is running. This is a belt-and-suspenders check put
+/// in place because the impact of getting this wrong (running a saga created by
+/// an older version) could be pretty bad.
+///
+/// We could use the Nexus image for this rather than the generation.
+/// In practice, these should be equivalent here because we bump the Nexus
+/// generation on deployed systems when (and only when) there's a new Nexus
+/// image. But handoff works in terms of generations -- that's the abstraction
+/// we've created for "a group of Nexus instances that speak the same database
+/// version and can exchange sagas". So that's what we use here.
 pub(crate) async fn reassign_sagas_from_expunged(
     opctx: &OpContext,
     datastore: &DataStore,
@@ -22,29 +47,12 @@ pub(crate) async fn reassign_sagas_from_expunged(
 ) -> Result<bool, Error> {
     let log = &opctx.log;
 
-    // Identify any Nexus zones that have been expunged and need to have sagas
-    // re-assigned.
-    //
-    // TODO: Currently, we take any expunged Nexus instances and attempt to
-    // assign all their sagas to ourselves. Per RFD 289, we can only re-assign
-    // sagas between two instances of Nexus that are at the same version. Right
-    // now this can't happen so there's nothing to do here to ensure that
-    // constraint. However, once we support allowing the control plane to be
-    // online _during_ an upgrade, there may be multiple different Nexus
-    // instances running at the same time. At that point, we will need to make
-    // sure that we only ever try to assign ourselves sagas from other Nexus
-    // instances that we know are running the same version as ourselves.
-    let nexus_zone_ids: Vec<_> = blueprint
-        .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
-        .filter_map(|(_, z)| {
-            z.zone_type
-                .is_nexus()
-                .then(|| nexus_db_model::SecId(z.id.into_untyped_uuid()))
-        })
-        .collect();
-
-    debug!(log, "re-assign sagas: found Nexus instances";
-        "nexus_zone_ids" => ?nexus_zone_ids);
+    let nexus_zone_ids = find_expunged_same_generation(blueprint, nexus_id)?;
+    debug!(
+        log,
+        "re-assign sagas: found expunged Nexus instances with matching generation";
+        "nexus_zone_ids" => ?nexus_zone_ids
+    );
 
     let result =
         datastore.sagas_reassign_sec(opctx, &nexus_zone_ids, nexus_id).await;
@@ -68,3 +76,203 @@
         }
     }
 }
+
+/// Returns the list of Nexus ids for expunged (and ready-to-cleanup) Nexus
+/// zones in the same generation as the given Nexus id
+fn find_expunged_same_generation(
+    blueprint: &Blueprint,
+    nexus_id: SecId,
+) -> Result<Vec<SecId>, Error> {
+    let nexus_zone_id = OmicronZoneUuid::from_untyped_uuid(nexus_id.0);
+    let active_nexus_generation =
+        blueprint.find_generation_for_self(nexus_zone_id)?;
+    Ok(blueprint
+        .all_nexus_zones(BlueprintZoneDisposition::is_ready_for_cleanup)
+        .filter_map(|(_sled_id, zone_config, nexus_config)| {
+            (nexus_config.nexus_generation == active_nexus_generation)
+                .then_some(zone_config.id)
+        })
+        .map(|id| SecId(id.into_untyped_uuid()))
+        .collect())
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder;
+    use nexus_reconfigurator_planning::example::ExampleSystemBuilder;
+    use nexus_reconfigurator_planning::planner::PlannerRng;
+    use nexus_types::deployment::BlueprintSource;
+    use omicron_common::api::external::Generation;
+    use omicron_test_utils::dev::test_setup_log;
+    use std::collections::BTreeSet;
+    use uuid::Uuid;
+
+    #[test]
+    fn test_find_expunged_same_generation() {
+        const TEST_NAME: &str = "test_find_expunged_same_generation";
+
+        let logctx = test_setup_log(TEST_NAME);
+        let log = &logctx.log;
+
+        // To do an exhaustive test of `find_expunged_same_generation()`, we
+        // want some expunged zones and some non-expunged zones in each of two
+        // different generations.
+
+        // Frst, create a basic blueprint with several Nexus zones in generation 1.
+        let (example, blueprint1) =
+            ExampleSystemBuilder::new(log, TEST_NAME).nexus_count(4).build();
+        let g1 = Generation::new();
+        let g1_nexus_ids: Vec<_> = blueprint1
+            .all_nexus_zones(BlueprintZoneDisposition::is_in_service)
+            .map(|(sled_id, zone_config, nexus_config)| {
+                assert_eq!(nexus_config.nexus_generation, g1);
+                (sled_id, zone_config.id, zone_config.image_source.clone())
+            })
+            .collect();
+
+        // Expunge two of these Nexus zones and mark them ready for cleanup
+        // immediately.
+        let (g1_expunge_ids, g1_keep_ids) = g1_nexus_ids.split_at(2);
+        let mut builder = BlueprintBuilder::new_based_on(
+            log,
+            &blueprint1,
+            &example.input,
+            &example.collection,
+            "test suite",
+            PlannerRng::from_entropy(),
+        )
+        .expect("new blueprint builder");
+
+        for (sled_id, expunge_id, _image_source) in g1_expunge_ids {
+            builder
+                .sled_expunge_zone(*sled_id, *expunge_id)
+                .expect("expunge zone");
+            builder
+                .sled_mark_expunged_zone_ready_for_cleanup(
+                    *sled_id,
+                    *expunge_id,
+                )
+                .expect("mark zone for cleanup");
+        }
+
+        // Create the same number of Nexus zones in the next generation.
+        // We'll use the same images.
+        let g2 = g1.next();
+        for (sled_id, _zone_id, image_source) in &g1_nexus_ids {
+            builder
+                .sled_add_zone_nexus(*sled_id, image_source.clone(), g2)
+                .expect("add Nexus zone");
+        }
+
+        let blueprint2 = builder.build(BlueprintSource::Test);
+        let g2_nexus_ids: Vec<_> = blueprint2
+            .all_nexus_zones(BlueprintZoneDisposition::is_in_service)
+            .filter_map(|(sled_id, zone_config, nexus_config)| {
+                (nexus_config.nexus_generation == g2)
+                    .then_some((sled_id, zone_config.id))
+            })
+            .collect();
+
+        // Now expunge a few of those, too. This time, only the first is going
+        // to be marked ready for cleanup.
+        let (g2_expunge_ids, g2_keep_ids) = g2_nexus_ids.split_at(2);
+        let mut builder = BlueprintBuilder::new_based_on(
+            log,
+            &blueprint2,
+            &example.input,
+            &example.collection,
+            "test suite",
+            PlannerRng::from_entropy(),
+        )
+        .expect("new blueprint builder");
+
+        let (sled_id, g2_expunged_cleaned_up) = g2_expunge_ids[0];
+        builder
+            .sled_expunge_zone(sled_id, g2_expunged_cleaned_up)
+            .expect("expunge zone");
+        builder
+            .sled_mark_expunged_zone_ready_for_cleanup(
+                g2_expunge_ids[0].0,
+                g2_expunged_cleaned_up,
+            )
+            .expect("mark zone for cleanup");
+
+        let (sled_id, g2_expunged_not_cleaned_up) = g2_expunge_ids[1];
+        builder
+            .sled_expunge_zone(sled_id, g2_expunged_not_cleaned_up)
+            .expect("expunge zone");
+
+        let blueprint3 = builder.build(BlueprintSource::Test);
+
+        // Finally, we have:
+        //
+        // - g1_keep_ids: two in-service Nexus zones in generation 1
+        // - g1_expunge_ids: two expunged Nexus zones in generation 1,
+        //   both cleaned up
+        // - g2_keep_ids: two in-service Nexus zones in generation 2
+        // - g2_expunge_ids: expunged Nexus zones in generation 2,
+        //   only the first of which is ready for cleanup
+        //
+        // Now we can exhaustively test various cases.
+
+        // For the in-service zones in generation 1, we should find the expunged
+        // zones in generation 1.
+        let g1_matched: BTreeSet<SecId> = g1_expunge_ids
+            .into_iter()
+            .map(|(_sled_id, zone_id, _image_source)| {
+                SecId(zone_id.into_untyped_uuid())
+            })
+            .collect();
+        for (_sled_id, zone_id, _image_source) in g1_keep_ids {
+            let matched = find_expunged_same_generation(
+                &blueprint3,
+                SecId(zone_id.into_untyped_uuid()),
+            )
+            .unwrap();
+            assert_eq!(
+                matched.into_iter().collect::<BTreeSet<_>>(),
+                g1_matched
+            );
+        }
+
+        // It should be impossible in a real system for the
+        // expunged-and-ready-to-cleanup zones to execute this function. Being
+        // ready to cleanup means we know they're gone. So there's nothing to
+        // test here.
+
+        // For the in-service zones in generation 2, we should find the
+        // expunged-and-ready-for-cleanup zone in generation 2.
+        let g2_matched = SecId(g2_expunged_cleaned_up.into_untyped_uuid());
+        for (_sled_id, zone_id) in g2_keep_ids {
+            let matched = find_expunged_same_generation(
+                &blueprint3,
+                SecId(zone_id.into_untyped_uuid()),
+            )
+            .unwrap();
+            assert_eq!(matched.len(), 1);
+            assert_eq!(matched[0], g2_matched);
+        }
+
+        // It is possible for the expunged and not-yet-ready-for-cleanup zone in
+        // generation 2 to wind up calling this function. It should not find
+        // itself!
+        let matched = find_expunged_same_generation(
+            &blueprint3,
+            SecId(g2_expunged_not_cleaned_up.into_untyped_uuid()),
+        )
+        .unwrap();
+        assert_eq!(matched.len(), 1);
+        assert_eq!(matched[0], g2_matched);
+
+        // Test the sole error case: if we cannot figure out which generation we
+        // were given.
+        let error =
+            find_expunged_same_generation(&blueprint3, SecId(Uuid::new_v4()))
+                .expect_err("made-up Nexus should not exist");
+        assert!(matches!(error, Error::InternalError { internal_message }
+            if internal_message.contains("did not find Nexus")));
+
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index 11b09f7e5d3..80aff853a10 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -479,7 +479,7 @@
         nexus_id: OmicronZoneUuid,
     ) -> Result<Generation, Error> {
         for (_sled_id, zone_config, nexus_config) in
-            self.all_nexus_zones(BlueprintZoneDisposition::is_in_service)
+            self.all_nexus_zones(BlueprintZoneDisposition::could_be_running)
         {
             if zone_config.id == nexus_id {
                 return Ok(nexus_config.nexus_generation);

From 44a002b97d2e5abccc7b946f23a316ae4848e925 Mon Sep 17 00:00:00 2001
From: David Pacheco
Date: Tue, 4 Nov 2025 09:43:08 -0800
Subject: [PATCH 2/4] fix live test

---
 live-tests/tests/test_nexus_add_remove.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs
index 405c06ecc92..bf78cea0c6b 100644
--- a/live-tests/tests/test_nexus_add_remove.rs
+++ b/live-tests/tests/test_nexus_add_remove.rs
@@ -25,7 +25,7 @@ use nexus_types::deployment::BlueprintZoneType;
 use nexus_types::deployment::PlannerConfig;
 use nexus_types::deployment::SledFilter;
 use nexus_types::deployment::blueprint_zone_type;
-use omicron_common::address::NEXUS_INTERNAL_PORT;
+use omicron_common::address::NEXUS_LOCKSTEP_PORT;
 use omicron_test_utils::dev::poll::CondCheckError;
 use omicron_test_utils::dev::poll::wait_for_condition;
 use slog::{debug, info};
@@ -133,7 +133,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) {
     assert_eq!(new_zone.kind(), ZoneKind::Nexus);
     let new_zone_addr = new_zone.underlay_ip();
     let new_zone_sockaddr =
-        SocketAddrV6::new(new_zone_addr, NEXUS_INTERNAL_PORT, 0, 0);
+        SocketAddrV6::new(new_zone_addr, NEXUS_LOCKSTEP_PORT, 0, 0);
     let new_zone_client = lc.specific_internal_nexus_client(new_zone_sockaddr);
 
     // Wait for the new Nexus zone to show up and be usable.
From d5739b9a4df11e53736fb0c222db592093d72d19 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 4 Nov 2025 10:53:52 -0800 Subject: [PATCH 3/4] more live test fixes --- live-tests/tests/test_nexus_handoff.rs | 25 ++++++++++++------------- nexus/types/src/deployment/zone_type.rs | 11 +++++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index 7c819b47769..0a1d22942c8 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -131,7 +131,9 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { .map(|(id, current)| { ( id, - lc.specific_internal_nexus_client(current.cfg.internal_address), + lc.specific_internal_nexus_client( + current.cfg.lockstep_address(), + ), ) }) .collect(); @@ -193,18 +195,15 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // Find the new Nexus zones and make clients for them. let new_nexus_clients = blueprint_new_nexus - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(_sled_id, z)| { - let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { - nexus_generation, - internal_address, - .. - }) = &z.zone_type - else { - return None; - }; - (*nexus_generation == next_generation).then(|| { - (z.id, lc.specific_internal_nexus_client(*internal_address)) + .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .filter_map(|(_sled_id, zone_cfg, nexus_config)| { + (nexus_config.nexus_generation == next_generation).then(|| { + ( + zone_cfg.id, + lc.specific_internal_nexus_client( + nexus_config.lockstep_address(), + ), + ) }) }) .collect::>(); diff --git a/nexus/types/src/deployment/zone_type.rs b/nexus/types/src/deployment/zone_type.rs index ff58c4bc915..18a81ec7e5c 100644 --- a/nexus/types/src/deployment/zone_type.rs +++ b/nexus/types/src/deployment/zone_type.rs @@ -577,6 +577,17 @@ pub mod blueprint_zone_type { pub nexus_generation: Generation, } + impl Nexus { + pub fn lockstep_address(&self) -> SocketAddrV6 { + SocketAddrV6::new( + *self.internal_address.ip(), + self.lockstep_port, + 0, + 0, + ) + } + } + #[derive( Debug, Clone, From f81bfc541b9f131d54505c67d993da376ff52e75 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 4 Nov 2025 11:01:26 -0800 Subject: [PATCH 4/4] missed a typo --- nexus/reconfigurator/execution/src/sagas.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs index 087cf2893d6..929225056c8 100644 --- a/nexus/reconfigurator/execution/src/sagas.rs +++ b/nexus/reconfigurator/execution/src/sagas.rs @@ -119,7 +119,8 @@ mod test { // want some expunged zones and some non-expunged zones in each of two // different generations. - // Frst, create a basic blueprint with several Nexus zones in generation 1. + // First, create a basic blueprint with several Nexus zones in + // generation 1. let (example, blueprint1) = ExampleSystemBuilder::new(log, TEST_NAME).nexus_count(4).build(); let g1 = Generation::new();
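
A note on the core technique in PATCH 1/4: find_expunged_same_generation() first looks up the calling Nexus's own nexus_generation and then keeps only the expunged, ready-for-cleanup Nexus zones whose generation matches. The sketch below restates that filter with toy stand-in types; ZoneRecord, the integer ids and generations, and the boolean cleanup flag are invented for illustration and are not the omicron types or API.

// Toy model of the generation-matching filter used by
// find_expunged_same_generation() in PATCH 1/4. ZoneRecord and the
// integer ids/generations are illustrative stand-ins, not omicron types.
#[derive(Debug)]
struct ZoneRecord {
    id: u64,
    generation: u64,
    expunged_ready_for_cleanup: bool,
}

// Returns the ids of expunged-and-ready-for-cleanup zones whose generation
// matches the caller's. Returns None if the caller is unknown (analogous to
// the InternalError case in the patch).
fn expunged_same_generation(zones: &[ZoneRecord], self_id: u64) -> Option<Vec<u64>> {
    let self_generation = zones.iter().find(|z| z.id == self_id)?.generation;
    Some(
        zones
            .iter()
            .filter(|z| z.expunged_ready_for_cleanup)
            .filter(|z| z.generation == self_generation)
            .map(|z| z.id)
            .collect(),
    )
}

fn main() {
    let zones = vec![
        ZoneRecord { id: 1, generation: 1, expunged_ready_for_cleanup: true },
        ZoneRecord { id: 2, generation: 1, expunged_ready_for_cleanup: false },
        ZoneRecord { id: 3, generation: 2, expunged_ready_for_cleanup: true },
        ZoneRecord { id: 4, generation: 2, expunged_ready_for_cleanup: false },
    ];
    // A generation-1 caller (zone 2) sees only the generation-1 expunged zone.
    assert_eq!(expunged_same_generation(&zones, 2), Some(vec![1]));
    // A generation-2 caller (zone 4) sees only the generation-2 expunged zone.
    assert_eq!(expunged_same_generation(&zones, 4), Some(vec![3]));
    println!("generation filter ok");
}

As in the test in PATCH 1/4, a generation-1 caller only ever picks up the generation-1 expunged zones, and a generation-2 caller only picks up the generation-2 ones.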