From 828fff35e7bcd4e5194fdfb1e6b7ef9ebb93579b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 21 Feb 2025 13:40:31 -0500 Subject: [PATCH 01/11] clean_up_expunged_zones: act on zones ready for cleanup --- nexus/reconfigurator/execution/src/lib.rs | 20 +++++--------- .../execution/src/omicron_zones.rs | 26 ++++--------------- nexus/types/src/deployment.rs | 6 +++++ 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 19e53dd7fec..5a5ed3bf8a8 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -148,13 +148,12 @@ pub async fn realize_blueprint_with_overrides( sled_list.clone(), ); - let deploy_zones_done = register_cleanup_expunged_zones_step( + register_cleanup_expunged_zones_step( &engine.for_component(ExecutionComponent::OmicronZones), &opctx, datastore, resolver, blueprint, - deploy_zones_done, ); register_decommission_sleds_step( @@ -475,34 +474,27 @@ fn register_cleanup_expunged_zones_step<'a>( datastore: &'a DataStore, resolver: &'a Resolver, blueprint: &'a Blueprint, - deploy_zones_done: StepHandle, -) -> StepHandle { +) { registrar .new_step( ExecutionStepId::Remove, "Cleanup expunged zones", - move |cx| async move { - let done = deploy_zones_done.into_value(cx.token()).await; + move |_cx| async move { omicron_zones::clean_up_expunged_zones( &opctx, datastore, resolver, - // TODO-correctness Once the planner fills in - // `ready_for_cleanup`, we should filter on that here and - // stop requiring `deploy_zones_done`. This is necessary to - // fix omicron#6999. blueprint.all_omicron_zones( - BlueprintZoneDisposition::is_expunged, + BlueprintZoneDisposition::is_ready_for_cleanup, ), - &done, ) .await .map_err(merge_anyhow_list)?; - StepSuccess::new(done).into() + StepSuccess::new(()).into() }, ) - .register() + .register(); } fn register_decommission_sleds_step<'a>( diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 776984988c6..afbcc51c51f 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -18,7 +18,6 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::BlueprintZonesConfig; use omicron_common::address::COCKROACH_ADMIN_PORT; @@ -110,20 +109,12 @@ pub(crate) async fn clean_up_expunged_zones( datastore: &DataStore, resolver: &R, expunged_zones: impl Iterator, - _deploy_zones_done: &DeployZonesDone, ) -> Result<(), Vec> { let errors: Vec = stream::iter(expunged_zones) .filter_map(|(sled_id, config)| async move { - // We expect to only be called with expunged zones; skip any with a - // different disposition. - // - // TODO We should be looking at `ready_for_cleanup` here! But - // currently the planner never sets it to true, so we're dependent - // on `DeployZonesDone` instead. - if !matches!( - config.disposition, - BlueprintZoneDisposition::Expunged { .. } - ) { + // We expect to only be called with expunged zones that are ready + // for cleanup; skip any with a different disposition. 
+ if !config.disposition.is_ready_for_cleanup() { return None; } @@ -360,6 +351,7 @@ mod test { OmicronZoneDataset, OmicronZonesConfig, SledRole, }; use nexus_test_utils_macros::nexus_test; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::{ Blueprint, BlueprintDatasetsConfig, BlueprintPhysicalDisksConfig, BlueprintSledConfig, BlueprintTarget, BlueprintZoneImageSource, @@ -687,7 +679,7 @@ mod test { let crdb_zone = BlueprintZoneConfig { disposition: BlueprintZoneDisposition::Expunged { as_of_generation: Generation::new(), - ready_for_cleanup: false, + ready_for_cleanup: true, }, id: OmicronZoneUuid::new_v4(), filesystem_pool: Some(ZpoolName::new_external(ZpoolUuid::new_v4())), @@ -718,10 +710,6 @@ mod test { } let fake_resolver = FixedResolver(vec![mock_admin.addr()]); - // This is a unit test, so pretend we already successfully called - // deploy_zones. - let deploy_zones_done = DeployZonesDone(()); - // We haven't yet inserted a mapping from zone ID to cockroach node ID // in the db, so trying to clean up the zone should log a warning but // otherwise succeed, without attempting to contact our mock admin @@ -732,7 +720,6 @@ mod test { datastore, &fake_resolver, iter::once((any_sled_id, &crdb_zone)), - &deploy_zones_done, ) .await .expect("unknown node ID: no cleanup"); @@ -779,7 +766,6 @@ mod test { datastore, &fake_resolver, iter::once((any_sled_id, &crdb_zone)), - &deploy_zones_done, ) .await .expect("decommissioned test node"); @@ -811,7 +797,6 @@ mod test { datastore, &fake_resolver, iter::once((any_sled_id, &crdb_zone)), - &deploy_zones_done, ) .await .expect_err("no successful response should result in failure"); @@ -840,7 +825,6 @@ mod test { datastore, &fake_resolver, iter::once((any_sled_id, &crdb_zone)), - &deploy_zones_done, ) .await .expect("decommissioned test node"); diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 51b7a3227a2..b383ced4830 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -812,6 +812,12 @@ impl BlueprintZoneDisposition { } => !ready_for_cleanup, } } + + /// Returns true if `self` indicates the zone is expunged and ready for + /// cleanup. + pub fn is_ready_for_cleanup(self) -> bool { + matches!(self, Self::Expunged { ready_for_cleanup: true, .. 
}) + } } impl fmt::Display for BlueprintZoneDisposition { From bd4a86d3de842c82bf8ac55648bf69754b8166b0 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 21 Feb 2025 13:52:21 -0500 Subject: [PATCH 02/11] support_bundle_fail_expunged: act on zones ready for cleanup --- .../src/db/datastore/support_bundle.rs | 16 ++++++---------- nexus/reconfigurator/execution/src/lib.rs | 13 +++++-------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 96d9b0a726a..6520c823078 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -20,6 +20,7 @@ use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use futures::FutureExt; +use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -215,16 +216,12 @@ impl DataStore { ) -> Result { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - // For this blueprint: The set of all expunged Nexus zones + // For this blueprint: The set of all expunged Nexus zones that are + // ready for cleanup let invalid_nexus_zones = blueprint - .all_omicron_zones( - nexus_types::deployment::BlueprintZoneDisposition::is_expunged, - ) + .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) .filter_map(|(_sled, zone)| { - if matches!( - zone.zone_type, - nexus_types::deployment::BlueprintZoneType::Nexus(_) - ) { + if zone.zone_type.is_nexus() { Some(zone.id.into_untyped_uuid()) } else { None @@ -484,7 +481,6 @@ mod test { use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDatasetFilter; - use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::external::LookupType; use omicron_common::api::internal::shared::DatasetKind::Debug as DebugDatasetKind; @@ -985,7 +981,7 @@ mod test { if zone.id == bundle.assigned_nexus.unwrap().into() { zone.disposition = BlueprintZoneDisposition::Expunged { as_of_generation: *Generation::new(), - ready_for_cleanup: false, + ready_for_cleanup: true, }; } } diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 5a5ed3bf8a8..da538cb4616 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -182,13 +182,12 @@ pub async fn realize_blueprint_with_overrides( blueprint, ); - let deploy_zones_done = register_support_bundle_failure_step( + register_support_bundle_failure_step( &engine.for_component(ExecutionComponent::SupportBundles), &opctx, datastore, blueprint, nexus_id, - deploy_zones_done, ); let reassign_saga_output = register_reassign_sagas_step( @@ -270,14 +269,12 @@ fn register_support_bundle_failure_step<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, nexus_id: OmicronZoneUuid, - deploy_zones_done: StepHandle, -) -> StepHandle { +) { registrar .new_step( ExecutionStepId::Ensure, "Mark support bundles as failed if they rely on an expunged disk or sled", - move |cx| async move { - let done = deploy_zones_done.into_value(cx.token()).await; + move |_cx| async move { datastore .support_bundle_fail_expunged( &opctx, blueprint, nexus_id @@ -285,10 +282,10 @@ fn 
register_support_bundle_failure_step<'a>( .await .map_err(|err| anyhow!(err))?; - StepSuccess::new(done).into() + StepSuccess::new(()).into() }, ) - .register() + .register(); } fn register_sled_list_step<'a>( From e33103a31549b9dc1ed7b1ab7e3212db5c4aedd9 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 21 Feb 2025 14:55:34 -0500 Subject: [PATCH 03/11] reassign_sagas_from_expunged: act on zones ready for cleanup --- nexus/reconfigurator/execution/src/lib.rs | 19 +++++++------------ .../execution/src/omicron_zones.rs | 13 ++++++------- nexus/reconfigurator/execution/src/sagas.rs | 8 +------- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index da538cb4616..9b9f3d34962 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -20,7 +20,6 @@ use nexus_types::identity::Asset; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; -use omicron_zones::DeployZonesDone; use slog::info; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; @@ -125,7 +124,7 @@ pub async fn realize_blueprint_with_overrides( sled_list.clone(), ); - let deploy_zones_done = register_deploy_zones_step( + register_deploy_zones_step( &engine.for_component(ExecutionComponent::OmicronZones), &opctx, blueprint, @@ -196,7 +195,6 @@ pub async fn realize_blueprint_with_overrides( datastore, blueprint, nexus_id, - deploy_zones_done, ); let register_cockroach_output = register_cockroachdb_settings_step( @@ -377,14 +375,14 @@ fn register_deploy_zones_step<'a>( opctx: &'a OpContext, blueprint: &'a Blueprint, sleds: SharedStepHandle>>, -) -> StepHandle { +) { registrar .new_step( ExecutionStepId::Ensure, "Deploy Omicron zones", move |cx| async move { let sleds_by_id = sleds.into_value(cx.token()).await; - let done = omicron_zones::deploy_zones( + omicron_zones::deploy_zones( &opctx, &sleds_by_id, blueprint @@ -395,10 +393,10 @@ fn register_deploy_zones_step<'a>( .await .map_err(merge_anyhow_list)?; - StepSuccess::new(done).into() + StepSuccess::new(()).into() }, ) - .register() + .register(); } fn register_plumb_firewall_rules_step<'a>( @@ -617,7 +615,6 @@ fn register_reassign_sagas_step<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, nexus_id: OmicronZoneUuid, - deploy_zones_done: StepHandle, ) -> StepHandle { // For this and subsequent steps, we'll assume that any errors that we // encounter do *not* require stopping execution. We'll just accumulate @@ -628,15 +625,13 @@ fn register_reassign_sagas_step<'a>( .new_step( ExecutionStepId::Ensure, "Reassign sagas", - move |cx| async move { - let done = deploy_zones_done.into_value(cx.token()).await; - + move |_cx| async move { // For any expunged Nexus zones, re-assign in-progress sagas to // some other Nexus. If this fails for some reason, it doesn't // affect anything else. 
let sec_id = nexus_db_model::SecId::from(nexus_id); let reassigned = sagas::reassign_sagas_from_expunged( - &opctx, datastore, blueprint, sec_id, &done, + &opctx, datastore, blueprint, sec_id, ) .await .context("failed to re-assign sagas"); diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index afbcc51c51f..8fbb5bb4d0a 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -32,18 +32,13 @@ use std::collections::BTreeMap; use std::net::SocketAddr; use std::net::SocketAddrV6; -/// Typestate indicating that the deploy disks step was performed. -#[derive(Debug)] -#[must_use = "token indicating completion of deploy_zones"] -pub(crate) struct DeployZonesDone(()); - /// Idempotently ensure that the specified Omicron zones are deployed to the /// corresponding sleds pub(crate) async fn deploy_zones<'a, I>( opctx: &OpContext, sleds_by_id: &BTreeMap, zones: I, -) -> Result> +) -> Result<(), Vec> where I: Iterator, { @@ -100,7 +95,11 @@ where .collect() .await; - if errors.is_empty() { Ok(DeployZonesDone(())) } else { Err(errors) } + if errors.is_empty() { + Ok(()) + } else { + Err(errors) + } } /// Idempontently perform any cleanup actions necessary for expunged zones. diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs index 92f1b9f8c06..346bc5b7eae 100644 --- a/nexus/reconfigurator/execution/src/sagas.rs +++ b/nexus/reconfigurator/execution/src/sagas.rs @@ -4,7 +4,6 @@ //! Re-assign sagas from expunged Nexus zones -use crate::omicron_zones::DeployZonesDone; use nexus_db_model::SecId; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; @@ -20,7 +19,6 @@ pub(crate) async fn reassign_sagas_from_expunged( datastore: &DataStore, blueprint: &Blueprint, nexus_id: SecId, - _deploy_zones_done: &DeployZonesDone, ) -> Result { let log = &opctx.log; @@ -36,12 +34,8 @@ pub(crate) async fn reassign_sagas_from_expunged( // instances running at the same time. At that point, we will need to make // sure that we only ever try to assign ourselves sagas from other Nexus // instances that we know are running the same version as ourselves. - // - // TODO-correctness Once the planner fills in `ready_for_cleanup`, we should - // filter on that here and stop requiring `deploy_zones_done`. This is - // necessary to fix omicron#6999. 
let nexus_zone_ids: Vec<_> = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) .filter_map(|(_, z)| { z.zone_type .is_nexus() From a347074c8d854550c2ea6b9b8e08c6ffa8b542e3 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 25 Feb 2025 14:32:44 -0500 Subject: [PATCH 04/11] make more step failures not stop execution --- nexus/reconfigurator/execution/src/lib.rs | 40 +++++++++++------------ 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 9b9f3d34962..7461e736f91 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -271,16 +271,21 @@ fn register_support_bundle_failure_step<'a>( registrar .new_step( ExecutionStepId::Ensure, - "Mark support bundles as failed if they rely on an expunged disk or sled", + "Mark support bundles as failed if they rely on \ + an expunged disk or sled", move |_cx| async move { - datastore - .support_bundle_fail_expunged( - &opctx, blueprint, nexus_id - ) + let res = match datastore + .support_bundle_fail_expunged(&opctx, blueprint, nexus_id) .await - .map_err(|err| anyhow!(err))?; - - StepSuccess::new(()).into() + { + Ok(report) => StepSuccess::new(()) + .with_message(format!( + "support bundle expunge report: {report:?}" + )) + .build(), + Err(err) => StepWarning::new((), err.to_string()).build(), + }; + Ok(res) }, ) .register(); @@ -382,7 +387,7 @@ fn register_deploy_zones_step<'a>( "Deploy Omicron zones", move |cx| async move { let sleds_by_id = sleds.into_value(cx.token()).await; - omicron_zones::deploy_zones( + let res = omicron_zones::deploy_zones( &opctx, &sleds_by_id, blueprint @@ -391,9 +396,8 @@ fn register_deploy_zones_step<'a>( .map(|(sled_id, sled)| (*sled_id, &sled.zones_config)), ) .await - .map_err(merge_anyhow_list)?; - - StepSuccess::new(()).into() + .map_err(merge_anyhow_list); + result_to_step_result(res) }, ) .register(); @@ -475,7 +479,7 @@ fn register_cleanup_expunged_zones_step<'a>( ExecutionStepId::Remove, "Cleanup expunged zones", move |_cx| async move { - omicron_zones::clean_up_expunged_zones( + let res = omicron_zones::clean_up_expunged_zones( &opctx, datastore, resolver, @@ -484,9 +488,8 @@ fn register_cleanup_expunged_zones_step<'a>( ), ) .await - .map_err(merge_anyhow_list)?; - - StepSuccess::new(()).into() + .map_err(merge_anyhow_list); + result_to_step_result(res) }, ) .register(); @@ -616,11 +619,6 @@ fn register_reassign_sagas_step<'a>( blueprint: &'a Blueprint, nexus_id: OmicronZoneUuid, ) -> StepHandle { - // For this and subsequent steps, we'll assume that any errors that we - // encounter do *not* require stopping execution. We'll just accumulate - // them and return them all at the end. - // - // TODO We should probably do this with more of the errors above, too. 
registrar .new_step( ExecutionStepId::Ensure, From 1f6020d82ebb2990e95cd93c64052f56c6560659 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 25 Feb 2025 14:36:29 -0500 Subject: [PATCH 05/11] result_to_step_result -> map_err_to_step_warning --- nexus/reconfigurator/execution/src/lib.rs | 37 +++++++++++------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 7461e736f91..e031c8440d4 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -216,18 +216,17 @@ pub async fn realize_blueprint_with_overrides( Ok(output.into_value(result.token()).await) } -// Convert a `Result<(), anyhow::Error>` nto a `StepResult` containing either a -// `StepSuccess` or `StepWarning` and wrap it in `Result::Ok`. +// Convert a `Result<(), anyhow::Error>` into a `StepResult` containing either a +// `StepSuccess` or `StepWarning`. // -// This is necessary because we never want to return an error from execution. -// Doing so stops execution at the errored step and prevents other independent -// steps after the errored step from executing. -fn result_to_step_result( +// Most steps use this to avoid stoping execution at the errored step, which +// would prevent other independent steps after the errored step from executing. +fn map_err_to_step_warning( res: Result<(), anyhow::Error>, -) -> Result, anyhow::Error> { +) -> StepResult<(), ReconfiguratorExecutionSpec> { match res { - Ok(_) => Ok(StepSuccess::new(()).build()), - Err(e) => Ok(StepWarning::new((), e.to_string()).build()), + Ok(_) => StepSuccess::new(()).build(), + Err(e) => StepWarning::new((), e.to_string()).build(), } } @@ -342,7 +341,7 @@ fn register_deploy_disks_step<'a>( ) .await .map_err(merge_anyhow_list); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -369,7 +368,7 @@ fn register_deploy_datasets_step<'a>( ) .await .map_err(merge_anyhow_list); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -397,7 +396,7 @@ fn register_deploy_zones_step<'a>( ) .await .map_err(merge_anyhow_list); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -429,7 +428,7 @@ fn register_plumb_firewall_rules_step<'a>( ) .await .context("failed to plumb service firewall rules to sleds"); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -461,7 +460,7 @@ fn register_dns_records_step<'a>( ) .await .map_err(|e| anyhow!("{}", InlineErrorChain::new(&e))); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -489,7 +488,7 @@ fn register_cleanup_expunged_zones_step<'a>( ) .await .map_err(merge_anyhow_list); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -519,7 +518,7 @@ fn register_decommission_sleds_step<'a>( ) .await .map_err(merge_anyhow_list); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -545,7 +544,7 @@ fn register_decommission_disks_step<'a>( ) .await .map_err(merge_anyhow_list); - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); @@ -572,7 +571,7 @@ fn register_deploy_clickhouse_cluster_nodes_step<'a>( ) .await .map_err(merge_anyhow_list); - return result_to_step_result(res); + return Ok(map_err_to_step_warning(res)); } StepSuccess::new(()).into() @@ -600,7 +599,7 @@ fn register_deploy_clickhouse_single_node_step<'a>( .filter(|(_, z)| 
z.zone_type.is_clickhouse()), ) .await; - result_to_step_result(res) + Ok(map_err_to_step_warning(res)) }, ) .register(); From fc7581cc87f86076cc76b95a46e3cd8afd5dfba9 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 25 Feb 2025 14:37:46 -0500 Subject: [PATCH 06/11] more consistent error reporting: cockroachdb settings step --- nexus/reconfigurator/execution/src/lib.rs | 28 +++++------------------ 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index e031c8440d4..ccd779c7c03 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -197,7 +197,7 @@ pub async fn realize_blueprint_with_overrides( nexus_id, ); - let register_cockroach_output = register_cockroachdb_settings_step( + register_cockroachdb_settings_step( &engine.for_component(ExecutionComponent::Cockroach), &opctx, datastore, @@ -207,7 +207,6 @@ pub async fn realize_blueprint_with_overrides( let output = register_finalize_step( &engine.for_component(ExecutionComponent::Cockroach), reassign_saga_output, - register_cockroach_output, ); // All steps are registered, so execute the engine. @@ -664,34 +663,24 @@ fn register_cockroachdb_settings_step<'a>( opctx: &'a OpContext, datastore: &'a DataStore, blueprint: &'a Blueprint, -) -> StepHandle> { +) { registrar .new_step( ExecutionStepId::Ensure, "Ensure CockroachDB settings", move |_cx| async move { - if let Err(error) = + let res = cockroachdb::ensure_settings(&opctx, datastore, blueprint) - .await - { - // We treat errors as non-fatal here, but we still want to - // log them. It's okay to just log the message here without - // the chain of sources, since we collect the full chain in - // the last step (`register_finalize_step`). - let message = error.to_string(); - StepWarning::new(Some(error), message).into() - } else { - StepSuccess::new(None).into() - } + .await; + Ok(map_err_to_step_warning(res)) }, ) - .register() + .register(); } fn register_finalize_step( registrar: &ComponentRegistrar<'_, '_>, reassign_saga_output: StepHandle, - register_cockroach_output: StepHandle>, ) -> StepHandle { registrar .new_step( @@ -700,13 +689,8 @@ fn register_finalize_step( move |cx| async move { let reassign_saga_output = reassign_saga_output.into_value(cx.token()).await; - let register_cockroach_output = - register_cockroach_output.into_value(cx.token()).await; let mut errors = Vec::new(); - if let Some(error) = register_cockroach_output { - errors.push(error); - } if let Some(error) = reassign_saga_output.error { errors.push(error); } From 5b5bd7fe98dcccee5a29625d127916cb866afd6f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 25 Feb 2025 14:46:40 -0500 Subject: [PATCH 07/11] remove `finalize` step (most steps can emit warnings now) --- nexus/reconfigurator/execution/src/lib.rs | 63 +++-------------------- 1 file changed, 8 insertions(+), 55 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index ccd779c7c03..7154d5e5a96 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -204,15 +204,13 @@ pub async fn realize_blueprint_with_overrides( blueprint, ); - let output = register_finalize_step( - &engine.for_component(ExecutionComponent::Cockroach), - reassign_saga_output, - ); - // All steps are registered, so execute the engine. 
let result = engine.execute().await?; - Ok(output.into_value(result.token()).await) + let needs_saga_recovery = + reassign_saga_output.into_value(result.token()).await; + + Ok(RealizeBlueprintOutput { needs_saga_recovery }) } // Convert a `Result<(), anyhow::Error>` into a `StepResult` containing either a @@ -225,7 +223,7 @@ fn map_err_to_step_warning( ) -> StepResult<(), ReconfiguratorExecutionSpec> { match res { Ok(_) => StepSuccess::new(()).build(), - Err(e) => StepWarning::new((), e.to_string()).build(), + Err(e) => StepWarning::new((), format!("{e:#}")).build(), } } @@ -616,7 +614,7 @@ fn register_reassign_sagas_step<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, nexus_id: OmicronZoneUuid, -) -> StepHandle { +) -> StepHandle { registrar .new_step( ExecutionStepId::Ensure, @@ -633,24 +631,10 @@ fn register_reassign_sagas_step<'a>( .context("failed to re-assign sagas"); match reassigned { Ok(needs_saga_recovery) => { - let output = ReassignSagaOutput { - needs_saga_recovery, - error: None, - }; - StepSuccess::new(output).into() + Ok(StepSuccess::new(needs_saga_recovery).build()) } Err(error) => { - // We treat errors as non-fatal here, but we still want - // to log them. It's okay to just log the message here - // without the chain of sources, since we collect the - // full chain in the last step - // (`register_finalize_step`). - let message = error.to_string(); - let output = ReassignSagaOutput { - needs_saga_recovery: false, - error: Some(error), - }; - StepWarning::new(output, message).into() + Ok(StepWarning::new(false, error.to_string()).build()) } } }, @@ -677,34 +661,3 @@ fn register_cockroachdb_settings_step<'a>( ) .register(); } - -fn register_finalize_step( - registrar: &ComponentRegistrar<'_, '_>, - reassign_saga_output: StepHandle, -) -> StepHandle { - registrar - .new_step( - ExecutionStepId::Finalize, - "Finalize and check for errors", - move |cx| async move { - let reassign_saga_output = - reassign_saga_output.into_value(cx.token()).await; - - let mut errors = Vec::new(); - if let Some(error) = reassign_saga_output.error { - errors.push(error); - } - - if errors.is_empty() { - StepSuccess::new(RealizeBlueprintOutput { - needs_saga_recovery: reassign_saga_output - .needs_saga_recovery, - }) - .into() - } else { - Err(merge_anyhow_list(errors)) - } - }, - ) - .register() -} From eab96e14d826aba4339e2fec726ae6a9283ea204 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 3 Mar 2025 10:05:57 -0500 Subject: [PATCH 08/11] cargo fmt --- nexus/reconfigurator/execution/src/omicron_zones.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 8fbb5bb4d0a..6e538124886 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -95,11 +95,7 @@ where .collect() .await; - if errors.is_empty() { - Ok(()) - } else { - Err(errors) - } + if errors.is_empty() { Ok(()) } else { Err(errors) } } /// Idempontently perform any cleanup actions necessary for expunged zones. 
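Patches 01 through 03 replace the DeployZonesDone handshake between steps with a check on the zone's own disposition: cleanup-oriented steps now filter on BlueprintZoneDisposition::is_ready_for_cleanup. A minimal, self-contained sketch of that filter pattern follows; the enum here is a simplified stand-in for the real BlueprintZoneDisposition, which has more variants and fields than shown.

// Simplified stand-in for nexus_types::deployment::BlueprintZoneDisposition;
// only the pieces relevant to cleanup filtering are modeled here.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ZoneDisposition {
    InService,
    Expunged { ready_for_cleanup: bool },
}

impl ZoneDisposition {
    // Mirrors the helper added in patch 01: expunged *and* marked ready.
    fn is_ready_for_cleanup(self) -> bool {
        matches!(self, Self::Expunged { ready_for_cleanup: true })
    }
}

struct Zone {
    name: &'static str,
    disposition: ZoneDisposition,
}

fn main() {
    let zones = [
        Zone { name: "nexus-a", disposition: ZoneDisposition::InService },
        Zone {
            name: "crdb-b",
            disposition: ZoneDisposition::Expunged { ready_for_cleanup: false },
        },
        Zone {
            name: "crdb-c",
            disposition: ZoneDisposition::Expunged { ready_for_cleanup: true },
        },
    ];

    // Only zones the planner has already marked ready reach cleanup, so the
    // cleanup step no longer needs a DeployZonesDone token as proof that zone
    // deployment ran first.
    let to_clean: Vec<&str> = zones
        .iter()
        .filter(|z| z.disposition.is_ready_for_cleanup())
        .map(|z| z.name)
        .collect();

    assert_eq!(to_clean, ["crdb-c"]);
}

The planner, not the executor, is what flips ready_for_cleanup to true, which is why the steps above no longer have to thread deploy_zones_done handles between one another.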
From b6253ee258b6c2c4ac7042a1dc6564f7f7bd1c6c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 3 Mar 2025 10:11:54 -0500 Subject: [PATCH 09/11] comments and dead code removal --- nexus/reconfigurator/execution/src/lib.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 7154d5e5a96..494ef692cfc 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -602,12 +602,7 @@ fn register_deploy_clickhouse_single_node_step<'a>( .register(); } -#[derive(Debug)] -struct ReassignSagaOutput { - needs_saga_recovery: bool, - error: Option, -} - +// Returns a boolean indicating whether saga recovery is needed. fn register_reassign_sagas_step<'a>( registrar: &ComponentRegistrar<'_, 'a>, opctx: &'a OpContext, From be02ab4540ff96bd26cad2f8d9fb2553b7f3d9ed Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 4 Mar 2025 10:16:15 -0500 Subject: [PATCH 10/11] typo --- nexus/reconfigurator/execution/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index 494ef692cfc..a8d7343748c 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -216,7 +216,7 @@ pub async fn realize_blueprint_with_overrides( // Convert a `Result<(), anyhow::Error>` into a `StepResult` containing either a // `StepSuccess` or `StepWarning`. // -// Most steps use this to avoid stoping execution at the errored step, which +// Most steps use this to avoid stopping execution at the errored step, which // would prevent other independent steps after the errored step from executing. 
fn map_err_to_step_warning( res: Result<(), anyhow::Error>, From 7924b9b86175faeadbcca489b4fcf9af77bc426c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 4 Mar 2025 11:18:19 -0500 Subject: [PATCH 11/11] fix test (put zones failure doesn't fail entire execution anymore) --- nexus/reconfigurator/execution/src/lib.rs | 10 +- .../background/tasks/blueprint_execution.rs | 97 +++++++++---------- nexus/types/src/deployment/execution/spec.rs | 6 +- 3 files changed, 51 insertions(+), 62 deletions(-) diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index a8d7343748c..db232d15240 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -266,7 +266,7 @@ fn register_support_bundle_failure_step<'a>( ) { registrar .new_step( - ExecutionStepId::Ensure, + ExecutionStepId::Cleanup, "Mark support bundles as failed if they rely on \ an expunged disk or sled", move |_cx| async move { @@ -472,7 +472,7 @@ fn register_cleanup_expunged_zones_step<'a>( ) { registrar .new_step( - ExecutionStepId::Remove, + ExecutionStepId::Cleanup, "Cleanup expunged zones", move |_cx| async move { let res = omicron_zones::clean_up_expunged_zones( @@ -499,7 +499,7 @@ fn register_decommission_sleds_step<'a>( ) { registrar .new_step( - ExecutionStepId::Remove, + ExecutionStepId::Cleanup, "Decommission sleds", move |_cx| async move { let res = sled_state::decommission_sleds( @@ -529,7 +529,7 @@ fn register_decommission_disks_step<'a>( ) { registrar .new_step( - ExecutionStepId::Remove, + ExecutionStepId::Cleanup, "Decommission expunged disks", move |_cx| async move { let res = omicron_physical_disks::decommission_expunged_disks( @@ -612,7 +612,7 @@ fn register_reassign_sagas_step<'a>( ) -> StepHandle { registrar .new_step( - ExecutionStepId::Ensure, + ExecutionStepId::Cleanup, "Reassign sagas", move |_cx| async move { // For any expunged Nexus zones, re-assign in-progress sagas to diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index b8f93bb58f5..358e5232999 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -168,6 +168,7 @@ mod test { use httptest::Expectation; use httptest::matchers::{not, request}; use httptest::responders::status_code; + use itertools::Itertools as _; use nexus_db_model::{ ByteCount, SledBaseboard, SledSystemHardware, SledUpdate, Zpool, }; @@ -178,7 +179,7 @@ mod test { use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::execution::{ EventBuffer, EventReport, ExecutionComponent, ExecutionStepId, - ReconfiguratorExecutionSpec, StepInfo, + StepOutcome, StepStatus, }; use nexus_types::deployment::{ Blueprint, BlueprintDatasetsConfig, BlueprintPhysicalDisksConfig, @@ -196,13 +197,12 @@ mod test { use omicron_uuid_kinds::PhysicalDiskUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::ZpoolUuid; - use serde::Deserialize; use serde_json::json; use std::collections::BTreeMap; use std::net::SocketAddr; use std::sync::Arc; use tokio::sync::watch; - use update_engine::{NestedError, TerminalKind}; + use update_engine::{CompletionReason, NestedError, TerminalKind}; use uuid::Uuid; type ControlPlaneTestContext = @@ -538,26 +538,51 @@ mod test { .respond_with(status_code(500)), ); - #[derive(Deserialize)] - struct ErrorResult { - execution_error: NestedError, - } - let mut value = task.activate(&opctx).await; let event_buffer = 
extract_event_buffer(&mut value); - println!("after failure: {:?}", value); - let result: ErrorResult = serde_json::from_value(value).unwrap(); - assert_eq!( - result.execution_error.message(), - "step failed: Deploy Omicron zones" - ); + let ensure_zones_outcome = { + let ensure_id = + serde_json::to_value(ExecutionStepId::Ensure).unwrap(); + let zones_component = + serde_json::to_value(ExecutionComponent::OmicronZones).unwrap(); + match event_buffer + .iter_steps_recursive() + .filter_map(|(_, step)| { + let info = step.step_info(); + if info.id == ensure_id && info.component == zones_component + { + match step.step_status() { + StepStatus::Completed { + reason: CompletionReason::StepCompleted(info), + } => Some(&info.outcome), + status => { + panic!("unexpected step status: {status:?}") + } + } + } else { + None + } + }) + .exactly_one() + { + Ok(outcome) => outcome, + Err(outcomes) => panic!( + "expected exactly one step outcome, but found {}", + outcomes.count() + ), + } + }; - assert_event_buffer_failed_at( - &event_buffer, - ExecutionComponent::OmicronZones, - ExecutionStepId::Ensure, - ); + match ensure_zones_outcome { + StepOutcome::Warning { message, .. } => { + assert!( + message.contains("Failed to put OmicronZonesConfig"), + "unexpected warning message: {message}" + ); + } + other => panic!("unexpected zones outcome: {other:?}"), + } s1.verify_and_clear(); s2.verify_and_clear(); @@ -597,38 +622,4 @@ mod test { "execution should have completed successfully" ); } - - fn assert_event_buffer_failed_at( - event_buffer: &EventBuffer, - component: ExecutionComponent, - step_id: ExecutionStepId, - ) { - let execution_status = event_buffer - .root_execution_summary() - .expect("event buffer has root execution summary") - .execution_status; - let terminal_info = - execution_status.terminal_info().unwrap_or_else(|| { - panic!( - "execution status has terminal info: {:?}", - execution_status - ); - }); - assert_eq!( - terminal_info.kind, - TerminalKind::Failed, - "execution should have failed" - ); - let step = - event_buffer.get(&terminal_info.step_key).expect("step exists"); - let step_info = StepInfo::::from_generic( - step.step_info().clone(), - ) - .expect("step info follows ReconfiguratorExecutionSpec"); - assert_eq!( - (step_info.component, step_info.id), - (component, step_id), - "component and step id matches expected" - ); - } } diff --git a/nexus/types/src/deployment/execution/spec.rs b/nexus/types/src/deployment/execution/spec.rs index f6264d4deee..1b5ac356836 100644 --- a/nexus/types/src/deployment/execution/spec.rs +++ b/nexus/types/src/deployment/execution/spec.rs @@ -47,11 +47,9 @@ pub enum ExecutionComponent { pub enum ExecutionStepId { /// Fetch information that will be used in subsequent steps. Fetch, - Add, - Remove, + /// Perform cleanup actions on removed items. + Cleanup, /// Idempotent "ensure" or "deploy" step that delegates removes and adds to /// other parts of the system. Ensure, - /// Finalize the blueprint and check for errors at the end of execution. - Finalize, }
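Patches 04 through 07 settle on a single error-handling convention for execution steps: map a step failure into a step warning so later, independent steps still run, then drop the final error-collecting step entirely. A rough sketch of that convention, using hand-rolled stand-ins rather than the real update-engine StepSuccess/StepWarning/ReconfiguratorExecutionSpec types:

use std::fmt::Display;

// Hand-rolled stand-ins for the update-engine step-result types; the real
// machinery is richer and carries typed step metadata.
#[derive(Debug, PartialEq)]
enum StepOutcome {
    Success,
    Warning(String),
}

// Analogue of map_err_to_step_warning: a failed step is downgraded to a
// warning instead of aborting the whole execution.
fn map_err_to_warning<E: Display>(res: Result<(), E>) -> StepOutcome {
    match res {
        Ok(()) => StepOutcome::Success,
        Err(e) => StepOutcome::Warning(e.to_string()),
    }
}

// Every step runs regardless of earlier failures; warnings are carried in
// the per-step report rather than stopping the engine.
fn run_steps(steps: Vec<(&str, Result<(), String>)>) -> Vec<(String, StepOutcome)> {
    steps
        .into_iter()
        .map(|(name, res)| (name.to_string(), map_err_to_warning(res)))
        .collect()
}

fn main() {
    let report = run_steps(vec![
        (
            "Deploy Omicron zones",
            Err("Failed to put OmicronZonesConfig".to_string()),
        ),
        ("Cleanup expunged zones", Ok(())),
    ]);

    // The zone-deployment failure shows up as a warning on its own step, and
    // the later cleanup step still executed, which is the behavior the
    // reworked test in patch 11 asserts against the event buffer.
    assert!(matches!(report[0].1, StepOutcome::Warning(_)));
    assert_eq!(report[1].1, StepOutcome::Success);
}

That convention is also what lets patch 07 delete the finalize step: with warnings attached to the steps that produced them, there is no separate error list left to merge at the end, and patch 11 updates the test to look for a per-step warning instead of a failed execution.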