60 changes: 36 additions & 24 deletions nexus/reconfigurator/execution/src/lib.rs
@@ -20,6 +20,7 @@ use omicron_physical_disks::DeployDisksDone;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::OmicronZoneUuid;
use omicron_uuid_kinds::SledUuid;
use omicron_zones::DeployZonesDone;
use slog::info;
use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
@@ -102,14 +103,6 @@ pub async fn realize_blueprint_with_overrides(
blueprint,
);

register_support_bundle_failure_step(
&engine.for_component(ExecutionComponent::SupportBundles),
&opctx,
datastore,
blueprint,
nexus_id,
);

let sled_list = register_sled_list_step(
&engine.for_component(ExecutionComponent::SledList),
&opctx,
@@ -131,7 +124,7 @@
sled_list.clone(),
);

register_deploy_zones_step(
let deploy_zones_done = register_deploy_zones_step(
&engine.for_component(ExecutionComponent::OmicronZones),
&opctx,
blueprint,
@@ -154,12 +147,13 @@
sled_list.clone(),
);

register_cleanup_expunged_zones_step(
let deploy_zones_done = register_cleanup_expunged_zones_step(
&engine.for_component(ExecutionComponent::OmicronZones),
&opctx,
datastore,
resolver,
blueprint,
deploy_zones_done,
);

register_decommission_sleds_step(
@@ -188,12 +182,22 @@
blueprint,
);

let deploy_zones_done = register_support_bundle_failure_step(
[Collaborator comment: This fixes #7529]

&engine.for_component(ExecutionComponent::SupportBundles),
&opctx,
datastore,
blueprint,
nexus_id,
deploy_zones_done,
);

let reassign_saga_output = register_reassign_sagas_step(
&engine.for_component(ExecutionComponent::OmicronZones),
&opctx,
datastore,
blueprint,
nexus_id,
deploy_zones_done,
);

let register_cockroach_output = register_cockroachdb_settings_step(
@@ -249,23 +253,25 @@ fn register_support_bundle_failure_step<'a>(
datastore: &'a DataStore,
blueprint: &'a Blueprint,
nexus_id: OmicronZoneUuid,
) {
deploy_zones_done: StepHandle<DeployZonesDone>,
) -> StepHandle<DeployZonesDone> {
registrar
.new_step(
ExecutionStepId::Ensure,
"Mark support bundles as failed if they rely on an expunged disk or sled",
move |_cx| async move {
move |cx| async move {
let done = deploy_zones_done.into_value(cx.token()).await;
datastore
.support_bundle_fail_expunged(
&opctx, blueprint, nexus_id
)
.await
.map_err(|err| anyhow!(err))?;

StepSuccess::new(()).into()
StepSuccess::new(done).into()
},
)
.register();
.register()
}

fn register_sled_list_step<'a>(
@@ -354,25 +360,25 @@ fn register_deploy_zones_step<'a>(
opctx: &'a OpContext,
blueprint: &'a Blueprint,
sleds: SharedStepHandle<Arc<BTreeMap<SledUuid, Sled>>>,
) {
) -> StepHandle<DeployZonesDone> {
registrar
.new_step(
ExecutionStepId::Ensure,
"Deploy Omicron zones",
move |cx| async move {
let sleds_by_id = sleds.into_value(cx.token()).await;
omicron_zones::deploy_zones(
let done = omicron_zones::deploy_zones(
&opctx,
&sleds_by_id,
&blueprint.blueprint_zones,
)
.await
.map_err(merge_anyhow_list)?;

StepSuccess::new(()).into()
StepSuccess::new(done).into()
},
)
.register();
.register()
}

fn register_plumb_firewall_rules_step<'a>(
@@ -447,25 +453,28 @@ fn register_cleanup_expunged_zones_step<'a>(
datastore: &'a DataStore,
resolver: &'a Resolver,
blueprint: &'a Blueprint,
) {
deploy_zones_done: StepHandle<DeployZonesDone>,
) -> StepHandle<DeployZonesDone> {
registrar
.new_step(
ExecutionStepId::Remove,
"Cleanup expunged zones",
move |_cx| async move {
move |cx| async move {
let done = deploy_zones_done.into_value(cx.token()).await;
omicron_zones::clean_up_expunged_zones(
&opctx,
datastore,
resolver,
blueprint.all_omicron_zones(BlueprintZoneFilter::Expunged),
&done,
)
.await
.map_err(merge_anyhow_list)?;

StepSuccess::new(()).into()
StepSuccess::new(done).into()
},
)
.register();
.register()
}

fn register_decommission_sleds_step<'a>(
@@ -591,6 +600,7 @@ fn register_reassign_sagas_step<'a>(
datastore: &'a DataStore,
blueprint: &'a Blueprint,
nexus_id: OmicronZoneUuid,
deploy_zones_done: StepHandle<DeployZonesDone>,
) -> StepHandle<ReassignSagaOutput> {
// For this and subsequent steps, we'll assume that any errors that we
// encounter do *not* require stopping execution. We'll just accumulate
@@ -601,13 +611,15 @@
.new_step(
ExecutionStepId::Ensure,
"Reassign sagas",
move |_cx| async move {
move |cx| async move {
let done = deploy_zones_done.into_value(cx.token()).await;

// For any expunged Nexus zones, re-assign in-progress sagas to
// some other Nexus. If this fails for some reason, it doesn't
// affect anything else.
let sec_id = nexus_db_model::SecId::from(nexus_id);
let reassigned = sagas::reassign_sagas_from_expunged(
&opctx, datastore, blueprint, sec_id,
&opctx, datastore, blueprint, sec_id, &done,
)
.await
.context("failed to re-assign sagas");
26 changes: 20 additions & 6 deletions nexus/reconfigurator/execution/src/omicron_zones.rs
@@ -33,13 +33,18 @@ use std::collections::BTreeMap;
use std::net::SocketAddr;
use std::net::SocketAddrV6;

/// Typestate indicating that the deploy zones step was performed.
#[derive(Debug)]
#[must_use = "token indicating completion of deploy_zones"]
pub(crate) struct DeployZonesDone(());

/// Idempotently ensure that the specified Omicron zones are deployed to the
/// corresponding sleds
pub(crate) async fn deploy_zones(
opctx: &OpContext,
sleds_by_id: &BTreeMap<SledUuid, Sled>,
zones: &BTreeMap<SledUuid, BlueprintZonesConfig>,
) -> Result<(), Vec<anyhow::Error>> {
) -> Result<DeployZonesDone, Vec<anyhow::Error>> {
let errors: Vec<_> = stream::iter(zones)
.filter_map(|(sled_id, config)| async move {
let db_sled = match sleds_by_id.get(sled_id) {
@@ -94,7 +99,7 @@ pub(crate) async fn deploy_zones(
.await;

if errors.is_empty() {
Ok(())
Ok(DeployZonesDone(()))
} else {
Err(errors)
}
@@ -106,6 +111,7 @@ pub(crate) async fn clean_up_expunged_zones<R: CleanupResolver>(
datastore: &DataStore,
resolver: &R,
expunged_zones: impl Iterator<Item = (SledUuid, &BlueprintZoneConfig)>,
_deploy_zones_done: &DeployZonesDone,
) -> Result<(), Vec<anyhow::Error>> {
let errors: Vec<anyhow::Error> = stream::iter(expunged_zones)
.filter_map(|(sled_id, config)| async move {
@@ -429,7 +435,7 @@ mod test {
// Get a success result back when the blueprint has an empty set of
// zones.
let (_, blueprint) = create_blueprint(BTreeMap::new());
deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
_ = deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
.await
.expect("failed to deploy no zones");

@@ -487,7 +493,7 @@
}

// Execute it.
deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
_ = deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
.await
.expect("failed to deploy initial zones");

@@ -504,7 +510,7 @@
.respond_with(status_code(204)),
);
}
deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
_ = deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
.await
.expect("failed to deploy same zones");
s1.verify_and_clear();
@@ -590,7 +596,7 @@
}

// Activate the task
deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
_ = deploy_zones(&opctx, &sleds_by_id, &blueprint.blueprint_zones)
.await
.expect("failed to deploy last round of zones");
s1.verify_and_clear();
@@ -641,6 +647,10 @@
}
let fake_resolver = FixedResolver(vec![mock_admin.addr()]);

// This is a unit test, so pretend we already successfully called
// deploy_zones.
let deploy_zones_done = DeployZonesDone(());

// We haven't yet inserted a mapping from zone ID to cockroach node ID
// in the db, so trying to clean up the zone should log a warning but
// otherwise succeed, without attempting to contact our mock admin
@@ -651,6 +661,7 @@
datastore,
&fake_resolver,
iter::once((any_sled_id, &crdb_zone)),
&deploy_zones_done,
)
.await
.expect("unknown node ID: no cleanup");
@@ -697,6 +708,7 @@
datastore,
&fake_resolver,
iter::once((any_sled_id, &crdb_zone)),
&deploy_zones_done,
)
.await
.expect("decommissioned test node");
@@ -728,6 +740,7 @@
datastore,
&fake_resolver,
iter::once((any_sled_id, &crdb_zone)),
&deploy_zones_done,
)
.await
.expect_err("no successful response should result in failure");
@@ -756,6 +769,7 @@
datastore,
&fake_resolver,
iter::once((any_sled_id, &crdb_zone)),
&deploy_zones_done,
)
.await
.expect("decommissioned test node");
2 changes: 2 additions & 0 deletions nexus/reconfigurator/execution/src/sagas.rs
@@ -4,6 +4,7 @@

//! Re-assign sagas from expunged Nexus zones

use crate::omicron_zones::DeployZonesDone;
use nexus_db_model::SecId;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::DataStore;
@@ -20,6 +21,7 @@ pub(crate) async fn reassign_sagas_from_expunged(
datastore: &DataStore,
blueprint: &Blueprint,
nexus_id: SecId,
_deploy_zones_done: &DeployZonesDone,
) -> Result<bool, Error> {
let log = &opctx.log;

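
Taken together, the three diffs above thread a `DeployZonesDone` token from `deploy_zones` through the step registrations in `lib.rs` and into `clean_up_expunged_zones`, `reassign_sagas_from_expunged`, and the support-bundle step, so code that must not run before zone deployment cannot be called without proof that deployment happened. Below is a minimal standalone sketch of that typestate-witness pattern; the function names and bodies are placeholders for illustration only, not the omicron implementation (which threads the token through the update engine's `StepHandle`s):

// Minimal sketch of the typestate-witness pattern (placeholder code, not the
// omicron implementation).

/// Zero-sized witness that zone deployment has completed. The private field
/// means only the deploying code can construct one.
#[must_use = "token indicating completion of deploy_zones"]
#[derive(Debug)]
struct DeployZonesDone(());

/// Stand-in for the real deploy step: do the work, then hand back the token.
fn deploy_zones() -> Result<DeployZonesDone, String> {
    // ... push zone configs to sled-agents here ...
    Ok(DeployZonesDone(()))
}

/// Stand-in for a step that must only run after deployment. Requiring the
/// token makes it impossible to call this before deploy_zones has succeeded.
fn clean_up_expunged_zones(_done: &DeployZonesDone) -> Result<(), String> {
    // ... remove expunged zones here ...
    Ok(())
}

fn main() -> Result<(), String> {
    let done = deploy_zones()?;       // must obtain the witness first
    clean_up_expunged_zones(&done)?;  // ...before cleanup will even compile
    Ok(())
}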