Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ use clap::builder::PossibleValuesParser;
use clap::builder::TypedValueParser;
use db_metadata::DbMetadataArgs;
use db_metadata::DbMetadataCommands;
use db_metadata::cmd_db_metadata_force_nexus_quiesce;
use db_metadata::cmd_db_metadata_list_nexus;
use diesel::BoolExpressionMethods;
use diesel::ExpressionMethods;
Expand Down Expand Up @@ -1148,6 +1149,12 @@ impl DbArgs {
}) => {
cmd_db_metadata_list_nexus(&opctx, &datastore).await
}
DbCommands::DbMetadata(DbMetadataArgs {
command: DbMetadataCommands::ForceNexusQuiesce(args),
}) => {
let token = omdb.check_allow_destructive()?;
cmd_db_metadata_force_nexus_quiesce(&opctx, &datastore, args, token).await
}
DbCommands::CrucibleDataset(CrucibleDatasetArgs {
command: CrucibleDatasetCommands::List,
}) => {
Expand Down
72 changes: 72 additions & 0 deletions dev-tools/omdb/src/bin/omdb/db/db_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
//! `omdb db db_metadata` subcommands

use super::display_option_blank;

use crate::check_allow_destructive::DestructiveOperationToken;
use anyhow::Context;
use anyhow::bail;
use clap::ArgAction;
use clap::Args;
use clap::Subcommand;
use nexus_db_model::DbMetadataNexusState;
Expand All @@ -27,8 +31,37 @@ pub struct DbMetadataArgs {

#[derive(Debug, Subcommand, Clone)]
pub enum DbMetadataCommands {
/// Lists the `db_metadata_nexus` records for all Nexuses.
#[clap(alias = "ls-nexus")]
ListNexus,

/// !!! DANGEROUS !!! Updates a `db_metadata_nexus` record to 'Quiesced'
///
/// THIS OPERATION IS DANGEROUS. It is the responsibility of the caller
/// to ensure that the specified Nexus zone is not running.
///
/// If the Nexus being updated is actually running, this operation
/// may cause arbitrary data corruption, as it can allow multiple Nexuses
/// at distinct database versions to inadvertently be running concurrently.
///
/// This operation is intended to assist in the explicit case where a Nexus
/// is unable to finish marking itself quiesced during the handoff process,
/// and cannot be expunged.
ForceNexusQuiesce(ForceNexusQuiesceArgs),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Take it or leave it:

Suggested change
ForceNexusQuiesce(ForceNexusQuiesceArgs),
ForceMarkNexusQuiesced(ForceMarkNexusQuiescedArgs),

The idea is to convey that you're marking Nexus quiesced, whether it is or not -- and not that you're actually quiescing anything.

}

#[derive(Debug, Args, Clone)]
pub struct ForceNexusQuiesceArgs {
/// The UUID of the Nexus zone to be marked quiesced
id: OmicronZoneUuid,

/// If "true": don't bother parsing the target blueprint to identify the
/// validity of the [`id`] argument.
///
/// Forcing Nexus to quiesce is already an unsafe operation; this makes
/// it even less safe. Use with caution.
#[arg(long, action=ArgAction::SetTrue)]
ignore_target_blueprint: bool,
Comment on lines +58 to +64
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big deal, but it's not super obvious what the implications of this are. Maybe:

Suggested change
/// If "true": don't bother parsing the target blueprint to identify the
/// validity of the [`id`] argument.
///
/// Forcing Nexus to quiesce is already an unsafe operation; this makes
/// it even less safe. Use with caution.
#[arg(long, action=ArgAction::SetTrue)]
ignore_target_blueprint: bool,
/// Skip checking the target blueprint to determine whether Nexus zone `id` is from the generation of Nexus zones that could be active or handing off.
///
/// Forcing Nexus to quiesce is already an unsafe operation; this makes
/// it even less safe. Use with caution.
#[arg(long, action=ArgAction::SetTrue)]
skip_zone_expunged_check: bool,

}

// DB Metadata
Expand Down Expand Up @@ -152,3 +185,42 @@ pub async fn cmd_db_metadata_list_nexus(

Ok(())
}

pub async fn cmd_db_metadata_force_nexus_quiesce(
opctx: &OpContext,
datastore: &DataStore,
args: &ForceNexusQuiesceArgs,
_destruction_token: DestructiveOperationToken,
) -> Result<(), anyhow::Error> {
if !args.ignore_target_blueprint {
let (_, current_target_blueprint) = datastore
.blueprint_target_get_current_full(opctx)
.await
.context("loading current target blueprint")?;
let nexus_generation = current_target_blueprint
.all_nexus_zones(BlueprintZoneDisposition::is_in_service)
.find_map(|(_, zone, nexus_zone)| {
if zone.id == args.id {
Some(nexus_zone.nexus_generation)
} else {
None
}
});

let Some(gen) = nexus_generation else {
bail!("Nexus {} not found in blueprint", args.id);
};
let bp_gen = current_target_blueprint.nexus_generation;
if bp_gen <= gen {
bail!(
"Nexus {} not ready to quiesce (nexus generation {gen} >= blueprint gen {bp_gen})",
args.id
);
}
}

datastore.database_nexus_access_update_quiesced(args.id).await?;
println!("Quiesced {}", args.id);

Ok(())
}
20 changes: 20 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (<redacted database version>)
=============================================
EXECUTING COMMAND: omdb ["--destructive", "db", "db-metadata", "force-nexus-quiesce", "..........<REDACTED_UUID>..........."]
termination: Exited(1)
---------------------------------------------
stdout:
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (<redacted database version>)
Error: Nexus ..........<REDACTED_UUID>........... not ready to quiesce (nexus generation 1 >= blueprint gen 1)
=============================================
EXECUTING COMMAND: omdb ["db", "disks", "list"]
termination: Exited(0)
---------------------------------------------
Expand Down Expand Up @@ -1756,3 +1766,13 @@ note: database schema version matches expected (<redacted database version>)
assembling reconfigurator state ... done
wrote <TMP_PATH_REDACTED>
=============================================
EXECUTING COMMAND: omdb ["--destructive", "db", "db-metadata", "force-nexus-quiesce", "--ignore-target-blueprint", "..........<REDACTED_UUID>..........."]
termination: Exited(0)
---------------------------------------------
stdout:
Quiesced ..........<REDACTED_UUID>...........
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
note: database schema version matches expected (<redacted database version>)
=============================================
27 changes: 27 additions & 0 deletions dev-tools/omdb/tests/test_all_output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,19 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {

let invocations: &[&[&str]] = &[
&["db", "db-metadata", "ls-nexus"],
// We expect this operation to fail (the nexus generation is the same
// as the one in the target blueprint - it shouldn't be trying to
// quiesce yet).
//
// We test a version of this command which sets this record to quiesced
// anyway as the final invocation.
&[
"--destructive",
"db",
"db-metadata",
"force-nexus-quiesce",
&cptestctx.server.server_context().nexus.id().to_string(),
],
&["db", "disks", "list"],
&["db", "dns", "show"],
&["db", "dns", "diff", "external", "2"],
Expand Down Expand Up @@ -274,6 +287,20 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
// We can't easily test the sled agent output because that's only
// provided by a real sled agent, which is not available in the
// ControlPlaneTestContext.

// This operation will set the "db_metadata_nexus" state to quiesced.
//
// This would normally only be set by a Nexus as it shuts itself down;
// save it for last to avoid causing a weird state while testing other
// commands.
&[
"--destructive",
"db",
"db-metadata",
"force-nexus-quiesce",
"--ignore-target-blueprint",
&cptestctx.server.server_context().nexus.id().to_string(),
],
];

let mut redactor = Redactor::default();
Expand Down
Loading