From 29d20936a2c515f94177ab3a6a83302f0c957c8f Mon Sep 17 00:00:00 2001
From: David Pacheco <dap@oxidecomputer.com>
Date: Wed, 28 May 2025 11:43:29 -0700
Subject: [PATCH 1/5] initial draft: planning SP, some pieces missing

---
 Cargo.lock                                    |    2 +
 nexus/reconfigurator/planning/Cargo.toml      |    2 +
 nexus/reconfigurator/planning/src/lib.rs      |    1 +
 .../planning/src/mgs_updates/mod.rs           | 1197 +++++++++++++++++
 nexus/types/src/deployment.rs                 |    1 -
 5 files changed, 1202 insertions(+), 1 deletion(-)
 create mode 100644 nexus/reconfigurator/planning/src/mgs_updates/mod.rs
diff --git a/Cargo.lock b/Cargo.lock
index 4e7338cd2fc..4bbbb4baac5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6466,6 +6466,7 @@ dependencies = [
  "clickhouse-admin-types",
  "daft",
  "debug-ignore",
+ "dropshot",
  "expectorate",
  "gateway-client",
  "id-map",
@@ -6488,6 +6489,7 @@ dependencies = [
  "oxnet",
  "proptest",
  "rand 0.8.5",
+ "semver 1.0.26",
  "sled-agent-client",
  "slog",
  "slog-error-chain",
diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml
index 695e65eb584..d45ffec496c 100644
--- a/nexus/reconfigurator/planning/Cargo.toml
+++ b/nexus/reconfigurator/planning/Cargo.toml
@@ -42,9 +42,11 @@ uuid.workspace = true
 omicron-workspace-hack.workspace = true
 
 [dev-dependencies]
+dropshot.workspace = true
 expectorate.workspace = true
 maplit.workspace = true
 omicron-common = { workspace = true, features = ["testing"] }
 omicron-test-utils.workspace = true
 proptest.workspace = true
+semver.workspace = true
 test-strategy.workspace = true
diff --git a/nexus/reconfigurator/planning/src/lib.rs b/nexus/reconfigurator/planning/src/lib.rs
index 8262380075c..a11c5e4132c 100644
--- a/nexus/reconfigurator/planning/src/lib.rs
+++ b/nexus/reconfigurator/planning/src/lib.rs
@@ -9,5 +9,6 @@
 pub mod blueprint_builder;
 pub mod blueprint_editor;
 pub mod example;
+pub mod mgs_updates;
 pub mod planner;
 pub mod system;
diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
new file mode 100644
index 00000000000..241b05f7e6c
--- /dev/null
+++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
@@ -0,0 +1,1197 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Facilities for making choices about MGS-managed updates
+
+use nexus_types::deployment::ExpectedVersion;
+use nexus_types::deployment::PendingMgsUpdate;
+use nexus_types::deployment::PendingMgsUpdateDetails;
+use nexus_types::deployment::PendingMgsUpdates;
+use nexus_types::inventory::BaseboardId;
+use nexus_types::inventory::CabooseWhich;
+use nexus_types::inventory::Collection;
+use omicron_common::api::external::TufRepoDescription;
+use slog::{debug, error, info, warn};
+use slog_error_chain::InlineErrorChain;
+use std::collections::BTreeSet;
+use std::sync::Arc;
+use thiserror::Error;
+use tufaceous_artifact::ArtifactVersion;
+use tufaceous_artifact::KnownArtifactKind;
+
+pub fn plan_mgs_updates(
+    log: &slog::Logger,
+    inventory: &Collection,
+    current_boards: &BTreeSet<Arc<BaseboardId>>,
+    current_updates: &PendingMgsUpdates,
+    current_artifacts: Option<&TufRepoDescription>,
+    nmax_updates: usize,
+) -> PendingMgsUpdates {
+    let mut rv = PendingMgsUpdates::new();
+    let mut boards_preferred = BTreeSet::new();
+
+    // Determine the status of all currently pending updates by comparing what
+    // they were trying to do (and their preconditions) against the current
+    // state (from inventory).
+    //
+    // If a pending update is either done or impossible, we'll prioritize
+    // evaluating the same board for any further updates.  For the "done" case,
+    // this will cause us to update one board's SP, RoT, etc. before moving onto
+    // another board.  For the "impossible" case, this should just fix up the
+    // update request with updated preconditions so that it can complete.
+    //
+    // If a pending update is in-progress or if we cannot determine its status
+    // because inventory is incomplete, then we'll "keep" it (that is, we copy
+    // the same update into the `PendingMgsUpdates` that we're returning).
+    for update in current_updates {
+        match mgs_update_status(log, inventory, update) {
+            Ok(MgsUpdateStatus::Done) => {
+                info!(
+                    log,
+                    "SP update completed \
+                     (will remove it and re-evaluate board)";
+                    update
+                );
+                boards_preferred.insert(update.baseboard_id.clone());
+            }
+            Ok(MgsUpdateStatus::Impossible) => {
+                info!(
+                    log,
+                    "SP update impossible
+                     (will remove it and re-evaluate board)";
+                    update
+                );
+                boards_preferred.insert(update.baseboard_id.clone());
+            }
+            Ok(MgsUpdateStatus::NotDone) => {
+                info!(
+                    log,
+                    "SP update not yet completed (will keep it)";
+                    update
+                );
+                rv.insert(update.clone());
+            }
+            Err(error) => {
+                info!(
+                    log,
+                    "cannot determine SP update status (will keep it)";
+                    update,
+                    InlineErrorChain::new(&error)
+                );
+                rv.insert(update.clone());
+            }
+        }
+    }
+
+    // If we don't have any current artifacts (i.e., there is no target release
+    // set), then we cannot configure more updates.
+    let Some(current_artifacts) = &current_artifacts else {
+        warn!(log, "cannot issue more SP updates (no current artifacts)");
+        return rv;
+    };
+
+    // Next, configure new updates for any boards that need an update, up to
+    // `nmax_updates`.
+    //
+    // For the reasons mentioned above, we'll start with the boards that just
+    // had an in-progress update that we elected not to keep.  Then we'll look
+    // at all the other boards.  Note that using `extend` here will cause the
+    // boards that we're prioritizing to appear twice in this list.
+    let non_preferred =
+        current_boards.iter().filter(|b| !boards_preferred.contains(*b));
+    let candidates = boards_preferred.iter().chain(non_preferred);
+    for board in candidates {
+        if rv.len() == nmax_updates {
+            info!(
+                log,
+                "reached maximum number of pending SP updates";
+                "max" => nmax_updates
+            );
+            return rv;
+        }
+
+        match try_make_update(log, board, inventory, current_artifacts) {
+            Some(update) => {
+                info!(log, "configuring SP update"; &update);
+                rv.insert(update);
+            }
+            None => {
+                info!(log, "skipping board for SP update"; board);
+            }
+        }
+    }
+
+    info!(log, "ran out of boards for SP update");
+    rv
+}
+
+#[derive(Debug)]
+enum MgsUpdateStatus {
+    /// the requested update has completed (i.e., the active slot is running the
+    /// requested version)
+    Done,
+    /// the requested update has not completed, but remains possible
+    /// (i.e., the active slot is not running the requested version and the
+    /// preconditions remain true)
+    NotDone,
+    /// the requested update has not completed and the preconditions are not
+    /// currently met
+    Impossible,
+}
+
+#[derive(Debug, Error)]
+enum MgsUpdateStatusError {
+    #[error("no SP info found in inventory")]
+    MissingSpInfo,
+    #[error("no caboose found for active slot in inventory")]
+    MissingActiveCaboose,
+    #[error("not yet implemented")]
+    NotYetImplemented,
+}
+
+/// Determine the status of a single MGS update based on what's in inventory for
+/// that board.
+fn mgs_update_status(
+    log: &slog::Logger,
+    inventory: &Collection,
+    update: &PendingMgsUpdate,
+) -> Result<MgsUpdateStatus, MgsUpdateStatusError> {
+    let baseboard_id = &update.baseboard_id;
+    let desired_version = &update.artifact_version;
+
+    // Check the contents of the cabooses against what we expect either before
+    // or after the update.
+    //
+    // We check this before anything else because if we get back
+    // `MgsUpdateStatus::Done`, then we're done no matter what else is true.
+    let caboose_status = match &update.details {
+        PendingMgsUpdateDetails::Sp {
+            expected_active_version,
+            expected_inactive_version,
+        } => {
+            let Some(active_caboose) =
+                inventory.caboose_for(CabooseWhich::SpSlot0, baseboard_id)
+            else {
+                return Err(MgsUpdateStatusError::MissingActiveCaboose);
+            };
+
+            let found_inactive_version = inventory
+                .caboose_for(CabooseWhich::SpSlot1, baseboard_id)
+                .map(|c| c.caboose.version.as_ref());
+
+            Ok(mgs_update_status_sp(
+                desired_version,
+                expected_active_version,
+                expected_inactive_version,
+                &active_caboose.caboose.version,
+                found_inactive_version,
+            ))
+        }
+        PendingMgsUpdateDetails::Rot { .. } => {
+            return Err(MgsUpdateStatusError::NotYetImplemented);
+        }
+    };
+
+    // If we're able to reach a clear determination based on the caboose status
+    // alone, great.  Return that.
+    if matches!(
+        caboose_status,
+        Err(_) | Ok(MgsUpdateStatus::Done) | Ok(MgsUpdateStatus::Impossible)
+    ) {
+        return caboose_status;
+    }
+
+    // If based on the caboose we're only able to determine that the update is
+    // not yet done, there's another "impossible" case to consider: that the
+    // baseboard has moved in the rack.
+    let sp_info = inventory
+        .sps
+        .get(baseboard_id)
+        .ok_or(MgsUpdateStatusError::MissingSpInfo)?;
+    if sp_info.sp_type != update.sp_type {
+        // This should be impossible.  This same baseboard has somehow changed
+        // its type (e.g., sled vs. switch vs. PSC).  This doesn't affect what
+        // we do here but definitely raises a red flag.
+        error!(
+            log,
+            "baseboard appears to have changed board type";
+            "sp_info" => #?sp_info,
+            update,
+        );
+        Ok(MgsUpdateStatus::Impossible)
+    } else if u32::from(sp_info.sp_slot) != update.slot_id {
+        warn!(
+            log,
+            "baseboard with in-progress SP update has moved";
+            "sp_info" => #?sp_info,
+            update,
+        );
+        Ok(MgsUpdateStatus::Impossible)
+    } else {
+        caboose_status
+    }
+}
+
+fn mgs_update_status_sp(
+    desired_version: &ArtifactVersion,
+    expected_active_version: &ArtifactVersion,
+    expected_inactive_version: &ExpectedVersion,
+    found_active_version: &str,
+    found_inactive_version: Option<&str>,
+) -> MgsUpdateStatus {
+    if found_active_version == desired_version.as_str() {
+        // If we find the desired version in the active slot, we're done.
+        return MgsUpdateStatus::Done;
+    }
+
+    // The update hasn't completed.
+    //
+    // Check to make sure the contents of the active slot are still what they
+    // were when we configured this update.  If not, then this update cannot
+    // proceed as currently configured.  It will fail its precondition check.
+    if found_active_version != expected_active_version.as_str() {
+        return MgsUpdateStatus::Impossible;
+    }
+
+    // Similarly, check the contents of the inactive slot to determine if it
+    // still matches what we saw when we configured this update.  If not, then
+    // this update cannot proceed as currently configured.  It will fail its
+    // precondition check.
+    //
+    // This logic is more complex than for the active slot because unlike the
+    // active slot, it's possible for both the found contents and the expected
+    // contents to be missing and that's not necessarily an error.
+    match (found_inactive_version, expected_inactive_version) {
+        (Some(_), ExpectedVersion::NoValidVersion) => {
+            // We expected nothing in the inactive slot, but found something.
+            MgsUpdateStatus::Impossible
+        }
+        (Some(found), ExpectedVersion::Version(expected)) => {
+            if found == expected.as_str() {
+                // We found something in the inactive slot that matches what we
+                // expected.
+                MgsUpdateStatus::NotDone
+            } else {
+                // We found something in the inactive slot that differs from
+                // what we expected.
+                MgsUpdateStatus::Impossible
+            }
+        }
+        (None, ExpectedVersion::Version(_)) => {
+            // We expected something in the inactive slot, but found nothing.
+            // This case is tricky because we can't tell from the inventory
+            // whether we transiently failed to fetch the caboose for some
+            // reason or whether the caboose is actually garbage.  We choose to
+            // assume that it's actually garbage, which would mean that this
+            // update as-configured is impossible.  This will cause us to
+            // generate a new update that expects garbage in the inactive slot.
+            // If we're right, great.  If we're wrong, then *that* update will
+            // be impossible to complete, but we should fix this again if the
+            // transient error goes away.
+            //
+            // If we instead assumed that this was a transient error, we'd do
+            // nothing here instead.  But if the caboose was really missing,
+            // then we'd get stuck forever waiting for something that would
+            // never happen.
+            MgsUpdateStatus::Impossible
+        }
+        (None, ExpectedVersion::NoValidVersion) => {
+            // We expected nothing in the inactive slot and found nothing there.
+            // No problem!
+            MgsUpdateStatus::NotDone
+        }
+    }
+}
+
+fn try_make_update(
+    log: &slog::Logger,
+    baseboard_id: &Arc<BaseboardId>,
+    inventory: &Collection,
+    current_artifacts: &TufRepoDescription,
+) -> Option<PendingMgsUpdate> {
+    // TODO When we add support for planning RoT, RoT bootloader, and host OS
+    // updates, we'll try these in a hardcoded priority order until any of them
+    // returns `Some`.  For now, we only plan SP updates.
+    try_make_update_sp(log, baseboard_id, inventory, current_artifacts)
+}
+
+fn try_make_update_sp(
+    log: &slog::Logger,
+    baseboard_id: &Arc<BaseboardId>,
+    inventory: &Collection,
+    current_artifacts: &TufRepoDescription,
+) -> Option<PendingMgsUpdate> {
+    let Some(sp_info) = inventory.sps.get(baseboard_id) else {
+        warn!(
+            log,
+            "cannot configure SP update for board \
+             (missing SP info from inventory)";
+            baseboard_id
+        );
+        return None;
+    };
+
+    let Some(active_caboose) =
+        inventory.caboose_for(CabooseWhich::SpSlot0, baseboard_id)
+    else {
+        warn!(
+            log,
+            "cannot configure SP update for board \
+             (missing active caboose from inventory)";
+            baseboard_id,
+        );
+        return None;
+    };
+
+    let Ok(expected_active_version) = active_caboose.caboose.version.parse()
+    else {
+        warn!(
+            log,
+            "cannot configure SP update for board \
+             (cannot parse current active version as an ArtifactVersion)";
+            baseboard_id,
+            "found_version" => &active_caboose.caboose.version,
+        );
+        return None;
+    };
+
+    let board = &active_caboose.caboose.board;
+    let matching_artifacts: Vec<_> = current_artifacts
+        .artifacts
+        .iter()
+        .filter(|a| {
+            // A matching SP artifact will have:
+            //
+            // - "name" matching the board name (found above from caboose)
+            // - "kind" matching one of the known SP kinds
+
+            if a.id.name != *board {
+                return false;
+            }
+
+            match a.id.kind.to_known() {
+                None => false,
+                Some(
+                    KnownArtifactKind::GimletSp
+                    | KnownArtifactKind::PscSp
+                    | KnownArtifactKind::SwitchSp,
+                ) => true,
+                Some(
+                    KnownArtifactKind::GimletRot
+                    | KnownArtifactKind::Host
+                    | KnownArtifactKind::Trampoline
+                    | KnownArtifactKind::ControlPlane
+                    | KnownArtifactKind::Zone
+                    | KnownArtifactKind::PscRot
+                    | KnownArtifactKind::SwitchRot
+                    | KnownArtifactKind::GimletRotBootloader
+                    | KnownArtifactKind::PscRotBootloader
+                    | KnownArtifactKind::SwitchRotBootloader,
+                ) => false,
+            }
+        })
+        .collect();
+    if matching_artifacts.is_empty() {
+        warn!(
+            log,
+            "cannot configure SP update for board (no matching artifact)";
+            baseboard_id,
+        );
+        return None;
+    }
+
+    if matching_artifacts.len() > 1 {
+        // This is noteworthy, but not a problem.  We'll just pick one.
+        warn!(log, "found more than one matching artifact for SP update");
+    }
+
+    let artifact = matching_artifacts[0];
+
+    // If the artifact's version matches what's deployed, then no update is
+    // needed.
+    if artifact.id.version == expected_active_version {
+        debug!(log, "no SP update needed for board"; baseboard_id);
+        return None;
+    }
+
+    // Begin configuring an update.
+    let expected_inactive_version = match inventory
+        .caboose_for(CabooseWhich::SpSlot1, baseboard_id)
+        .map(|c| c.caboose.version.parse::<ArtifactVersion>())
+        .transpose()
+    {
+        Ok(None) => ExpectedVersion::NoValidVersion,
+        Ok(Some(v)) => ExpectedVersion::Version(v),
+        Err(_) => {
+            warn!(
+                log,
+                "cannot configure SP update for board
+                 (found inactive slot contents but version was not valid)";
+                baseboard_id
+            );
+            return None;
+        }
+    };
+
+    Some(PendingMgsUpdate {
+        baseboard_id: baseboard_id.clone(),
+        sp_type: sp_info.sp_type,
+        slot_id: u32::from(sp_info.sp_slot),
+        details: PendingMgsUpdateDetails::Sp {
+            expected_active_version,
+            expected_inactive_version,
+        },
+        artifact_hash: artifact.hash,
+        artifact_version: artifact.id.version.clone(),
+    })
+}
+
+#[cfg(test)]
+mod test {
+    use crate::mgs_updates::plan_mgs_updates;
+    use chrono::Utc;
+    use dropshot::ConfigLogging;
+    use dropshot::ConfigLoggingLevel;
+    use gateway_client::types::PowerState;
+    use gateway_client::types::RotState;
+    use gateway_client::types::SpComponentCaboose;
+    use gateway_client::types::SpState;
+    use gateway_client::types::SpType;
+    use nexus_types::deployment::ExpectedVersion;
+    use nexus_types::deployment::PendingMgsUpdate;
+    use nexus_types::deployment::PendingMgsUpdateDetails;
+    use nexus_types::deployment::PendingMgsUpdates;
+    use nexus_types::inventory::CabooseWhich;
+    use nexus_types::inventory::Collection;
+    use nexus_types::inventory::RotSlot;
+    use omicron_common::api::external::TufArtifactMeta;
+    use omicron_common::api::external::TufRepoDescription;
+    use omicron_common::api::external::TufRepoMeta;
+    use omicron_common::update::ArtifactId;
+    use omicron_test_utils::dev::LogContext;
+    use std::collections::BTreeMap;
+    use std::collections::BTreeSet;
+    use tufaceous_artifact::ArtifactHash;
+    use tufaceous_artifact::ArtifactVersion;
+    use tufaceous_artifact::KnownArtifactKind;
+
+    /// Version that will be used for all artifacts in the TUF repo
+    const ARTIFACT_VERSION_2: ArtifactVersion =
+        ArtifactVersion::new_const("2.0.0");
+    /// Version that will be "deployed" in the SP we want to update
+    const ARTIFACT_VERSION_1: ArtifactVersion =
+        ArtifactVersion::new_const("1.0.0");
+    /// Version that's different from the other two
+    const ARTIFACT_VERSION_1_5: ArtifactVersion =
+        ArtifactVersion::new_const("1.5.0");
+
+    /// Hash of fake artifact for fake gimlet-e SP
+    const ARTIFACT_HASH_SP_GIMLET_E: ArtifactHash = ArtifactHash([1; 32]);
+    /// Hash of fake artifact for fake gimlet-d SP
+    const ARTIFACT_HASH_SP_GIMLET_D: ArtifactHash = ArtifactHash([2; 32]);
+    /// Hash of fake artifact for fake sidecar-b SP
+    const ARTIFACT_HASH_SP_SIDECAR_B: ArtifactHash = ArtifactHash([5; 32]);
+    /// Hash of fake artifact for fake sidecar-c SP
+    const ARTIFACT_HASH_SP_SIDECAR_C: ArtifactHash = ArtifactHash([6; 32]);
+    /// Hash of fake artifact for fake psc-b SP
+    const ARTIFACT_HASH_SP_PSC_B: ArtifactHash = ArtifactHash([9; 32]);
+    /// Hash of fake artifact for fake psc-c SP
+    const ARTIFACT_HASH_SP_PSC_C: ArtifactHash = ArtifactHash([10; 32]);
+
+    // unused artifact hashes
+
+    const ARTIFACT_HASH_CONTROL_PLANE: ArtifactHash = ArtifactHash([33; 32]);
+    const ARTIFACT_HASH_NEXUS: ArtifactHash = ArtifactHash([34; 32]);
+    const ARTIFACT_HASH_HOST_OS: ArtifactHash = ArtifactHash([35; 32]);
+
+    fn test_artifact_for_board(board: &str) -> ArtifactHash {
+        match board {
+            "gimlet-d" => ARTIFACT_HASH_SP_GIMLET_D,
+            "gimlet-e" => ARTIFACT_HASH_SP_GIMLET_E,
+            "sidecar-b" => ARTIFACT_HASH_SP_SIDECAR_B,
+            "sidecar-c" => ARTIFACT_HASH_SP_SIDECAR_C,
+            "psc-b" => ARTIFACT_HASH_SP_PSC_B,
+            "psc-c" => ARTIFACT_HASH_SP_PSC_C,
+            _ => panic!("test bug: no artifact for board {board:?}"),
+        }
+    }
+
+    /// Describes the SPs in the environment used in these tests
+    ///
+    /// There will be:
+    ///
+    /// - 4 sled SPs
+    /// - 2 switch SPs
+    /// - 2 PSC SPs
+    ///
+    /// The specific set of hardware (boards) vary and are hardcoded:
+    ///
+    /// - sled 0: gimlet-d
+    /// - other sleds: gimlet-e
+    /// - switch 0: sidecar-b
+    /// - switch 1: sidecar-c
+    /// - psc 0: psc-b
+    /// - psc 1: psc-c
+    fn test_config() -> BTreeMap<(SpType, u32), (&'static str, &'static str)> {
+        BTreeMap::from([
+            ((SpType::Sled, 0), ("sled_0", "gimlet-d")),
+            ((SpType::Sled, 1), ("sled_1", "gimlet-e")),
+            ((SpType::Sled, 2), ("sled_2", "gimlet-e")),
+            ((SpType::Sled, 3), ("sled_3", "gimlet-e")),
+            ((SpType::Switch, 0), ("switch_0", "sidecar-b")),
+            ((SpType::Switch, 1), ("switch_1", "sidecar-c")),
+            ((SpType::Power, 0), ("power_0", "psc-b")),
+            ((SpType::Power, 1), ("power_1", "psc-c")),
+        ])
+    }
+
+    /// Returns a TufRepoDescription that we can use to exercise the planning
+    /// code.
+    fn make_tuf_repo() -> TufRepoDescription {
+        const SYSTEM_VERSION: semver::Version = semver::Version::new(0, 0, 1);
+        const SYSTEM_HASH: ArtifactHash = ArtifactHash([3; 32]);
+
+        // Include a bunch of SP-related artifacts, as well as a few others just
+        // to make sure those are properly ignored.
+        let artifacts = vec![
+            make_artifact(
+                "control-plane",
+                KnownArtifactKind::ControlPlane,
+                ARTIFACT_HASH_CONTROL_PLANE,
+            ),
+            make_artifact(
+                "nexus",
+                KnownArtifactKind::Zone,
+                ARTIFACT_HASH_NEXUS,
+            ),
+            make_artifact(
+                "host-os",
+                KnownArtifactKind::Host,
+                ARTIFACT_HASH_HOST_OS,
+            ),
+            make_artifact(
+                "gimlet-d",
+                KnownArtifactKind::GimletSp,
+                test_artifact_for_board("gimlet-d"),
+            ),
+            make_artifact(
+                "gimlet-e",
+                KnownArtifactKind::GimletSp,
+                test_artifact_for_board("gimlet-e"),
+            ),
+            make_artifact(
+                "sidecar-b",
+                KnownArtifactKind::SwitchSp,
+                test_artifact_for_board("sidecar-b"),
+            ),
+            make_artifact(
+                "sidecar-c",
+                KnownArtifactKind::SwitchSp,
+                test_artifact_for_board("sidecar-c"),
+            ),
+            make_artifact(
+                "psc-b",
+                KnownArtifactKind::PscSp,
+                test_artifact_for_board("psc-b"),
+            ),
+            make_artifact(
+                "psc-c",
+                KnownArtifactKind::PscSp,
+                test_artifact_for_board("psc-c"),
+            ),
+        ];
+
+        TufRepoDescription {
+            repo: TufRepoMeta {
+                hash: SYSTEM_HASH,
+                targets_role_version: 0,
+                valid_until: Utc::now(),
+                system_version: SYSTEM_VERSION,
+                file_name: String::new(),
+            },
+            artifacts,
+        }
+    }
+
+    fn make_artifact(
+        name: &str,
+        kind: KnownArtifactKind,
+        hash: ArtifactHash,
+    ) -> TufArtifactMeta {
+        TufArtifactMeta {
+            id: ArtifactId {
+                name: name.to_string(),
+                version: ARTIFACT_VERSION_2,
+                kind: kind.into(),
+            },
+            hash,
+            size: 0, // unused here
+        }
+    }
+
+    // Construct inventory for an environment suitable for our testing.
+    //
+    // See test_config() for information about the hardware.  All SPs will
+    // appear to be running version `active_version` except those identified in
+    // `active_version_exceptions`.  All SPs will appear to have
+    // `inactive_version` in the inactive slot.
+    fn make_collection(
+        active_version: ArtifactVersion,
+        active_version_exceptions: &BTreeMap<(SpType, u32), ArtifactVersion>,
+        inactive_version: ExpectedVersion,
+    ) -> Collection {
+        let mut builder = nexus_inventory::CollectionBuilder::new(
+            "planning_mgs_updates_basic",
+        );
+
+        let dummy_sp_state = SpState {
+            base_mac_address: [0; 6],
+            hubris_archive_id: String::from("unused"),
+            model: String::from("unused"),
+            power_state: PowerState::A0,
+            revision: 0,
+            rot: RotState::V3 {
+                active: RotSlot::A,
+                pending_persistent_boot_preference: None,
+                persistent_boot_preference: RotSlot::A,
+                slot_a_error: None,
+                slot_a_fwid: Default::default(),
+                slot_b_error: None,
+                slot_b_fwid: Default::default(),
+                stage0_error: None,
+                stage0_fwid: Default::default(),
+                stage0next_error: None,
+                stage0next_fwid: Default::default(),
+                transient_boot_preference: None,
+            },
+            serial_number: String::from("unused"),
+        };
+
+        let test_config = test_config();
+        for ((sp_type, sp_slot), (serial, caboose_board)) in test_config {
+            let sp_state = SpState {
+                model: format!("dummy_{}", sp_type),
+                serial_number: serial.to_string(),
+                ..dummy_sp_state.clone()
+            };
+
+            let baseboard_id = builder
+                .found_sp_state("test", sp_type, sp_slot, sp_state)
+                .unwrap();
+            let active_version = active_version_exceptions
+                .get(&(sp_type, sp_slot))
+                .unwrap_or(&active_version);
+
+            builder
+                .found_caboose(
+                    &baseboard_id,
+                    CabooseWhich::SpSlot0,
+                    "test",
+                    SpComponentCaboose {
+                        board: caboose_board.to_string(),
+                        epoch: None,
+                        git_commit: String::from("unused"),
+                        name: caboose_board.to_string(),
+                        sign: None,
+                        version: active_version.as_str().to_string(),
+                    },
+                )
+                .unwrap();
+
+            if let ExpectedVersion::Version(inactive_version) =
+                &inactive_version
+            {
+                builder
+                    .found_caboose(
+                        &baseboard_id,
+                        CabooseWhich::SpSlot1,
+                        "test",
+                        SpComponentCaboose {
+                            board: caboose_board.to_string(),
+                            epoch: None,
+                            git_commit: String::from("unused"),
+                            name: caboose_board.to_string(),
+                            sign: None,
+                            version: inactive_version.as_str().to_string(),
+                        },
+                    )
+                    .unwrap();
+            }
+        }
+
+        builder.build()
+    }
+
+    // Short hand-rolled update sequence that exercises some basic behavior.
+    #[test]
+    fn test_basic() {
+        let logctx = LogContext::new(
+            "planning_mgs_updates_basic",
+            &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug },
+        );
+        let log = &logctx.log;
+
+        // Test that with no updates pending and no TUF repo specified, there
+        // will remain no updates pending.
+        let collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]),
+            ExpectedVersion::NoValidVersion,
+        );
+        let current_boards = &collection.baseboards;
+        let initial_updates = PendingMgsUpdates::new();
+        let nmax_updates = 1;
+        let updates = plan_mgs_updates(
+            log,
+            &collection,
+            current_boards,
+            &initial_updates,
+            None,
+            nmax_updates,
+        );
+        assert!(updates.is_empty());
+
+        // Test that when a TUF repo is specified and one SP is outdated, then
+        // it's configured with an update (and the update looks correct).
+        let repo = make_tuf_repo();
+        let updates = plan_mgs_updates(
+            log,
+            &collection,
+            current_boards,
+            &initial_updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert_eq!(updates.len(), 1);
+        let first_update = updates.iter().next().expect("at least one update");
+        assert_eq!(first_update.baseboard_id.serial_number, "sled_0");
+        assert_eq!(first_update.sp_type, SpType::Sled);
+        assert_eq!(first_update.slot_id, 0);
+        assert_eq!(first_update.artifact_hash, ARTIFACT_HASH_SP_GIMLET_D);
+        assert_eq!(first_update.artifact_version, ARTIFACT_VERSION_2);
+
+        // Test that when an update is already pending, and nothing changes
+        // about the state of the world (i.e., the inventory), then the planner
+        // makes no changes.
+        let later_updates = plan_mgs_updates(
+            log,
+            &collection,
+            current_boards,
+            &updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert_eq!(updates, later_updates);
+
+        // Test that when two updates are needed, but one is already pending,
+        // then the other one is *not* started (because it exceeds
+        // nmax_updates).
+        let later_collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([
+                ((SpType::Sled, 0), ARTIFACT_VERSION_1),
+                ((SpType::Switch, 1), ARTIFACT_VERSION_1),
+            ]),
+            ExpectedVersion::NoValidVersion,
+        );
+        let later_updates = plan_mgs_updates(
+            log,
+            &later_collection,
+            current_boards,
+            &updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert_eq!(updates, later_updates);
+
+        // At this point, we're ready to test that when the first update
+        // completes, then th esecond one *is* started.  This tests two
+        // different things: first that we noticed the first one completed, and
+        // second that we noticed another thing needed an update
+        let later_collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([((SpType::Switch, 1), ARTIFACT_VERSION_1)]),
+            ExpectedVersion::NoValidVersion,
+        );
+        let later_updates = plan_mgs_updates(
+            log,
+            &later_collection,
+            current_boards,
+            &updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert_eq!(later_updates.len(), 1);
+        let next_update =
+            later_updates.iter().next().expect("at least one update");
+        assert_ne!(first_update, next_update);
+        assert_eq!(next_update.baseboard_id.serial_number, "switch_1");
+        assert_eq!(next_update.sp_type, SpType::Switch);
+        assert_eq!(next_update.slot_id, 1);
+        assert_eq!(next_update.artifact_hash, ARTIFACT_HASH_SP_SIDECAR_C);
+        assert_eq!(next_update.artifact_version, ARTIFACT_VERSION_2);
+
+        // Finally, test that when all SPs are in spec, then no updates are
+        // configured.
+        let updated_collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::new(),
+            ExpectedVersion::NoValidVersion,
+        );
+        let later_updates = plan_mgs_updates(
+            log,
+            &updated_collection,
+            current_boards,
+            &later_updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert!(later_updates.is_empty());
+
+        // Test that we don't try to update boards that aren't in
+        // `current_boards`, even if they're in inventory and outdated.
+        let collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]),
+            ExpectedVersion::NoValidVersion,
+        );
+        let updates = plan_mgs_updates(
+            log,
+            &collection,
+            &BTreeSet::new(),
+            &PendingMgsUpdates::new(),
+            Some(&repo),
+            nmax_updates,
+        );
+        assert!(updates.is_empty());
+        let updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &PendingMgsUpdates::new(),
+            Some(&repo),
+            nmax_updates,
+        );
+        // We verified most of the details above.  Here we're just double
+        // checking that the baseboard being missing is the only reason that no
+        // update was generated.
+        assert_eq!(updates.len(), 1);
+
+        // Verify the precondition details of an ordinary update.
+        let old_update =
+            updates.into_iter().next().expect("at least one update");
+        let PendingMgsUpdateDetails::Sp {
+            expected_active_version: old_expected_active_version,
+            expected_inactive_version: old_expected_inactive_version,
+        } = &old_update.details
+        else {
+            panic!("expected SP update");
+        };
+        assert_eq!(ARTIFACT_VERSION_1, *old_expected_active_version);
+        assert_eq!(
+            ExpectedVersion::NoValidVersion,
+            *old_expected_inactive_version
+        );
+
+        // Test that if the inactive slot contents have changed, then we'll get
+        // a new update reflecting that.
+        let collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]),
+            ExpectedVersion::Version(ARTIFACT_VERSION_1),
+        );
+        let new_updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert_ne!(updates, new_updates);
+        assert_eq!(new_updates.len(), 1);
+        let new_update =
+            new_updates.into_iter().next().expect("at least one update");
+        assert_eq!(old_update.baseboard_id, new_update.baseboard_id);
+        assert_eq!(old_update.sp_type, new_update.sp_type);
+        assert_eq!(old_update.slot_id, new_update.slot_id);
+        assert_eq!(old_update.artifact_hash, new_update.artifact_hash);
+        assert_eq!(old_update.artifact_version, new_update.artifact_version);
+        let PendingMgsUpdateDetails::Sp {
+            expected_active_version: new_expected_active_version,
+            expected_inactive_version: new_expected_inactive_version,
+        } = &new_update.details
+        else {
+            panic!("expected SP update");
+        };
+        assert_eq!(ARTIFACT_VERSION_1, *new_expected_active_version);
+        assert_eq!(
+            ExpectedVersion::Version(ARTIFACT_VERSION_1),
+            *new_expected_inactive_version
+        );
+
+        // Test that if instead it's the active slot whose contents have changed
+        // to something other than the new expected version, then we'll also get
+        // a new update reflecting that.
+        let collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1_5)]),
+            ExpectedVersion::NoValidVersion,
+        );
+        let new_updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert_ne!(updates, new_updates);
+        assert_eq!(new_updates.len(), 1);
+        let new_update =
+            new_updates.into_iter().next().expect("at least one update");
+        assert_eq!(old_update.baseboard_id, new_update.baseboard_id);
+        assert_eq!(old_update.sp_type, new_update.sp_type);
+        assert_eq!(old_update.slot_id, new_update.slot_id);
+        assert_eq!(old_update.artifact_hash, new_update.artifact_hash);
+        assert_eq!(old_update.artifact_version, new_update.artifact_version);
+        let PendingMgsUpdateDetails::Sp {
+            expected_active_version: new_expected_active_version,
+            expected_inactive_version: new_expected_inactive_version,
+        } = &new_update.details
+        else {
+            panic!("expected SP update");
+        };
+        assert_eq!(ARTIFACT_VERSION_1_5, *new_expected_active_version);
+        assert_eq!(
+            ExpectedVersion::NoValidVersion,
+            *new_expected_inactive_version
+        );
+
+        logctx.cleanup_successful();
+    }
+
+    // Updates a whole system's SPs one at a time
+    #[test]
+    fn test_whole_system_sequential() {
+        let logctx = LogContext::new(
+            "planning_mgs_updates_whole_system_sequential",
+            &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug },
+        );
+        let log = &logctx.log;
+        let mut latest_updates = PendingMgsUpdates::new();
+        let repo = make_tuf_repo();
+        let nmax_updates = 1;
+
+        // Maintain a map of SPs that we've updated.  We'll use this to
+        // configure the inventory collection that we create at each step.
+        let mut exceptions = BTreeMap::new();
+
+        // We do not control the order of updates.  But we expect to update each
+        // of the SPs in this map.  When we do, we expect to find the given
+        // artifact.
+        let mut expected_updates: BTreeMap<_, _> = test_config()
+            .into_iter()
+            .map(|(k, (serial, board_name))| {
+                (k, (serial, test_artifact_for_board(board_name)))
+            })
+            .collect();
+
+        for _ in 0..expected_updates.len() {
+            // Generate an inventory collection reflecting that everything is at
+            // version 1 except for what we've already updated.
+            let collection = make_collection(
+                ARTIFACT_VERSION_1,
+                &exceptions,
+                ExpectedVersion::NoValidVersion,
+            );
+
+            // For this test, all systems that are found in inventory are part
+            // of the control plane.
+            let current_boards = &collection.baseboards;
+
+            // Run the planner and verify that we got one of our expected
+            // updates.
+            let new_updates = plan_mgs_updates(
+                log,
+                &collection,
+                current_boards,
+                &latest_updates,
+                Some(&repo),
+                nmax_updates,
+            );
+            assert_eq!(new_updates.len(), 1);
+            let update =
+                new_updates.iter().next().expect("at least one update");
+            verify_one_sp_update(&mut expected_updates, update);
+
+            // Update `exceptions` for the next iteration.
+            let sp_type = update.sp_type;
+            let sp_slot = update.slot_id;
+            assert!(
+                exceptions
+                    .insert((sp_type, sp_slot), ARTIFACT_VERSION_2)
+                    .is_none()
+            );
+            latest_updates = new_updates;
+        }
+
+        // Take one more lap.  It should reflect zero updates.
+        let collection = make_collection(
+            ARTIFACT_VERSION_1,
+            &exceptions,
+            ExpectedVersion::NoValidVersion,
+        );
+        let last_updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &latest_updates,
+            Some(&repo),
+            nmax_updates,
+        );
+        assert!(last_updates.is_empty());
+
+        logctx.cleanup_successful();
+    }
+
+    // Updates a whole system's SPs at once
+    #[test]
+    fn test_whole_system_simultaneous() {
+        let logctx = LogContext::new(
+            "planning_mgs_updates_whole_system_simultaneous",
+            &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug },
+        );
+        let log = &logctx.log;
+        let repo = make_tuf_repo();
+
+        let mut expected_updates: BTreeMap<_, _> = test_config()
+            .into_iter()
+            .map(|(k, (serial, board_name))| {
+                (k, (serial, test_artifact_for_board(board_name)))
+            })
+            .collect();
+
+        // Update the whole system at once.
+        let collection = make_collection(
+            ARTIFACT_VERSION_1,
+            &BTreeMap::new(),
+            ExpectedVersion::NoValidVersion,
+        );
+        let all_updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &PendingMgsUpdates::new(),
+            Some(&repo),
+            usize::MAX,
+        );
+        assert_eq!(all_updates.len(), expected_updates.len());
+        for update in &all_updates {
+            verify_one_sp_update(&mut expected_updates, update);
+        }
+
+        // Now, notice when they've all been updated, even if the limit is only
+        // one.
+        let collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::new(),
+            ExpectedVersion::NoValidVersion,
+        );
+        let all_updates_done = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &all_updates,
+            Some(&repo),
+            1,
+        );
+        assert!(all_updates_done.is_empty());
+
+        logctx.cleanup_successful();
+    }
+
+    fn verify_one_sp_update(
+        expected_updates: &mut BTreeMap<(SpType, u32), (&str, ArtifactHash)>,
+        update: &PendingMgsUpdate,
+    ) {
+        let sp_type = update.sp_type;
+        let sp_slot = update.slot_id;
+        println!("found update: {} slot {}", sp_type, sp_slot);
+        let (expected_serial, expected_artifact) = expected_updates
+            .remove(&(sp_type, sp_slot))
+            .expect("unexpected update");
+        assert_eq!(update.artifact_hash, expected_artifact);
+        assert_eq!(update.artifact_version, ARTIFACT_VERSION_2);
+        assert_eq!(update.baseboard_id.serial_number, *expected_serial);
+        let PendingMgsUpdateDetails::Sp {
+            expected_active_version,
+            expected_inactive_version,
+        } = &update.details
+        else {
+            panic!("expected SP update");
+        };
+        assert_eq!(*expected_active_version, ARTIFACT_VERSION_1);
+        assert_eq!(*expected_inactive_version, ExpectedVersion::NoValidVersion);
+    }
+
+    // Tests the case where an SP appears to move while an update is pending
+    #[test]
+    fn test_sp_move() {
+        let logctx = LogContext::new(
+            "planning_mgs_updates_whole_system_simultaneous",
+            &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug },
+        );
+
+        // Configure an update for one SP.
+        let log = &logctx.log;
+        let repo = make_tuf_repo();
+        let mut collection = make_collection(
+            ARTIFACT_VERSION_2,
+            &BTreeMap::from([((SpType::Sled, 0), ARTIFACT_VERSION_1)]),
+            ExpectedVersion::NoValidVersion,
+        );
+        let updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &PendingMgsUpdates::new(),
+            Some(&repo),
+            1,
+        );
+        assert!(!updates.is_empty());
+        let update = updates.into_iter().next().expect("at least one update");
+
+        // Move an SP (as if someone had moved the sled to a different cubby).
+        // This is awful, but at least it's easy.
+        let sp_info = collection
+            .sps
+            .values_mut()
+            .find(|sp| sp.sp_type == SpType::Sled && sp.sp_slot == 0)
+            .expect("missing sled 0 SP");
+        sp_info.sp_slot = 9;
+
+        // Plan again.  The configured update should be updated to reflect the
+        // new location.
+        let new_updates = plan_mgs_updates(
+            log,
+            &collection,
+            &collection.baseboards,
+            &updates,
+            Some(&repo),
+            1,
+        );
+        assert!(!new_updates.is_empty());
+        let new_update =
+            new_updates.into_iter().next().expect("at least one update");
+        assert_ne!(new_update.slot_id, update.slot_id);
+        assert_eq!(new_update.baseboard_id, update.baseboard_id);
+        assert_eq!(new_update.sp_type, update.sp_type);
+        assert_eq!(new_update.artifact_hash, update.artifact_hash);
+        assert_eq!(new_update.artifact_version, update.artifact_version);
+        assert_eq!(new_update.details, update.details);
+
+        logctx.cleanup_successful();
+    }
+}
diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs
index 6a02dd05c87..b761ac901db 100644
--- a/nexus/types/src/deployment.rs
+++ b/nexus/types/src/deployment.rs
@@ -1188,7 +1188,6 @@ pub struct PendingMgsUpdate {
     pub details: PendingMgsUpdateDetails,
 
     /// which artifact to apply to this device
-    /// (implies which component is being updated)
     pub artifact_hash: ArtifactHash,
     pub artifact_version: ArtifactVersion,
 }

From 38dc53a758be4d2ad3b09c7b300e9854e20dcf14 Mon Sep 17 00:00:00 2001
From: David Pacheco <dap@oxidecomputer.com>
Date: Tue, 3 Jun 2025 13:56:40 -0700
Subject: [PATCH 2/5] add docs

---
 .../planning/src/mgs_updates/mod.rs            | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
index 241b05f7e6c..1e56ad14c29 100644
--- a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
+++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
@@ -20,6 +20,19 @@ use thiserror::Error;
 use tufaceous_artifact::ArtifactVersion;
 use tufaceous_artifact::KnownArtifactKind;
 
+/// Generates a new set of `PendingMgsUpdates` based on:
+///
+/// * `inventory`: the latest inventory
+/// * `current_boards`: a set of baseboards to consider updating
+///   (it is possible to have baseboards in inventory that would never be
+///   updated because they're not considered part of the current system)
+/// * `current_updates`: the most recent set of configured `PendingMgsUpdates`
+/// * `current_artifacts`: information about artifacts frmo the current target
+///   release (if any)
+/// * `nmax_updates`: the maximum number of updates allowed at once
+///
+/// By current policy, `nmax_updates` is always 1, but the implementation here
+/// supports more than one update per invocation.
 pub fn plan_mgs_updates(
     log: &slog::Logger,
     inventory: &Collection,
@@ -233,6 +246,8 @@ fn mgs_update_status(
     }
 }
 
+/// Compares a configured SP update with information from inventory and
+/// determines the current status of the update.  See `MgsUpdateStatus`.
 fn mgs_update_status_sp(
     desired_version: &ArtifactVersion,
     expected_active_version: &ArtifactVersion,
@@ -304,6 +319,8 @@ fn mgs_update_status_sp(
     }
 }
 
+/// Determine if the given baseboard needs any MGS-driven update (e.g., update
+/// to its SP, RoT, etc.).  If so, returns the update.  If not, returns `None`.
 fn try_make_update(
     log: &slog::Logger,
     baseboard_id: &Arc<BaseboardId>,
@@ -316,6 +333,7 @@ fn try_make_update(
     try_make_update_sp(log, baseboard_id, inventory, current_artifacts)
 }
 
+/// Determine if the given baseboard needs an SP update and, if so, returns it.
 fn try_make_update_sp(
     log: &slog::Logger,
     baseboard_id: &Arc<BaseboardId>,

From f28b929eb3c4c4411020dc2bcd37367be882593c Mon Sep 17 00:00:00 2001
From: David Pacheco <dap@oxidecomputer.com>
Date: Wed, 4 Jun 2025 07:05:19 -0700
Subject: [PATCH 3/5] fix openapi

---
 openapi/nexus-internal.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json
index cf5954d825e..851df54d050 100644
--- a/openapi/nexus-internal.json
+++ b/openapi/nexus-internal.json
@@ -5385,7 +5385,7 @@
         "type": "object",
         "properties": {
           "artifact_hash": {
-            "description": "which artifact to apply to this device (implies which component is being updated)",
+            "description": "which artifact to apply to this device",
             "type": "string",
             "format": "hex string (32 bytes)"
           },

From bd7bcd66097bdaa1528a3101cd9bef07f3ead783 Mon Sep 17 00:00:00 2001
From: David Pacheco <dap@oxidecomputer.com>
Date: Wed, 4 Jun 2025 07:11:52 -0700
Subject: [PATCH 4/5] update comments based on review feedback

---
 .../reconfigurator/planning/src/mgs_updates/mod.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
index 1e56ad14c29..3102c116de7 100644
--- a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
+++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
@@ -150,6 +150,13 @@ enum MgsUpdateStatus {
     NotDone,
     /// the requested update has not completed and the preconditions are not
     /// currently met
+    ///
+    /// The only way for this update to complete as-written is if reality
+    /// changes such that the preconditions become met.  But the only normal
+    /// operation that could make that happen is blueprint execution, which
+    /// won't do anything while the preconditions aren't met.  So if an update
+    /// is in this state, generally the planner must remove this update (and
+    /// presumably add one with updated preconditions).
     Impossible,
 }
 
@@ -329,7 +336,8 @@ fn try_make_update(
 ) -> Option<PendingMgsUpdate> {
     // TODO When we add support for planning RoT, RoT bootloader, and host OS
     // updates, we'll try these in a hardcoded priority order until any of them
-    // returns `Some`.  For now, we only plan SP updates.
+    // returns `Some`.  The order is described in RFD 565 section "Update
+    // Sequence".  For now, we only plan SP updates.
     try_make_update_sp(log, baseboard_id, inventory, current_artifacts)
 }
 
@@ -420,7 +428,9 @@ fn try_make_update_sp(
     }
 
     if matching_artifacts.len() > 1 {
-        // This is noteworthy, but not a problem.  We'll just pick one.
+        // This should be impossible unless we shipped a TUF repo with multiple
+        // artifacts for the same board.  But it doesn't prevent us from picking
+        // one and proceeding.  Make a note and proceed.
         warn!(log, "found more than one matching artifact for SP update");
     }
 

From bf2ab7cac4f6390f4215b590b079268bb20b02e1 Mon Sep 17 00:00:00 2001
From: David Pacheco <dap@oxidecomputer.com>
Date: Wed, 4 Jun 2025 09:58:45 -0700
Subject: [PATCH 5/5] review feedback

---
 nexus/reconfigurator/planning/src/mgs_updates/mod.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
index 3102c116de7..ca6a953a3a8 100644
--- a/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
+++ b/nexus/reconfigurator/planning/src/mgs_updates/mod.rs
@@ -27,7 +27,7 @@ use tufaceous_artifact::KnownArtifactKind;
 ///   (it is possible to have baseboards in inventory that would never be
 ///   updated because they're not considered part of the current system)
 /// * `current_updates`: the most recent set of configured `PendingMgsUpdates`
-/// * `current_artifacts`: information about artifacts frmo the current target
+/// * `current_artifacts`: information about artifacts from the current target
 ///   release (if any)
 /// * `nmax_updates`: the maximum number of updates allowed at once
 ///
@@ -71,7 +71,7 @@ pub fn plan_mgs_updates(
             Ok(MgsUpdateStatus::Impossible) => {
                 info!(
                     log,
-                    "SP update impossible
+                    "SP update impossible \
                      (will remove it and re-evaluate board)";
                     update
                 );
@@ -115,7 +115,7 @@ pub fn plan_mgs_updates(
         current_boards.iter().filter(|b| !boards_preferred.contains(*b));
     let candidates = boards_preferred.iter().chain(non_preferred);
     for board in candidates {
-        if rv.len() == nmax_updates {
+        if rv.len() >= nmax_updates {
             info!(
                 log,
                 "reached maximum number of pending SP updates";
@@ -454,7 +454,7 @@ fn try_make_update_sp(
         Err(_) => {
             warn!(
                 log,
-                "cannot configure SP update for board
+                "cannot configure SP update for board \
                  (found inactive slot contents but version was not valid)";
                 baseboard_id
             );
@@ -834,7 +834,7 @@ mod test {
         assert_eq!(updates, later_updates);
 
         // At this point, we're ready to test that when the first update
-        // completes, then th esecond one *is* started.  This tests two
+        // completes, then the second one *is* started.  This tests two
         // different things: first that we noticed the first one completed, and
         // second that we noticed another thing needed an update
         let later_collection = make_collection(